From 3ea5e56c263e04fd69aa8b801d3904c49ff00b8a Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 8 Mar 2022 13:27:08 +1000 Subject: [PATCH] SP ASM performance improvements Mostly improving Aarch64 assembly. Change Karatsuba implementations. Specialised code for exponentiating to 0x10001 for RSA. --- configure.ac | 6 +- wolfcrypt/src/ecc.c | 1 + wolfcrypt/src/sp_arm32.c | 4280 ++++--- wolfcrypt/src/sp_arm64.c | 20272 +++++++++++++++++------------- wolfcrypt/src/sp_armthumb.c | 15725 ++++++++++++----------- wolfcrypt/src/sp_c32.c | 1580 +-- wolfcrypt/src/sp_c64.c | 928 +- wolfcrypt/src/sp_cortexm.c | 2284 ++-- wolfcrypt/src/sp_x86_64.c | 287 +- wolfcrypt/src/sp_x86_64_asm.S | 14642 ++++++++++----------- wolfcrypt/src/sp_x86_64_asm.asm | 14232 ++++++++++----------- 11 files changed, 39443 insertions(+), 34794 deletions(-) diff --git a/configure.ac b/configure.ac index eb23164c6..f3a81cab2 100644 --- a/configure.ac +++ b/configure.ac @@ -5958,7 +5958,7 @@ do ENABLED_SP_FF_3072=yes ENABLED_SP_ECC=yes ENABLED_SP_EC_256=yes - if test "$host_cpu" = "x86_64"; then + if test "$host_cpu" = "x86_64" || test "$host_cpu" = "aarch64"; then ENABLED_SP_FF_4096=yes ENABLED_SP_EC_384=yes ENABLED_SP_EC_521=yes @@ -5973,7 +5973,7 @@ do ENABLED_SP_FF_3072=yes ENABLED_SP_ECC=yes ENABLED_SP_EC_256=yes - if test "$host_cpu" = "x86_64"; then + if test "$host_cpu" = "x86_64" || test "$host_cpu" = "aarch64"; then ENABLED_SP_FF_4096=yes ENABLED_SP_EC_384=yes ENABLED_SP_EC_521=yes @@ -5988,7 +5988,7 @@ do ENABLED_SP_FF_3072=yes ENABLED_SP_ECC=yes ENABLED_SP_EC_256=yes - if test "$host_cpu" = "x86_64"; then + if test "$host_cpu" = "x86_64" || test "$host_cpu" = "aarch64"; then ENABLED_SP_FF_4096=yes ENABLED_SP_EC_384=yes ENABLED_SP_EC_521=yes diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c index c56a316c5..8c038623d 100644 --- a/wolfcrypt/src/ecc.c +++ b/wolfcrypt/src/ecc.c @@ -6152,6 +6152,7 @@ int wc_ecc_sign_hash_ex(const byte* in, word32 inlen, WC_RNG* rng, #endif } #endif + (void)sign_k; } #else (void)inlen; diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 296541be6..9118131e8 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -48,19 +48,19 @@ #include #ifdef WOLFSSL_SP_ARM32_ASM -#define SP_PRINT_NUM(var, name, total, words, bits) \ - do { \ - int ii; \ - fprintf(stderr, name "=0x"); \ - for (ii = words - 1; ii >= 0; ii--) \ - fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ - fprintf(stderr, "\n"); \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + fprintf(stderr, name "=0x"); \ + for (ii = ((bits + 31) / 32) - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ } while (0) -#define SP_PRINT_VAL(var, name) \ +#define SP_PRINT_VAL(var, name) \ fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) -#define SP_PRINT_INT(var, name) \ +#define SP_PRINT_INT(var, name) \ fprintf(stderr, name "=%d\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) @@ -672,345 +672,6 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) ); } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #32\n\t" - "mov r12, #0\n\t" - "# A[0] * A[0]\n\t" - "ldr r10, [%[a], #0]\n\t" - "umull r8, r3, r10, r10\n\t" - "mov r4, #0\n\t" - "str r8, [sp]\n\t" - "# A[0] * A[1]\n\t" - "ldr r10, [%[a], #4]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r12, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "str r3, [sp, #4]\n\t" - "# A[0] * A[2]\n\t" - "ldr r10, [%[a], #8]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r12, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r12\n\t" - "# A[1] * A[1]\n\t" - "ldr r10, [%[a], #4]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r12\n\t" - "str r4, [sp, #8]\n\t" - "# A[0] * A[3]\n\t" - "ldr r10, [%[a], #12]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r12, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, r12\n\t" - "# A[1] * A[2]\n\t" - "ldr r10, [%[a], #8]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, r12\n\t" - "str r2, [sp, #12]\n\t" - "# A[0] * A[4]\n\t" - "ldr r10, [%[a], #16]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r12, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "# A[1] * A[3]\n\t" - "ldr r10, [%[a], #12]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "# A[2] * A[2]\n\t" - "ldr r10, [%[a], #8]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "str r3, [sp, #16]\n\t" - "# A[0] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r5, r6, r10, r8\n\t" - "mov r3, #0\n\t" - "mov r7, #0\n\t" - "# A[1] * A[4]\n\t" - "ldr r10, [%[a], #16]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[2] * A[3]\n\t" - "ldr r10, [%[a], #12]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r4, r4, r5\n\t" - "adcs r2, r2, r6\n\t" - "adc r3, r3, r7\n\t" - "str r4, [sp, #20]\n\t" - "# A[0] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r5, r6, r10, r8\n\t" - "mov r4, #0\n\t" - "mov r7, #0\n\t" - "# A[1] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[2] * A[4]\n\t" - "ldr r10, [%[a], #16]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[3] * A[3]\n\t" - "ldr r10, [%[a], #12]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, 
r9\n\t" - "adc r7, r7, r12\n\t" - "adds r2, r2, r5\n\t" - "adcs r3, r3, r6\n\t" - "adc r4, r4, r7\n\t" - "str r2, [sp, #24]\n\t" - "# A[0] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #0]\n\t" - "umull r5, r6, r10, r8\n\t" - "mov r2, #0\n\t" - "mov r7, #0\n\t" - "# A[1] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[2] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[3] * A[4]\n\t" - "ldr r10, [%[a], #16]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r3, r3, r5\n\t" - "adcs r4, r4, r6\n\t" - "adc r2, r2, r7\n\t" - "str r3, [sp, #28]\n\t" - "# A[1] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r5, r6, r10, r8\n\t" - "mov r3, #0\n\t" - "mov r7, #0\n\t" - "# A[2] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[3] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[4] * A[4]\n\t" - "ldr r10, [%[a], #16]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "adds r4, r4, r5\n\t" - "adcs r2, r2, r6\n\t" - "adc r3, r3, r7\n\t" - "str r4, [%[r], #32]\n\t" - "# A[2] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r5, r6, r10, r8\n\t" - "mov r4, #0\n\t" - "mov r7, #0\n\t" - "# A[3] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "# A[4] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, r12\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r2, r2, r5\n\t" - "adcs r3, r3, r6\n\t" - "adc r4, r4, r7\n\t" - "str r2, [%[r], #36]\n\t" - "# A[3] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r12, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "# A[4] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "# A[5] * A[5]\n\t" - "ldr r10, [%[a], #20]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "str r3, [%[r], #40]\n\t" - "# A[4] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r12, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r12\n\t" - "# A[5] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds 
r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r12\n\t" - "str r4, [%[r], #44]\n\t" - "# A[5] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r12, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, r12\n\t" - "# A[6] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, r12\n\t" - "str r2, [%[r], #48]\n\t" - "# A[6] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r12, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r12\n\t" - "str r3, [%[r], #52]\n\t" - "# A[7] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adc r2, r2, r9\n\t" - "str r4, [%[r], #56]\n\t" - "str r2, [%[r], #60]\n\t" - "ldm sp!, {r2, r3, r4, r8}\n\t" - "stm %[r]!, {r2, r3, r4, r8}\n\t" - "ldm sp!, {r2, r3, r4, r8}\n\t" - "stm %[r]!, {r2, r3, r4, r8}\n\t" - "sub %[r], %[r], #32\n\t" - : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r8", "r9", "r10", "r8", "r5", "r6", "r7", "r12" - ); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -1213,7 +874,7 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, sp_digit z1[16]; sp_digit a1[8]; sp_digit b1[8]; - sp_digit z2[16]; + sp_digit* z2 = r + 16; sp_digit u; sp_digit ca; sp_digit cb; @@ -1221,45 +882,22 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, ca = sp_2048_add_8(a1, a, &a[8]); cb = sp_2048_add_8(b1, b, &b[8]); u = ca & cb; - sp_2048_mul_8(z1, a1, b1); + sp_2048_mul_8(z2, &a[8], &b[8]); sp_2048_mul_8(z0, a, b); - sp_2048_mask_8(r + 16, a1, 0 - cb); + sp_2048_mul_8(z1, a1, b1); + + u += sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(z1, z2); + sp_2048_mask_8(a1, a1, 0 - cb); + u += sp_2048_add_8(z1 + 8, z1 + 8, a1); sp_2048_mask_8(b1, b1, 0 - ca); - u += sp_2048_add_8(r + 16, r + 16, b1); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); - u += sp_2048_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - (void)sp_2048_add_16(r + 16, r + 16, z2); -} + u += sp_2048_add_8(z1 + 8, z1 + 8, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_2048_add_8(a1, a, &a[8]); - sp_2048_sqr_8(z1, a1); - sp_2048_sqr_8(z2, &a[8]); - sp_2048_sqr_8(z0, a); - sp_2048_mask_8(r + 16, a1, 0 - u); - u += sp_2048_add_8(r + 16, r + 16, r + 16); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); u += sp_2048_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - (void)sp_2048_add_16(r + 16, r + 16, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (8 - 1)); + a1[0] = u; + (void)sp_2048_add_8(r + 24, r + 24, a1); } /* Sub b from a into a. 
(a -= b) @@ -1506,7 +1144,7 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit u; sp_digit ca; sp_digit cb; @@ -1514,45 +1152,22 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, ca = sp_2048_add_16(a1, a, &a[16]); cb = sp_2048_add_16(b1, b, &b[16]); u = ca & cb; - sp_2048_mul_16(z1, a1, b1); + sp_2048_mul_16(z2, &a[16], &b[16]); sp_2048_mul_16(z0, a, b); - sp_2048_mask_16(r + 32, a1, 0 - cb); + sp_2048_mul_16(z1, a1, b1); + + u += sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(z1, z2); + sp_2048_mask_16(a1, a1, 0 - cb); + u += sp_2048_add_16(z1 + 16, z1 + 16, a1); sp_2048_mask_16(b1, b1, 0 - ca); - u += sp_2048_add_16(r + 32, r + 32, b1); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); - u += sp_2048_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_2048_add_32(r + 32, r + 32, z2); -} + u += sp_2048_add_16(z1 + 16, z1 + 16, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[32]; - sp_digit z1[32]; - sp_digit a1[16]; - sp_digit u; - - u = sp_2048_add_16(a1, a, &a[16]); - sp_2048_sqr_16(z1, a1); - sp_2048_sqr_16(z2, &a[16]); - sp_2048_sqr_16(z0, a); - sp_2048_mask_16(r + 32, a1, 0 - u); - u += sp_2048_add_16(r + 32, r + 32, r + 32); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); u += sp_2048_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_2048_add_32(r + 32, r + 32, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (16 - 1)); + a1[0] = u; + (void)sp_2048_add_16(r + 48, r + 48, a1); } /* Sub b from a into a. (a -= b) @@ -1959,7 +1574,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, sp_digit z1[64]; sp_digit a1[32]; sp_digit b1[32]; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit u; sp_digit ca; sp_digit cb; @@ -1967,18 +1582,636 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, ca = sp_2048_add_32(a1, a, &a[32]); cb = sp_2048_add_32(b1, b, &b[32]); u = ca & cb; - sp_2048_mul_32(z1, a1, b1); + sp_2048_mul_32(z2, &a[32], &b[32]); sp_2048_mul_32(z0, a, b); - sp_2048_mask_32(r + 64, a1, 0 - cb); - sp_2048_mask_32(b1, b1, 0 - ca); - u += sp_2048_add_32(r + 64, r + 64, b1); - u += sp_2048_sub_in_place_64(z1, z2); + sp_2048_mul_32(z1, a1, b1); + u += sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(z1, z2); + sp_2048_mask_32(a1, a1, 0 - cb); + u += sp_2048_add_32(z1 + 32, z1 + 32, a1); + sp_2048_mask_32(b1, b1, 0 - ca); + u += sp_2048_add_32(z1 + 32, z1 + 32, b1); + u += sp_2048_add_64(r + 32, r + 32, z1); - r[96] = u; - XMEMSET(r + 96 + 1, 0, sizeof(sp_digit) * (32 - 1)); - (void)sp_2048_add_64(r + 64, r + 64, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (32 - 1)); + a1[0] = u; + (void)sp_2048_add_32(r + 96, r + 96, a1); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
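+ *
+ * Each off-diagonal product A[i] * A[j] is computed once with umull and
+ * counted twice (added in twice or doubled), while each diagonal product
+ * A[i] * A[i] is counted once. The low eight result words are accumulated
+ * on the stack and only stored to r at the end.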
+ */ +static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "mov r12, #0\n\t" + "# A[0] * A[0]\n\t" + "ldr r10, [%[a], #0]\n\t" + "umull r8, r3, r10, r10\n\t" + "mov r4, #0\n\t" + "str r8, [sp]\n\t" + "# A[0] * A[1]\n\t" + "ldr r10, [%[a], #4]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r12, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "str r3, [sp, #4]\n\t" + "# A[0] * A[2]\n\t" + "ldr r10, [%[a], #8]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r12, r12\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r12\n\t" + "# A[1] * A[1]\n\t" + "ldr r10, [%[a], #4]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r12\n\t" + "str r4, [sp, #8]\n\t" + "# A[0] * A[3]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r12, r12\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r12\n\t" + "# A[1] * A[2]\n\t" + "ldr r10, [%[a], #8]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r12\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r12\n\t" + "str r2, [sp, #12]\n\t" + "# A[0] * A[4]\n\t" + "ldr r10, [%[a], #16]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r12, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "# A[1] * A[3]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "# A[2] * A[2]\n\t" + "ldr r10, [%[a], #8]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "str r3, [sp, #16]\n\t" + "# A[0] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r3, #0\n\t" + "mov r7, #0\n\t" + "# A[1] * A[4]\n\t" + "ldr r10, [%[a], #16]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[2] * A[3]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r4, r4, r5\n\t" + "adcs r2, r2, r6\n\t" + "adc r3, r3, r7\n\t" + "str r4, [sp, #20]\n\t" + "# A[0] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r4, #0\n\t" + "mov r7, #0\n\t" + "# A[1] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[2] * A[4]\n\t" + "ldr r10, [%[a], #16]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[3] * A[3]\n\t" + "ldr r10, [%[a], #12]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, 
r9\n\t" + "adc r7, r7, r12\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "str r2, [sp, #24]\n\t" + "# A[0] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r2, #0\n\t" + "mov r7, #0\n\t" + "# A[1] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[2] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[3] * A[4]\n\t" + "ldr r10, [%[a], #16]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc r2, r2, r7\n\t" + "str r3, [sp, #28]\n\t" + "# A[1] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r3, #0\n\t" + "mov r7, #0\n\t" + "# A[2] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[3] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[4] * A[4]\n\t" + "ldr r10, [%[a], #16]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "adds r4, r4, r5\n\t" + "adcs r2, r2, r6\n\t" + "adc r3, r3, r7\n\t" + "str r4, [%[r], #32]\n\t" + "# A[2] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r4, #0\n\t" + "mov r7, #0\n\t" + "# A[3] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "# A[4] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r12\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "str r2, [%[r], #36]\n\t" + "# A[3] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r12, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "# A[4] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "# A[5] * A[5]\n\t" + "ldr r10, [%[a], #20]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "str r3, [%[r], #40]\n\t" + "# A[4] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r12, r12\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r12\n\t" + "# A[5] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds 
r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r12\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r12\n\t" + "str r4, [%[r], #44]\n\t" + "# A[5] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r12, r12\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r12\n\t" + "# A[6] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r12\n\t" + "str r2, [%[r], #48]\n\t" + "# A[6] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r12, r12\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r12\n\t" + "str r3, [%[r], #52]\n\t" + "# A[7] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r4, r4, r8\n\t" + "adc r2, r2, r9\n\t" + "str r4, [%[r], #56]\n\t" + "str r2, [%[r], #60]\n\t" + "ldm sp!, {r2, r3, r4, r8}\n\t" + "stm %[r]!, {r2, r3, r4, r8}\n\t" + "ldm sp!, {r2, r3, r4, r8}\n\t" + "stm %[r]!, {r2, r3, r4, r8}\n\t" + "sub %[r], %[r], #32\n\t" + : + : [r] "r" (r), [a] "r" (a) + : "memory", "r2", "r3", "r4", "r8", "r9", "r10", "r8", "r5", "r6", "r7", "r12" + ); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 16; + sp_digit z1[16]; + sp_digit* a1 = z1; + sp_digit zero[8]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 8); + + mask = sp_2048_sub_8(a1, a, &a[8]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_8(a1, p1, p2); + + sp_2048_sqr_8(z2, &a[8]); + sp_2048_sqr_8(z0, a); + sp_2048_sqr_8(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_16(z1, z2); + u -= sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(r + 8, z1); + zero[0] = u; + (void)sp_2048_add_8(r + 24, r + 24, zero); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
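+ *
+ * Returns 0 when there is no borrow and all ones (-1) when there is, so
+ * the result can be used directly as a mask: the Karatsuba squaring code
+ * uses it to compute |a_lo - a_hi| in constant time.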
+ */ +static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 32; + sp_digit z1[32]; + sp_digit* a1 = z1; + sp_digit zero[16]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_2048_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_16(a1, p1, p2); + + sp_2048_sqr_16(z2, &a[16]); + sp_2048_sqr_16(z0, a); + sp_2048_sqr_16(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_32(z1, z2); + u -= sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(r + 16, z1); + zero[0] = u; + (void)sp_2048_add_16(r + 48, r + 48, zero); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "ldrd r3, r4, [%[a], #64]\n\t" + "ldrd r5, r6, [%[a], #72]\n\t" + "ldrd r7, r8, [%[b], #64]\n\t" + "ldrd r9, r10, [%[b], #72]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #64]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "ldrd r3, r4, [%[a], #80]\n\t" + "ldrd r5, r6, [%[a], #88]\n\t" + "ldrd r7, r8, [%[b], #80]\n\t" + "ldrd r9, r10, [%[b], #88]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #80]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + "ldrd r3, r4, [%[a], #96]\n\t" + "ldrd r5, r6, [%[a], #104]\n\t" + "ldrd r7, r8, [%[b], #96]\n\t" + "ldrd r9, r10, [%[b], #104]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #96]\n\t" + "strd r5, r6, [%[r], #104]\n\t" + "ldrd r3, r4, [%[a], #112]\n\t" + "ldrd r5, r6, [%[a], #120]\n\t" + "ldrd r7, r8, [%[b], #112]\n\t" + "ldrd r9, r10, [%[b], #120]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #112]\n\t" + "strd r5, r6, [%[r], #120]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; } /* Square a and put result in r. 
(r = a * a) @@ -1989,23 +2222,32 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit z1[64]; - sp_digit a1[32]; + sp_digit* a1 = z1; + sp_digit zero[32]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 32); + + mask = sp_2048_sub_32(a1, a, &a[32]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_32(a1, p1, p2); - u = sp_2048_add_32(a1, a, &a[32]); - sp_2048_sqr_32(z1, a1); sp_2048_sqr_32(z2, &a[32]); sp_2048_sqr_32(z0, a); - sp_2048_mask_32(r + 64, a1, 0 - u); - u += sp_2048_add_32(r + 64, r + 64, r + 64); - u += sp_2048_sub_in_place_64(z1, z2); - u += sp_2048_sub_in_place_64(z1, z0); - u += sp_2048_add_64(r + 32, r + 32, z1); - r[96] = u; - XMEMSET(r + 96 + 1, 0, sizeof(sp_digit) * (32 - 1)); - (void)sp_2048_add_64(r + 64, r + 64, z2); + sp_2048_sqr_32(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_64(z1, z2); + u -= sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(r + 32, z1); + zero[0] = u; + (void)sp_2048_add_32(r + 96, r + 96, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -3464,7 +3706,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_32(r, a, b); @@ -3478,7 +3720,7 @@ static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_32(r, a); @@ -3754,11 +3996,11 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -4239,7 +4481,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_2048_word_32(hi, t1[32 + i - 1], div); @@ -5506,7 +5748,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_64(r, a, b); @@ -5520,7 +5762,7 @@ static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_64(r, a); @@ -5747,11 +5989,11 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -5829,9 +6071,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + for (i = 63; i >= 0; i--) { + if (t1[64 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_2048_word_64(t1[64 + i], t1[64 + i - 1], div); + } sp_2048_mul_d_64(t2, d, r1); t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); @@ -6676,7 +6922,7 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { + for (i = 63; i >= 0; i--) { sp_digit hi = t1[64 + i] - (t1[64 + i] == div); r1 = div_2048_word_64(hi, t1[64 + i - 1], div); @@ -7063,9 +7309,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 64; r = a + 64 * 2; m = r + 64 * 2; - ah = a + 64; sp_2048_from_bin(ah, 64, in, inLen); #if DIGIT_BIT >= 32 @@ -7083,7 +7329,38 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_2048_from_mp(m, 64, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 64); + err = sp_2048_mod_64_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_64(r, r, ah, m, mp); + + for (i = 63; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_2048_sub_in_place_64(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_2048_sqr_64(r, ah); err = sp_2048_mod_64_cond(r, r, m); @@ -7111,7 +7388,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 64); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_2048_mont_sqr_64(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_2048_mont_mul_64(r, r, a, m, mp); @@ -7146,6 +7423,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -7159,29 +7437,46 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #128\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #128\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -7294,15 +7589,15 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "adcs r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "strd r4, r5, [%[r], #120]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ /* RSA private key operation. * @@ -9412,6 +9707,1228 @@ static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) ); } +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r14, #0\n\t" + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "adds r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "adc %[c], r14, r14\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + ); + + return c; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. 
+ * b A single precision integer. + */ +static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a], #0]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b], #0]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "subs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #0]\n\t" + "strd r4, r5, [%[a], #8]\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #16]\n\t" + "strd r4, r5, [%[a], #24]\n\t" + "ldrd r2, r3, [%[a], #32]\n\t" + "ldrd r4, r5, [%[a], #40]\n\t" + "ldrd r6, r7, [%[b], #32]\n\t" + "ldrd r8, r9, [%[b], #40]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #32]\n\t" + "strd r4, r5, [%[a], #40]\n\t" + "ldrd r2, r3, [%[a], #48]\n\t" + "ldrd r4, r5, [%[a], #56]\n\t" + "ldrd r6, r7, [%[b], #48]\n\t" + "ldrd r8, r9, [%[b], #56]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #48]\n\t" + "strd r4, r5, [%[a], #56]\n\t" + "ldrd r2, r3, [%[a], #64]\n\t" + "ldrd r4, r5, [%[a], #72]\n\t" + "ldrd r6, r7, [%[b], #64]\n\t" + "ldrd r8, r9, [%[b], #72]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #64]\n\t" + "strd r4, r5, [%[a], #72]\n\t" + "ldrd r2, r3, [%[a], #80]\n\t" + "ldrd r4, r5, [%[a], #88]\n\t" + "ldrd r6, r7, [%[b], #80]\n\t" + "ldrd r8, r9, [%[b], #88]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #80]\n\t" + "strd r4, r5, [%[a], #88]\n\t" + "sbc %[c], r9, r9\n\t" + : [c] "+r" (c) + : [a] "r" (a), [b] "r" (b) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); + + return c; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
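+ *
+ * Returns the carry out of the most significant word (0 or 1).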
+ */ +static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r14, #0\n\t" + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "adds r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "ldrd r3, r4, [%[a], #64]\n\t" + "ldrd r5, r6, [%[a], #72]\n\t" + "ldrd r7, r8, [%[b], #64]\n\t" + "ldrd r9, r10, [%[b], #72]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #64]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "ldrd r3, r4, [%[a], #80]\n\t" + "ldrd r5, r6, [%[a], #88]\n\t" + "ldrd r7, r8, [%[b], #80]\n\t" + "ldrd r9, r10, [%[b], #88]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #80]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + "adc %[c], r14, r14\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + ); + + return c; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<12; i++) { + r[i] = a[i] & m; + } +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; + r[8] = a[8] & m; + r[9] = a[9] & m; + r[10] = a[10] & m; + r[11] = a[11] & m; +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
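+ *
+ * Karatsuba: with a = a1 * 2^384 + a0 and b = b1 * 2^384 + b0,
+ *   a * b = z2 * 2^768 + z1 * 2^384 + z0
+ * where z0 = a0 * b0, z2 = a1 * b1 and
+ *   z1 = (a0 + a1) * (b0 + b1) - z0 - z2.
+ * z0 and z2 are written directly into r; the carries of the 12-word sums
+ * a0 + a1 and b0 + b1 are folded back in through the masked additions and
+ * the final carry word u.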
+ */ +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit* z2 = r + 24; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mul_12(z1, a1, b1); + + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(z1, z2); + sp_3072_mask_12(a1, a1, 0 - cb); + u += sp_3072_add_12(z1 + 12, z1 + 12, a1); + sp_3072_mask_12(b1, b1, 0 - ca); + u += sp_3072_add_12(z1 + 12, z1 + 12, b1); + + u += sp_3072_add_24(r + 12, r + 12, z1); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (12 - 1)); + a1[0] = u; + (void)sp_3072_add_12(r + 36, r + 36, a1); +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a], #0]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b], #0]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "subs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #0]\n\t" + "strd r4, r5, [%[a], #8]\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #16]\n\t" + "strd r4, r5, [%[a], #24]\n\t" + "ldrd r2, r3, [%[a], #32]\n\t" + "ldrd r4, r5, [%[a], #40]\n\t" + "ldrd r6, r7, [%[b], #32]\n\t" + "ldrd r8, r9, [%[b], #40]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #32]\n\t" + "strd r4, r5, [%[a], #40]\n\t" + "ldrd r2, r3, [%[a], #48]\n\t" + "ldrd r4, r5, [%[a], #56]\n\t" + "ldrd r6, r7, [%[b], #48]\n\t" + "ldrd r8, r9, [%[b], #56]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #48]\n\t" + "strd r4, r5, [%[a], #56]\n\t" + "ldrd r2, r3, [%[a], #64]\n\t" + "ldrd r4, r5, [%[a], #72]\n\t" + "ldrd r6, r7, [%[b], #64]\n\t" + "ldrd r8, r9, [%[b], #72]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #64]\n\t" + "strd r4, r5, [%[a], #72]\n\t" + "ldrd r2, r3, [%[a], #80]\n\t" + "ldrd r4, r5, [%[a], #88]\n\t" + "ldrd r6, r7, [%[b], #80]\n\t" + "ldrd r8, r9, [%[b], #88]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #80]\n\t" + "strd r4, r5, [%[a], #88]\n\t" + "ldrd r2, r3, [%[a], #96]\n\t" + "ldrd r4, r5, [%[a], #104]\n\t" + "ldrd r6, r7, [%[b], #96]\n\t" + "ldrd r8, r9, [%[b], #104]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #96]\n\t" + "strd r4, r5, [%[a], #104]\n\t" + "ldrd r2, r3, [%[a], #112]\n\t" + "ldrd r4, r5, [%[a], #120]\n\t" + "ldrd r6, r7, [%[b], #112]\n\t" + "ldrd r8, r9, [%[b], #120]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #112]\n\t" + "strd r4, r5, [%[a], #120]\n\t" + "ldrd r2, r3, [%[a], #128]\n\t" + "ldrd r4, r5, [%[a], #136]\n\t" + "ldrd r6, r7, [%[b], #128]\n\t" + "ldrd r8, r9, 
[%[b], #136]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #128]\n\t" + "strd r4, r5, [%[a], #136]\n\t" + "ldrd r2, r3, [%[a], #144]\n\t" + "ldrd r4, r5, [%[a], #152]\n\t" + "ldrd r6, r7, [%[b], #144]\n\t" + "ldrd r8, r9, [%[b], #152]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #144]\n\t" + "strd r4, r5, [%[a], #152]\n\t" + "ldrd r2, r3, [%[a], #160]\n\t" + "ldrd r4, r5, [%[a], #168]\n\t" + "ldrd r6, r7, [%[b], #160]\n\t" + "ldrd r8, r9, [%[b], #168]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #160]\n\t" + "strd r4, r5, [%[a], #168]\n\t" + "ldrd r2, r3, [%[a], #176]\n\t" + "ldrd r4, r5, [%[a], #184]\n\t" + "ldrd r6, r7, [%[b], #176]\n\t" + "ldrd r8, r9, [%[b], #184]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #176]\n\t" + "strd r4, r5, [%[a], #184]\n\t" + "sbc %[c], r9, r9\n\t" + : [c] "+r" (c) + : [a] "r" (a), [b] "r" (b) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); + + return c; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r14, #0\n\t" + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "adds r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "ldrd r3, r4, [%[a], #64]\n\t" + "ldrd r5, r6, [%[a], #72]\n\t" + "ldrd r7, r8, [%[b], #64]\n\t" + "ldrd r9, r10, [%[b], #72]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #64]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "ldrd r3, r4, [%[a], #80]\n\t" + "ldrd r5, r6, [%[a], #88]\n\t" + "ldrd r7, r8, [%[b], #80]\n\t" + "ldrd r9, r10, [%[b], #88]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #80]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + "ldrd r3, r4, [%[a], #96]\n\t" + "ldrd r5, r6, [%[a], #104]\n\t" + "ldrd r7, r8, [%[b], #96]\n\t" + "ldrd r9, r10, [%[b], #104]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, 
[%[r], #96]\n\t" + "strd r5, r6, [%[r], #104]\n\t" + "ldrd r3, r4, [%[a], #112]\n\t" + "ldrd r5, r6, [%[a], #120]\n\t" + "ldrd r7, r8, [%[b], #112]\n\t" + "ldrd r9, r10, [%[b], #120]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #112]\n\t" + "strd r5, r6, [%[r], #120]\n\t" + "ldrd r3, r4, [%[a], #128]\n\t" + "ldrd r5, r6, [%[a], #136]\n\t" + "ldrd r7, r8, [%[b], #128]\n\t" + "ldrd r9, r10, [%[b], #136]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #128]\n\t" + "strd r5, r6, [%[r], #136]\n\t" + "ldrd r3, r4, [%[a], #144]\n\t" + "ldrd r5, r6, [%[a], #152]\n\t" + "ldrd r7, r8, [%[b], #144]\n\t" + "ldrd r9, r10, [%[b], #152]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #144]\n\t" + "strd r5, r6, [%[r], #152]\n\t" + "ldrd r3, r4, [%[a], #160]\n\t" + "ldrd r5, r6, [%[a], #168]\n\t" + "ldrd r7, r8, [%[b], #160]\n\t" + "ldrd r9, r10, [%[b], #168]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #160]\n\t" + "strd r5, r6, [%[r], #168]\n\t" + "ldrd r3, r4, [%[a], #176]\n\t" + "ldrd r5, r6, [%[a], #184]\n\t" + "ldrd r7, r8, [%[b], #176]\n\t" + "ldrd r9, r10, [%[b], #184]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #176]\n\t" + "strd r5, r6, [%[r], #184]\n\t" + "adc %[c], r14, r14\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + ); + + return c; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit b1[24]; + sp_digit* z2 = r + 48; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_24(a1, a, &a[24]); + cb = sp_3072_add_24(b1, b, &b[24]); + u = ca & cb; + + sp_3072_mul_24(z2, &a[24], &b[24]); + sp_3072_mul_24(z0, a, b); + sp_3072_mul_24(z1, a1, b1); + + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(z1, z2); + sp_3072_mask_24(a1, a1, 0 - cb); + u += sp_3072_add_24(z1 + 24, z1 + 24, a1); + sp_3072_mask_24(b1, b1, 0 - ca); + u += sp_3072_add_24(z1 + 24, z1 + 24, b1); + + u += sp_3072_add_48(r + 24, r + 24, z1); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (24 - 1)); + a1[0] = u; + (void)sp_3072_add_24(r + 72, r + 72, a1); +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. 
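+ *
+ * Returns 0 when there is no borrow and -1 when there is.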
+ */ +static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a], #0]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b], #0]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "subs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #0]\n\t" + "strd r4, r5, [%[a], #8]\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #16]\n\t" + "strd r4, r5, [%[a], #24]\n\t" + "ldrd r2, r3, [%[a], #32]\n\t" + "ldrd r4, r5, [%[a], #40]\n\t" + "ldrd r6, r7, [%[b], #32]\n\t" + "ldrd r8, r9, [%[b], #40]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #32]\n\t" + "strd r4, r5, [%[a], #40]\n\t" + "ldrd r2, r3, [%[a], #48]\n\t" + "ldrd r4, r5, [%[a], #56]\n\t" + "ldrd r6, r7, [%[b], #48]\n\t" + "ldrd r8, r9, [%[b], #56]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #48]\n\t" + "strd r4, r5, [%[a], #56]\n\t" + "ldrd r2, r3, [%[a], #64]\n\t" + "ldrd r4, r5, [%[a], #72]\n\t" + "ldrd r6, r7, [%[b], #64]\n\t" + "ldrd r8, r9, [%[b], #72]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #64]\n\t" + "strd r4, r5, [%[a], #72]\n\t" + "ldrd r2, r3, [%[a], #80]\n\t" + "ldrd r4, r5, [%[a], #88]\n\t" + "ldrd r6, r7, [%[b], #80]\n\t" + "ldrd r8, r9, [%[b], #88]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #80]\n\t" + "strd r4, r5, [%[a], #88]\n\t" + "ldrd r2, r3, [%[a], #96]\n\t" + "ldrd r4, r5, [%[a], #104]\n\t" + "ldrd r6, r7, [%[b], #96]\n\t" + "ldrd r8, r9, [%[b], #104]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #96]\n\t" + "strd r4, r5, [%[a], #104]\n\t" + "ldrd r2, r3, [%[a], #112]\n\t" + "ldrd r4, r5, [%[a], #120]\n\t" + "ldrd r6, r7, [%[b], #112]\n\t" + "ldrd r8, r9, [%[b], #120]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #112]\n\t" + "strd r4, r5, [%[a], #120]\n\t" + "ldrd r2, r3, [%[a], #128]\n\t" + "ldrd r4, r5, [%[a], #136]\n\t" + "ldrd r6, r7, [%[b], #128]\n\t" + "ldrd r8, r9, [%[b], #136]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #128]\n\t" + "strd r4, r5, [%[a], #136]\n\t" + "ldrd r2, r3, [%[a], #144]\n\t" + "ldrd r4, r5, [%[a], #152]\n\t" + "ldrd r6, r7, [%[b], #144]\n\t" + "ldrd r8, r9, [%[b], #152]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #144]\n\t" + "strd r4, r5, [%[a], #152]\n\t" + "ldrd r2, r3, [%[a], #160]\n\t" + "ldrd r4, r5, [%[a], #168]\n\t" + "ldrd r6, r7, [%[b], #160]\n\t" + "ldrd r8, r9, [%[b], #168]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #160]\n\t" + "strd r4, r5, [%[a], #168]\n\t" + "ldrd r2, r3, [%[a], #176]\n\t" + "ldrd r4, r5, [%[a], #184]\n\t" + "ldrd r6, r7, [%[b], #176]\n\t" + "ldrd r8, r9, [%[b], #184]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, 
r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #176]\n\t" + "strd r4, r5, [%[a], #184]\n\t" + "ldrd r2, r3, [%[a], #192]\n\t" + "ldrd r4, r5, [%[a], #200]\n\t" + "ldrd r6, r7, [%[b], #192]\n\t" + "ldrd r8, r9, [%[b], #200]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #192]\n\t" + "strd r4, r5, [%[a], #200]\n\t" + "ldrd r2, r3, [%[a], #208]\n\t" + "ldrd r4, r5, [%[a], #216]\n\t" + "ldrd r6, r7, [%[b], #208]\n\t" + "ldrd r8, r9, [%[b], #216]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #208]\n\t" + "strd r4, r5, [%[a], #216]\n\t" + "ldrd r2, r3, [%[a], #224]\n\t" + "ldrd r4, r5, [%[a], #232]\n\t" + "ldrd r6, r7, [%[b], #224]\n\t" + "ldrd r8, r9, [%[b], #232]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #224]\n\t" + "strd r4, r5, [%[a], #232]\n\t" + "ldrd r2, r3, [%[a], #240]\n\t" + "ldrd r4, r5, [%[a], #248]\n\t" + "ldrd r6, r7, [%[b], #240]\n\t" + "ldrd r8, r9, [%[b], #248]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #240]\n\t" + "strd r4, r5, [%[a], #248]\n\t" + "ldrd r2, r3, [%[a], #256]\n\t" + "ldrd r4, r5, [%[a], #264]\n\t" + "ldrd r6, r7, [%[b], #256]\n\t" + "ldrd r8, r9, [%[b], #264]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #256]\n\t" + "strd r4, r5, [%[a], #264]\n\t" + "ldrd r2, r3, [%[a], #272]\n\t" + "ldrd r4, r5, [%[a], #280]\n\t" + "ldrd r6, r7, [%[b], #272]\n\t" + "ldrd r8, r9, [%[b], #280]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #272]\n\t" + "strd r4, r5, [%[a], #280]\n\t" + "ldrd r2, r3, [%[a], #288]\n\t" + "ldrd r4, r5, [%[a], #296]\n\t" + "ldrd r6, r7, [%[b], #288]\n\t" + "ldrd r8, r9, [%[b], #296]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #288]\n\t" + "strd r4, r5, [%[a], #296]\n\t" + "ldrd r2, r3, [%[a], #304]\n\t" + "ldrd r4, r5, [%[a], #312]\n\t" + "ldrd r6, r7, [%[b], #304]\n\t" + "ldrd r8, r9, [%[b], #312]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #304]\n\t" + "strd r4, r5, [%[a], #312]\n\t" + "ldrd r2, r3, [%[a], #320]\n\t" + "ldrd r4, r5, [%[a], #328]\n\t" + "ldrd r6, r7, [%[b], #320]\n\t" + "ldrd r8, r9, [%[b], #328]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #320]\n\t" + "strd r4, r5, [%[a], #328]\n\t" + "ldrd r2, r3, [%[a], #336]\n\t" + "ldrd r4, r5, [%[a], #344]\n\t" + "ldrd r6, r7, [%[b], #336]\n\t" + "ldrd r8, r9, [%[b], #344]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #336]\n\t" + "strd r4, r5, [%[a], #344]\n\t" + "ldrd r2, r3, [%[a], #352]\n\t" + "ldrd r4, r5, [%[a], #360]\n\t" + "ldrd r6, r7, [%[b], #352]\n\t" + "ldrd r8, r9, [%[b], #360]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #352]\n\t" + "strd r4, r5, [%[a], #360]\n\t" + "ldrd r2, r3, [%[a], #368]\n\t" + "ldrd r4, r5, [%[a], #376]\n\t" + "ldrd r6, r7, [%[b], #368]\n\t" + "ldrd r8, r9, [%[b], 
#376]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "strd r2, r3, [%[a], #368]\n\t" + "strd r4, r5, [%[a], #376]\n\t" + "sbc %[c], r9, r9\n\t" + : [c] "+r" (c) + : [a] "r" (a), [b] "r" (b) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); + + return c; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r14, #0\n\t" + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "adds r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "ldrd r3, r4, [%[a], #64]\n\t" + "ldrd r5, r6, [%[a], #72]\n\t" + "ldrd r7, r8, [%[b], #64]\n\t" + "ldrd r9, r10, [%[b], #72]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #64]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "ldrd r3, r4, [%[a], #80]\n\t" + "ldrd r5, r6, [%[a], #88]\n\t" + "ldrd r7, r8, [%[b], #80]\n\t" + "ldrd r9, r10, [%[b], #88]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #80]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + "ldrd r3, r4, [%[a], #96]\n\t" + "ldrd r5, r6, [%[a], #104]\n\t" + "ldrd r7, r8, [%[b], #96]\n\t" + "ldrd r9, r10, [%[b], #104]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #96]\n\t" + "strd r5, r6, [%[r], #104]\n\t" + "ldrd r3, r4, [%[a], #112]\n\t" + "ldrd r5, r6, [%[a], #120]\n\t" + "ldrd r7, r8, [%[b], #112]\n\t" + "ldrd r9, r10, [%[b], #120]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #112]\n\t" + "strd r5, r6, [%[r], #120]\n\t" + "ldrd r3, r4, [%[a], #128]\n\t" + "ldrd r5, r6, [%[a], #136]\n\t" + "ldrd r7, r8, [%[b], #128]\n\t" + "ldrd r9, r10, [%[b], #136]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #128]\n\t" + "strd r5, r6, [%[r], #136]\n\t" + "ldrd r3, r4, [%[a], #144]\n\t" + "ldrd r5, r6, [%[a], #152]\n\t" + "ldrd r7, r8, [%[b], #144]\n\t" + "ldrd r9, r10, [%[b], #152]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, 
[%[r], #144]\n\t" + "strd r5, r6, [%[r], #152]\n\t" + "ldrd r3, r4, [%[a], #160]\n\t" + "ldrd r5, r6, [%[a], #168]\n\t" + "ldrd r7, r8, [%[b], #160]\n\t" + "ldrd r9, r10, [%[b], #168]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #160]\n\t" + "strd r5, r6, [%[r], #168]\n\t" + "ldrd r3, r4, [%[a], #176]\n\t" + "ldrd r5, r6, [%[a], #184]\n\t" + "ldrd r7, r8, [%[b], #176]\n\t" + "ldrd r9, r10, [%[b], #184]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #176]\n\t" + "strd r5, r6, [%[r], #184]\n\t" + "ldrd r3, r4, [%[a], #192]\n\t" + "ldrd r5, r6, [%[a], #200]\n\t" + "ldrd r7, r8, [%[b], #192]\n\t" + "ldrd r9, r10, [%[b], #200]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #192]\n\t" + "strd r5, r6, [%[r], #200]\n\t" + "ldrd r3, r4, [%[a], #208]\n\t" + "ldrd r5, r6, [%[a], #216]\n\t" + "ldrd r7, r8, [%[b], #208]\n\t" + "ldrd r9, r10, [%[b], #216]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #208]\n\t" + "strd r5, r6, [%[r], #216]\n\t" + "ldrd r3, r4, [%[a], #224]\n\t" + "ldrd r5, r6, [%[a], #232]\n\t" + "ldrd r7, r8, [%[b], #224]\n\t" + "ldrd r9, r10, [%[b], #232]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #224]\n\t" + "strd r5, r6, [%[r], #232]\n\t" + "ldrd r3, r4, [%[a], #240]\n\t" + "ldrd r5, r6, [%[a], #248]\n\t" + "ldrd r7, r8, [%[b], #240]\n\t" + "ldrd r9, r10, [%[b], #248]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #240]\n\t" + "strd r5, r6, [%[r], #248]\n\t" + "ldrd r3, r4, [%[a], #256]\n\t" + "ldrd r5, r6, [%[a], #264]\n\t" + "ldrd r7, r8, [%[b], #256]\n\t" + "ldrd r9, r10, [%[b], #264]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #256]\n\t" + "strd r5, r6, [%[r], #264]\n\t" + "ldrd r3, r4, [%[a], #272]\n\t" + "ldrd r5, r6, [%[a], #280]\n\t" + "ldrd r7, r8, [%[b], #272]\n\t" + "ldrd r9, r10, [%[b], #280]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #272]\n\t" + "strd r5, r6, [%[r], #280]\n\t" + "ldrd r3, r4, [%[a], #288]\n\t" + "ldrd r5, r6, [%[a], #296]\n\t" + "ldrd r7, r8, [%[b], #288]\n\t" + "ldrd r9, r10, [%[b], #296]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #288]\n\t" + "strd r5, r6, [%[r], #296]\n\t" + "ldrd r3, r4, [%[a], #304]\n\t" + "ldrd r5, r6, [%[a], #312]\n\t" + "ldrd r7, r8, [%[b], #304]\n\t" + "ldrd r9, r10, [%[b], #312]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #304]\n\t" + "strd r5, r6, [%[r], #312]\n\t" + "ldrd r3, r4, [%[a], #320]\n\t" + "ldrd r5, r6, [%[a], #328]\n\t" + "ldrd r7, r8, [%[b], #320]\n\t" + "ldrd r9, r10, [%[b], #328]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #320]\n\t" + "strd r5, r6, [%[r], #328]\n\t" + "ldrd r3, r4, [%[a], #336]\n\t" + "ldrd r5, r6, [%[a], #344]\n\t" + "ldrd r7, r8, [%[b], #336]\n\t" + "ldrd r9, r10, [%[b], #344]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, 
r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #336]\n\t" + "strd r5, r6, [%[r], #344]\n\t" + "ldrd r3, r4, [%[a], #352]\n\t" + "ldrd r5, r6, [%[a], #360]\n\t" + "ldrd r7, r8, [%[b], #352]\n\t" + "ldrd r9, r10, [%[b], #360]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #352]\n\t" + "strd r5, r6, [%[r], #360]\n\t" + "ldrd r3, r4, [%[a], #368]\n\t" + "ldrd r5, r6, [%[a], #376]\n\t" + "ldrd r7, r8, [%[b], #368]\n\t" + "ldrd r9, r10, [%[b], #376]\n\t" + "adcs r3, r3, r7\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #368]\n\t" + "strd r5, r6, [%[r], #376]\n\t" + "adc %[c], r14, r14\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + ); + + return c; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<48; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 48; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[96]; + sp_digit a1[48]; + sp_digit b1[48]; + sp_digit* z2 = r + 96; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_48(a1, a, &a[48]); + cb = sp_3072_add_48(b1, b, &b[48]); + u = ca & cb; + + sp_3072_mul_48(z2, &a[48], &b[48]); + sp_3072_mul_48(z0, a, b); + sp_3072_mul_48(z1, a1, b1); + + u += sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(z1, z2); + sp_3072_mask_48(a1, a1, 0 - cb); + u += sp_3072_add_48(z1 + 48, z1 + 48, a1); + sp_3072_mask_48(b1, b1, 0 - ca); + u += sp_3072_add_48(z1 + 48, z1 + 48, b1); + + u += sp_3072_add_96(r + 48, r + 48, z1); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (48 - 1)); + a1[0] = u; + (void)sp_3072_add_48(r + 144, r + 144, a1); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -10091,284 +11608,57 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) ); } -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, +static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; __asm__ __volatile__ ( - "mov r14, #0\n\t" "ldrd r3, r4, [%[a], #0]\n\t" "ldrd r5, r6, [%[a], #8]\n\t" "ldrd r7, r8, [%[b], #0]\n\t" "ldrd r9, r10, [%[b], #8]\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #0]\n\t" "strd r5, r6, [%[r], #8]\n\t" "ldrd r3, r4, [%[a], #16]\n\t" "ldrd r5, r6, [%[a], #24]\n\t" "ldrd r7, r8, [%[b], #16]\n\t" "ldrd r9, r10, [%[b], #24]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #16]\n\t" "strd r5, r6, [%[r], #24]\n\t" "ldrd r3, r4, [%[a], #32]\n\t" "ldrd r5, r6, [%[a], #40]\n\t" "ldrd r7, r8, [%[b], #32]\n\t" "ldrd r9, r10, [%[b], #40]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #32]\n\t" "strd r5, r6, [%[r], #40]\n\t" - "adc %[c], r14, r14\n\t" + "sbc %[c], %[c], #0\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return c; } -/* Sub b from a into a. (a -= b) - * - * a A single precision integer and result. - * b A single precision integer. - */ -static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldrd r2, r3, [%[a], #0]\n\t" - "ldrd r4, r5, [%[a], #8]\n\t" - "ldrd r6, r7, [%[b], #0]\n\t" - "ldrd r8, r9, [%[b], #8]\n\t" - "subs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #0]\n\t" - "strd r4, r5, [%[a], #8]\n\t" - "ldrd r2, r3, [%[a], #16]\n\t" - "ldrd r4, r5, [%[a], #24]\n\t" - "ldrd r6, r7, [%[b], #16]\n\t" - "ldrd r8, r9, [%[b], #24]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #16]\n\t" - "strd r4, r5, [%[a], #24]\n\t" - "ldrd r2, r3, [%[a], #32]\n\t" - "ldrd r4, r5, [%[a], #40]\n\t" - "ldrd r6, r7, [%[b], #32]\n\t" - "ldrd r8, r9, [%[b], #40]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #32]\n\t" - "strd r4, r5, [%[a], #40]\n\t" - "ldrd r2, r3, [%[a], #48]\n\t" - "ldrd r4, r5, [%[a], #56]\n\t" - "ldrd r6, r7, [%[b], #48]\n\t" - "ldrd r8, r9, [%[b], #56]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #48]\n\t" - "strd r4, r5, [%[a], #56]\n\t" - "ldrd r2, r3, [%[a], #64]\n\t" - "ldrd r4, r5, [%[a], #72]\n\t" - "ldrd r6, r7, [%[b], #64]\n\t" - "ldrd r8, r9, [%[b], #72]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #64]\n\t" - "strd r4, r5, [%[a], #72]\n\t" - "ldrd r2, r3, [%[a], #80]\n\t" - "ldrd r4, r5, [%[a], #88]\n\t" - "ldrd r6, r7, [%[b], #80]\n\t" - "ldrd r8, r9, [%[b], #88]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, 
[%[a], #80]\n\t" - "strd r4, r5, [%[a], #88]\n\t" - "sbc %[c], r9, r9\n\t" - : [c] "+r" (c) - : [a] "r" (a), [b] "r" (b) - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov r14, #0\n\t" - "ldrd r3, r4, [%[a], #0]\n\t" - "ldrd r5, r6, [%[a], #8]\n\t" - "ldrd r7, r8, [%[b], #0]\n\t" - "ldrd r9, r10, [%[b], #8]\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #0]\n\t" - "strd r5, r6, [%[r], #8]\n\t" - "ldrd r3, r4, [%[a], #16]\n\t" - "ldrd r5, r6, [%[a], #24]\n\t" - "ldrd r7, r8, [%[b], #16]\n\t" - "ldrd r9, r10, [%[b], #24]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #16]\n\t" - "strd r5, r6, [%[r], #24]\n\t" - "ldrd r3, r4, [%[a], #32]\n\t" - "ldrd r5, r6, [%[a], #40]\n\t" - "ldrd r7, r8, [%[b], #32]\n\t" - "ldrd r9, r10, [%[b], #40]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #32]\n\t" - "strd r5, r6, [%[r], #40]\n\t" - "ldrd r3, r4, [%[a], #48]\n\t" - "ldrd r5, r6, [%[a], #56]\n\t" - "ldrd r7, r8, [%[b], #48]\n\t" - "ldrd r9, r10, [%[b], #56]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #48]\n\t" - "strd r5, r6, [%[r], #56]\n\t" - "ldrd r3, r4, [%[a], #64]\n\t" - "ldrd r5, r6, [%[a], #72]\n\t" - "ldrd r7, r8, [%[b], #64]\n\t" - "ldrd r9, r10, [%[b], #72]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #64]\n\t" - "strd r5, r6, [%[r], #72]\n\t" - "ldrd r3, r4, [%[a], #80]\n\t" - "ldrd r5, r6, [%[a], #88]\n\t" - "ldrd r7, r8, [%[b], #80]\n\t" - "ldrd r9, r10, [%[b], #88]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #80]\n\t" - "strd r5, r6, [%[r], #88]\n\t" - "adc %[c], r14, r14\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" - ); - - return c; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<12; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; - r[8] = a[8] & m; - r[9] = a[9] & m; - r[10] = a[10] & m; - r[11] = a[11] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[24]; - sp_digit a1[12]; - sp_digit b1[12]; - sp_digit z2[24]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_12(a1, a, &a[12]); - cb = sp_3072_add_12(b1, b, &b[12]); - u = ca & cb; - sp_3072_mul_12(z1, a1, b1); - sp_3072_mul_12(z2, &a[12], &b[12]); - sp_3072_mul_12(z0, a, b); - sp_3072_mask_12(r + 24, a1, 0 - cb); - sp_3072_mask_12(b1, b1, 0 - ca); - u += sp_3072_add_12(r + 24, r + 24, b1); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - r[36] = u; - XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); - (void)sp_3072_add_24(r + 24, r + 24, z2); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -10377,371 +11667,115 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[24]; + sp_digit* z2 = r + 24; sp_digit z1[24]; - sp_digit a1[12]; + sp_digit* a1 = z1; + sp_digit zero[12]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 12); + + mask = sp_3072_sub_12(a1, a, &a[12]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_12(a1, p1, p2); - u = sp_3072_add_12(a1, a, &a[12]); - sp_3072_sqr_12(z1, a1); sp_3072_sqr_12(z2, &a[12]); sp_3072_sqr_12(z0, a); - sp_3072_mask_12(r + 24, a1, 0 - u); - u += sp_3072_add_12(r + 24, r + 24, r + 24); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - r[36] = u; - XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); - (void)sp_3072_add_24(r + 24, r + 24, z2); + sp_3072_sqr_12(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_24(z1, z2); + u -= sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(r + 12, z1); + zero[0] = u; + (void)sp_3072_add_12(r + 36, r + 36, zero); } -/* Sub b from a into a. (a -= b) - * - * a A single precision integer and result. - * b A single precision integer. 
- */ -static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldrd r2, r3, [%[a], #0]\n\t" - "ldrd r4, r5, [%[a], #8]\n\t" - "ldrd r6, r7, [%[b], #0]\n\t" - "ldrd r8, r9, [%[b], #8]\n\t" - "subs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #0]\n\t" - "strd r4, r5, [%[a], #8]\n\t" - "ldrd r2, r3, [%[a], #16]\n\t" - "ldrd r4, r5, [%[a], #24]\n\t" - "ldrd r6, r7, [%[b], #16]\n\t" - "ldrd r8, r9, [%[b], #24]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #16]\n\t" - "strd r4, r5, [%[a], #24]\n\t" - "ldrd r2, r3, [%[a], #32]\n\t" - "ldrd r4, r5, [%[a], #40]\n\t" - "ldrd r6, r7, [%[b], #32]\n\t" - "ldrd r8, r9, [%[b], #40]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #32]\n\t" - "strd r4, r5, [%[a], #40]\n\t" - "ldrd r2, r3, [%[a], #48]\n\t" - "ldrd r4, r5, [%[a], #56]\n\t" - "ldrd r6, r7, [%[b], #48]\n\t" - "ldrd r8, r9, [%[b], #56]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #48]\n\t" - "strd r4, r5, [%[a], #56]\n\t" - "ldrd r2, r3, [%[a], #64]\n\t" - "ldrd r4, r5, [%[a], #72]\n\t" - "ldrd r6, r7, [%[b], #64]\n\t" - "ldrd r8, r9, [%[b], #72]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #64]\n\t" - "strd r4, r5, [%[a], #72]\n\t" - "ldrd r2, r3, [%[a], #80]\n\t" - "ldrd r4, r5, [%[a], #88]\n\t" - "ldrd r6, r7, [%[b], #80]\n\t" - "ldrd r8, r9, [%[b], #88]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #80]\n\t" - "strd r4, r5, [%[a], #88]\n\t" - "ldrd r2, r3, [%[a], #96]\n\t" - "ldrd r4, r5, [%[a], #104]\n\t" - "ldrd r6, r7, [%[b], #96]\n\t" - "ldrd r8, r9, [%[b], #104]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #96]\n\t" - "strd r4, r5, [%[a], #104]\n\t" - "ldrd r2, r3, [%[a], #112]\n\t" - "ldrd r4, r5, [%[a], #120]\n\t" - "ldrd r6, r7, [%[b], #112]\n\t" - "ldrd r8, r9, [%[b], #120]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #112]\n\t" - "strd r4, r5, [%[a], #120]\n\t" - "ldrd r2, r3, [%[a], #128]\n\t" - "ldrd r4, r5, [%[a], #136]\n\t" - "ldrd r6, r7, [%[b], #128]\n\t" - "ldrd r8, r9, [%[b], #136]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #128]\n\t" - "strd r4, r5, [%[a], #136]\n\t" - "ldrd r2, r3, [%[a], #144]\n\t" - "ldrd r4, r5, [%[a], #152]\n\t" - "ldrd r6, r7, [%[b], #144]\n\t" - "ldrd r8, r9, [%[b], #152]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #144]\n\t" - "strd r4, r5, [%[a], #152]\n\t" - "ldrd r2, r3, [%[a], #160]\n\t" - "ldrd r4, r5, [%[a], #168]\n\t" - "ldrd r6, r7, [%[b], #160]\n\t" - "ldrd r8, r9, [%[b], #168]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #160]\n\t" - "strd r4, r5, [%[a], #168]\n\t" - "ldrd r2, r3, [%[a], #176]\n\t" - "ldrd r4, r5, [%[a], #184]\n\t" - "ldrd r6, r7, [%[b], #176]\n\t" - "ldrd r8, r9, [%[b], #184]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, 
r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #176]\n\t" - "strd r4, r5, [%[a], #184]\n\t" - "sbc %[c], r9, r9\n\t" - : [c] "+r" (c) - : [a] "r" (a), [b] "r" (b) - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, +static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; __asm__ __volatile__ ( - "mov r14, #0\n\t" "ldrd r3, r4, [%[a], #0]\n\t" "ldrd r5, r6, [%[a], #8]\n\t" "ldrd r7, r8, [%[b], #0]\n\t" "ldrd r9, r10, [%[b], #8]\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #0]\n\t" "strd r5, r6, [%[r], #8]\n\t" "ldrd r3, r4, [%[a], #16]\n\t" "ldrd r5, r6, [%[a], #24]\n\t" "ldrd r7, r8, [%[b], #16]\n\t" "ldrd r9, r10, [%[b], #24]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #16]\n\t" "strd r5, r6, [%[r], #24]\n\t" "ldrd r3, r4, [%[a], #32]\n\t" "ldrd r5, r6, [%[a], #40]\n\t" "ldrd r7, r8, [%[b], #32]\n\t" "ldrd r9, r10, [%[b], #40]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #32]\n\t" "strd r5, r6, [%[r], #40]\n\t" "ldrd r3, r4, [%[a], #48]\n\t" "ldrd r5, r6, [%[a], #56]\n\t" "ldrd r7, r8, [%[b], #48]\n\t" "ldrd r9, r10, [%[b], #56]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #48]\n\t" "strd r5, r6, [%[r], #56]\n\t" "ldrd r3, r4, [%[a], #64]\n\t" "ldrd r5, r6, [%[a], #72]\n\t" "ldrd r7, r8, [%[b], #64]\n\t" "ldrd r9, r10, [%[b], #72]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #64]\n\t" "strd r5, r6, [%[r], #72]\n\t" "ldrd r3, r4, [%[a], #80]\n\t" "ldrd r5, r6, [%[a], #88]\n\t" "ldrd r7, r8, [%[b], #80]\n\t" "ldrd r9, r10, [%[b], #88]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #80]\n\t" "strd r5, r6, [%[r], #88]\n\t" - "ldrd r3, r4, [%[a], #96]\n\t" - "ldrd r5, r6, [%[a], #104]\n\t" - "ldrd r7, r8, [%[b], #96]\n\t" - "ldrd r9, r10, [%[b], #104]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #96]\n\t" - "strd r5, r6, [%[r], #104]\n\t" - "ldrd r3, r4, [%[a], #112]\n\t" - "ldrd r5, r6, [%[a], #120]\n\t" - "ldrd r7, r8, [%[b], #112]\n\t" - "ldrd r9, r10, [%[b], #120]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #112]\n\t" - "strd r5, r6, [%[r], #120]\n\t" - "ldrd 
r3, r4, [%[a], #128]\n\t" - "ldrd r5, r6, [%[a], #136]\n\t" - "ldrd r7, r8, [%[b], #128]\n\t" - "ldrd r9, r10, [%[b], #136]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #128]\n\t" - "strd r5, r6, [%[r], #136]\n\t" - "ldrd r3, r4, [%[a], #144]\n\t" - "ldrd r5, r6, [%[a], #152]\n\t" - "ldrd r7, r8, [%[b], #144]\n\t" - "ldrd r9, r10, [%[b], #152]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #144]\n\t" - "strd r5, r6, [%[r], #152]\n\t" - "ldrd r3, r4, [%[a], #160]\n\t" - "ldrd r5, r6, [%[a], #168]\n\t" - "ldrd r7, r8, [%[b], #160]\n\t" - "ldrd r9, r10, [%[b], #168]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #160]\n\t" - "strd r5, r6, [%[r], #168]\n\t" - "ldrd r3, r4, [%[a], #176]\n\t" - "ldrd r5, r6, [%[a], #184]\n\t" - "ldrd r7, r8, [%[b], #176]\n\t" - "ldrd r9, r10, [%[b], #184]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #176]\n\t" - "strd r5, r6, [%[r], #184]\n\t" - "adc %[c], r14, r14\n\t" + "sbc %[c], %[c], #0\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return c; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<24; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 24; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[48]; - sp_digit a1[24]; - sp_digit b1[24]; - sp_digit z2[48]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_24(a1, a, &a[24]); - cb = sp_3072_add_24(b1, b, &b[24]); - u = ca & cb; - sp_3072_mul_24(z1, a1, b1); - sp_3072_mul_24(z2, &a[24], &b[24]); - sp_3072_mul_24(z0, a, b); - sp_3072_mask_24(r + 48, a1, 0 - cb); - sp_3072_mask_24(b1, b1, 0 - ca); - u += sp_3072_add_24(r + 48, r + 48, b1); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - r[72] = u; - XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); - (void)sp_3072_add_48(r + 48, r + 48, z2); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
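The rewritten sp_3072_sqr_24 above (and sp_3072_sqr_48/sp_3072_sqr_96 in the following hunks) change the Karatsuba squaring from summing the two halves, which can carry out of the half width, to squaring their absolute difference: with a = lo + hi*B^n the middle term becomes lo^2 + hi^2 - (lo - hi)^2, so no conditional carry fix-up is needed, and |lo - hi| is selected by masking rather than branching so the path stays constant time. Below is a minimal portable sketch of that idea; sub_n, sqr_n and sqr_2k are illustrative names (not the generated routines), the |lo - hi| selection is done word-wise instead of with the masked-pointer trick the patch uses, and the middle term is kept in a non-negative buffer instead of the negative-accumulator form of the assembly-backed code.

#include <stdint.h>
#include <string.h>

#define KN 12  /* digits per half; sp_3072_sqr_24 squares 2*KN = 24 32-bit digits */

/* r = a - b over n digits; returns 0 on no borrow, 0xffffffff on borrow
 * (the same 0 / -1 convention the assembly gets from "sbc c, c, c"). */
static uint32_t sub_n(uint32_t* r, const uint32_t* a, const uint32_t* b, int n)
{
    uint64_t borrow = 0;
    int i;
    for (i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;
    }
    return (uint32_t)0 - (uint32_t)borrow;
}

/* Schoolbook square: r (2n digits) = a (n digits) squared. */
static void sqr_n(uint32_t* r, const uint32_t* a, int n)
{
    int i, j;
    memset(r, 0, sizeof(uint32_t) * 2 * (size_t)n);
    for (i = 0; i < n; i++) {
        uint64_t carry = 0;
        for (j = 0; j < n; j++) {
            uint64_t t = (uint64_t)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (uint32_t)t;
            carry = t >> 32;
        }
        r[i + n] = (uint32_t)carry;
    }
}

/* r (4*KN digits) = a (2*KN digits) squared; r must not overlap a.
 * a = lo + hi*B^KN, a^2 = lo^2 + (lo^2 + hi^2 - (lo-hi)^2)*B^KN + hi^2*B^(2*KN).
 * lo^2 and hi^2 land directly in r; only the middle term is added afterwards. */
static void sqr_2k(uint32_t* r, const uint32_t* a)
{
    uint32_t* z0 = r;            /* lo^2 */
    uint32_t* z2 = r + 2 * KN;   /* hi^2 */
    uint32_t z1[2 * KN];         /* |lo - hi|^2 */
    uint32_t d[KN];
    uint32_t nd[KN];
    uint32_t mid[2 * KN + 1];
    uint32_t mask;
    uint64_t carry = 0;
    uint64_t borrow = 0;
    int i;

    /* d = |lo - hi|, chosen by masking instead of branching on the borrow. */
    mask = sub_n(d, a, a + KN, KN);        /* lo - hi; mask = -1 if it went negative */
    (void)sub_n(nd, a + KN, a, KN);        /* hi - lo */
    for (i = 0; i < KN; i++) {
        d[i] = (d[i] & ~mask) | (nd[i] & mask);
    }

    sqr_n(z2, a + KN, KN);
    sqr_n(z0, a, KN);
    sqr_n(z1, d, KN);

    /* mid = z0 + z2 - z1 = 2*lo*hi; never negative, so no sign handling. */
    for (i = 0; i < 2 * KN; i++) {
        uint64_t t = (uint64_t)z0[i] + z2[i] + carry;
        mid[i] = (uint32_t)t;
        carry = t >> 32;
    }
    mid[2 * KN] = (uint32_t)carry;
    for (i = 0; i < 2 * KN; i++) {
        uint64_t t = (uint64_t)mid[i] - z1[i] - borrow;
        mid[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;
    }
    mid[2 * KN] -= (uint32_t)borrow;

    /* r += mid * B^KN; the result is exact, so the final carry dies out. */
    carry = 0;
    for (i = 0; i <= 2 * KN; i++) {
        uint64_t t = (uint64_t)r[KN + i] + mid[i] + carry;
        r[KN + i] = (uint32_t)t;
        carry = t >> 32;
    }
    for (i = 3 * KN + 1; carry != 0 && i < 4 * KN; i++) {
        uint64_t t = (uint64_t)r[i] + carry;
        r[i] = (uint32_t)t;
        carry = t >> 32;
    }
}

Because every partial-product operand stays within KN digits, the overflow word that the old (lo + hi) form had to mask and double disappears, which is the saving the hunks above and below are after.
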
@@ -10750,611 +11784,175 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[48]; + sp_digit* z2 = r + 48; sp_digit z1[48]; - sp_digit a1[24]; + sp_digit* a1 = z1; + sp_digit zero[24]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 24); + + mask = sp_3072_sub_24(a1, a, &a[24]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_24(a1, p1, p2); - u = sp_3072_add_24(a1, a, &a[24]); - sp_3072_sqr_24(z1, a1); sp_3072_sqr_24(z2, &a[24]); sp_3072_sqr_24(z0, a); - sp_3072_mask_24(r + 48, a1, 0 - u); - u += sp_3072_add_24(r + 48, r + 48, r + 48); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - r[72] = u; - XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); - (void)sp_3072_add_48(r + 48, r + 48, z2); + sp_3072_sqr_24(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_48(z1, z2); + u -= sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(r + 24, z1); + zero[0] = u; + (void)sp_3072_add_24(r + 72, r + 72, zero); } -/* Sub b from a into a. (a -= b) - * - * a A single precision integer and result. - * b A single precision integer. - */ -static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldrd r2, r3, [%[a], #0]\n\t" - "ldrd r4, r5, [%[a], #8]\n\t" - "ldrd r6, r7, [%[b], #0]\n\t" - "ldrd r8, r9, [%[b], #8]\n\t" - "subs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #0]\n\t" - "strd r4, r5, [%[a], #8]\n\t" - "ldrd r2, r3, [%[a], #16]\n\t" - "ldrd r4, r5, [%[a], #24]\n\t" - "ldrd r6, r7, [%[b], #16]\n\t" - "ldrd r8, r9, [%[b], #24]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #16]\n\t" - "strd r4, r5, [%[a], #24]\n\t" - "ldrd r2, r3, [%[a], #32]\n\t" - "ldrd r4, r5, [%[a], #40]\n\t" - "ldrd r6, r7, [%[b], #32]\n\t" - "ldrd r8, r9, [%[b], #40]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #32]\n\t" - "strd r4, r5, [%[a], #40]\n\t" - "ldrd r2, r3, [%[a], #48]\n\t" - "ldrd r4, r5, [%[a], #56]\n\t" - "ldrd r6, r7, [%[b], #48]\n\t" - "ldrd r8, r9, [%[b], #56]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #48]\n\t" - "strd r4, r5, [%[a], #56]\n\t" - "ldrd r2, r3, [%[a], #64]\n\t" - "ldrd r4, r5, [%[a], #72]\n\t" - "ldrd r6, r7, [%[b], #64]\n\t" - "ldrd r8, r9, [%[b], #72]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #64]\n\t" - "strd r4, r5, [%[a], #72]\n\t" - "ldrd r2, r3, [%[a], #80]\n\t" - "ldrd r4, r5, [%[a], #88]\n\t" - "ldrd r6, r7, [%[b], #80]\n\t" - "ldrd r8, r9, [%[b], #88]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #80]\n\t" - "strd r4, r5, [%[a], #88]\n\t" - "ldrd r2, r3, [%[a], #96]\n\t" - "ldrd r4, r5, [%[a], #104]\n\t" - "ldrd r6, r7, [%[b], #96]\n\t" - "ldrd r8, r9, [%[b], #104]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], 
#96]\n\t" - "strd r4, r5, [%[a], #104]\n\t" - "ldrd r2, r3, [%[a], #112]\n\t" - "ldrd r4, r5, [%[a], #120]\n\t" - "ldrd r6, r7, [%[b], #112]\n\t" - "ldrd r8, r9, [%[b], #120]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #112]\n\t" - "strd r4, r5, [%[a], #120]\n\t" - "ldrd r2, r3, [%[a], #128]\n\t" - "ldrd r4, r5, [%[a], #136]\n\t" - "ldrd r6, r7, [%[b], #128]\n\t" - "ldrd r8, r9, [%[b], #136]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #128]\n\t" - "strd r4, r5, [%[a], #136]\n\t" - "ldrd r2, r3, [%[a], #144]\n\t" - "ldrd r4, r5, [%[a], #152]\n\t" - "ldrd r6, r7, [%[b], #144]\n\t" - "ldrd r8, r9, [%[b], #152]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #144]\n\t" - "strd r4, r5, [%[a], #152]\n\t" - "ldrd r2, r3, [%[a], #160]\n\t" - "ldrd r4, r5, [%[a], #168]\n\t" - "ldrd r6, r7, [%[b], #160]\n\t" - "ldrd r8, r9, [%[b], #168]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #160]\n\t" - "strd r4, r5, [%[a], #168]\n\t" - "ldrd r2, r3, [%[a], #176]\n\t" - "ldrd r4, r5, [%[a], #184]\n\t" - "ldrd r6, r7, [%[b], #176]\n\t" - "ldrd r8, r9, [%[b], #184]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #176]\n\t" - "strd r4, r5, [%[a], #184]\n\t" - "ldrd r2, r3, [%[a], #192]\n\t" - "ldrd r4, r5, [%[a], #200]\n\t" - "ldrd r6, r7, [%[b], #192]\n\t" - "ldrd r8, r9, [%[b], #200]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #192]\n\t" - "strd r4, r5, [%[a], #200]\n\t" - "ldrd r2, r3, [%[a], #208]\n\t" - "ldrd r4, r5, [%[a], #216]\n\t" - "ldrd r6, r7, [%[b], #208]\n\t" - "ldrd r8, r9, [%[b], #216]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #208]\n\t" - "strd r4, r5, [%[a], #216]\n\t" - "ldrd r2, r3, [%[a], #224]\n\t" - "ldrd r4, r5, [%[a], #232]\n\t" - "ldrd r6, r7, [%[b], #224]\n\t" - "ldrd r8, r9, [%[b], #232]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #224]\n\t" - "strd r4, r5, [%[a], #232]\n\t" - "ldrd r2, r3, [%[a], #240]\n\t" - "ldrd r4, r5, [%[a], #248]\n\t" - "ldrd r6, r7, [%[b], #240]\n\t" - "ldrd r8, r9, [%[b], #248]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #240]\n\t" - "strd r4, r5, [%[a], #248]\n\t" - "ldrd r2, r3, [%[a], #256]\n\t" - "ldrd r4, r5, [%[a], #264]\n\t" - "ldrd r6, r7, [%[b], #256]\n\t" - "ldrd r8, r9, [%[b], #264]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #256]\n\t" - "strd r4, r5, [%[a], #264]\n\t" - "ldrd r2, r3, [%[a], #272]\n\t" - "ldrd r4, r5, [%[a], #280]\n\t" - "ldrd r6, r7, [%[b], #272]\n\t" - "ldrd r8, r9, [%[b], #280]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #272]\n\t" - "strd r4, r5, [%[a], #280]\n\t" - "ldrd r2, r3, [%[a], #288]\n\t" - "ldrd r4, r5, [%[a], #296]\n\t" - "ldrd r6, r7, [%[b], #288]\n\t" - "ldrd r8, r9, [%[b], #296]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - 
"sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #288]\n\t" - "strd r4, r5, [%[a], #296]\n\t" - "ldrd r2, r3, [%[a], #304]\n\t" - "ldrd r4, r5, [%[a], #312]\n\t" - "ldrd r6, r7, [%[b], #304]\n\t" - "ldrd r8, r9, [%[b], #312]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #304]\n\t" - "strd r4, r5, [%[a], #312]\n\t" - "ldrd r2, r3, [%[a], #320]\n\t" - "ldrd r4, r5, [%[a], #328]\n\t" - "ldrd r6, r7, [%[b], #320]\n\t" - "ldrd r8, r9, [%[b], #328]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #320]\n\t" - "strd r4, r5, [%[a], #328]\n\t" - "ldrd r2, r3, [%[a], #336]\n\t" - "ldrd r4, r5, [%[a], #344]\n\t" - "ldrd r6, r7, [%[b], #336]\n\t" - "ldrd r8, r9, [%[b], #344]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #336]\n\t" - "strd r4, r5, [%[a], #344]\n\t" - "ldrd r2, r3, [%[a], #352]\n\t" - "ldrd r4, r5, [%[a], #360]\n\t" - "ldrd r6, r7, [%[b], #352]\n\t" - "ldrd r8, r9, [%[b], #360]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #352]\n\t" - "strd r4, r5, [%[a], #360]\n\t" - "ldrd r2, r3, [%[a], #368]\n\t" - "ldrd r4, r5, [%[a], #376]\n\t" - "ldrd r6, r7, [%[b], #368]\n\t" - "ldrd r8, r9, [%[b], #376]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "strd r2, r3, [%[a], #368]\n\t" - "strd r4, r5, [%[a], #376]\n\t" - "sbc %[c], r9, r9\n\t" - : [c] "+r" (c) - : [a] "r" (a), [b] "r" (b) - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, +static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; __asm__ __volatile__ ( - "mov r14, #0\n\t" "ldrd r3, r4, [%[a], #0]\n\t" "ldrd r5, r6, [%[a], #8]\n\t" "ldrd r7, r8, [%[b], #0]\n\t" "ldrd r9, r10, [%[b], #8]\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #0]\n\t" "strd r5, r6, [%[r], #8]\n\t" "ldrd r3, r4, [%[a], #16]\n\t" "ldrd r5, r6, [%[a], #24]\n\t" "ldrd r7, r8, [%[b], #16]\n\t" "ldrd r9, r10, [%[b], #24]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #16]\n\t" "strd r5, r6, [%[r], #24]\n\t" "ldrd r3, r4, [%[a], #32]\n\t" "ldrd r5, r6, [%[a], #40]\n\t" "ldrd r7, r8, [%[b], #32]\n\t" "ldrd r9, r10, [%[b], #40]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #32]\n\t" "strd r5, r6, [%[r], #40]\n\t" "ldrd r3, r4, [%[a], #48]\n\t" "ldrd r5, r6, [%[a], #56]\n\t" "ldrd r7, r8, [%[b], #48]\n\t" "ldrd r9, r10, [%[b], #56]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #48]\n\t" "strd r5, r6, [%[r], #56]\n\t" "ldrd r3, r4, [%[a], #64]\n\t" "ldrd r5, r6, [%[a], #72]\n\t" "ldrd r7, r8, [%[b], #64]\n\t" "ldrd r9, r10, [%[b], #72]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #64]\n\t" "strd r5, r6, [%[r], #72]\n\t" "ldrd r3, r4, [%[a], #80]\n\t" "ldrd r5, r6, [%[a], #88]\n\t" "ldrd r7, r8, [%[b], #80]\n\t" "ldrd r9, r10, [%[b], #88]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #80]\n\t" "strd r5, r6, [%[r], #88]\n\t" "ldrd r3, r4, [%[a], #96]\n\t" "ldrd r5, r6, [%[a], #104]\n\t" "ldrd r7, r8, [%[b], #96]\n\t" "ldrd r9, r10, [%[b], #104]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #96]\n\t" "strd r5, r6, [%[r], #104]\n\t" "ldrd r3, r4, [%[a], #112]\n\t" "ldrd r5, r6, [%[a], #120]\n\t" "ldrd r7, r8, [%[b], #112]\n\t" "ldrd r9, r10, [%[b], #120]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #112]\n\t" "strd r5, r6, [%[r], #120]\n\t" "ldrd r3, r4, [%[a], #128]\n\t" "ldrd r5, r6, [%[a], #136]\n\t" "ldrd r7, r8, [%[b], #128]\n\t" "ldrd r9, r10, [%[b], #136]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, 
r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #128]\n\t" "strd r5, r6, [%[r], #136]\n\t" "ldrd r3, r4, [%[a], #144]\n\t" "ldrd r5, r6, [%[a], #152]\n\t" "ldrd r7, r8, [%[b], #144]\n\t" "ldrd r9, r10, [%[b], #152]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #144]\n\t" "strd r5, r6, [%[r], #152]\n\t" "ldrd r3, r4, [%[a], #160]\n\t" "ldrd r5, r6, [%[a], #168]\n\t" "ldrd r7, r8, [%[b], #160]\n\t" "ldrd r9, r10, [%[b], #168]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #160]\n\t" "strd r5, r6, [%[r], #168]\n\t" "ldrd r3, r4, [%[a], #176]\n\t" "ldrd r5, r6, [%[a], #184]\n\t" "ldrd r7, r8, [%[b], #176]\n\t" "ldrd r9, r10, [%[b], #184]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" "strd r3, r4, [%[r], #176]\n\t" "strd r5, r6, [%[r], #184]\n\t" - "ldrd r3, r4, [%[a], #192]\n\t" - "ldrd r5, r6, [%[a], #200]\n\t" - "ldrd r7, r8, [%[b], #192]\n\t" - "ldrd r9, r10, [%[b], #200]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #192]\n\t" - "strd r5, r6, [%[r], #200]\n\t" - "ldrd r3, r4, [%[a], #208]\n\t" - "ldrd r5, r6, [%[a], #216]\n\t" - "ldrd r7, r8, [%[b], #208]\n\t" - "ldrd r9, r10, [%[b], #216]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #208]\n\t" - "strd r5, r6, [%[r], #216]\n\t" - "ldrd r3, r4, [%[a], #224]\n\t" - "ldrd r5, r6, [%[a], #232]\n\t" - "ldrd r7, r8, [%[b], #224]\n\t" - "ldrd r9, r10, [%[b], #232]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #224]\n\t" - "strd r5, r6, [%[r], #232]\n\t" - "ldrd r3, r4, [%[a], #240]\n\t" - "ldrd r5, r6, [%[a], #248]\n\t" - "ldrd r7, r8, [%[b], #240]\n\t" - "ldrd r9, r10, [%[b], #248]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #240]\n\t" - "strd r5, r6, [%[r], #248]\n\t" - "ldrd r3, r4, [%[a], #256]\n\t" - "ldrd r5, r6, [%[a], #264]\n\t" - "ldrd r7, r8, [%[b], #256]\n\t" - "ldrd r9, r10, [%[b], #264]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #256]\n\t" - "strd r5, r6, [%[r], #264]\n\t" - "ldrd r3, r4, [%[a], #272]\n\t" - "ldrd r5, r6, [%[a], #280]\n\t" - "ldrd r7, r8, [%[b], #272]\n\t" - "ldrd r9, r10, [%[b], #280]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #272]\n\t" - "strd r5, r6, [%[r], #280]\n\t" - "ldrd r3, r4, [%[a], #288]\n\t" - "ldrd r5, r6, [%[a], #296]\n\t" - "ldrd r7, r8, [%[b], #288]\n\t" - "ldrd r9, r10, [%[b], #296]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #288]\n\t" - "strd r5, r6, [%[r], #296]\n\t" - "ldrd r3, r4, [%[a], #304]\n\t" - "ldrd r5, r6, [%[a], #312]\n\t" - "ldrd r7, r8, [%[b], #304]\n\t" - "ldrd r9, r10, [%[b], #312]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, 
r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #304]\n\t" - "strd r5, r6, [%[r], #312]\n\t" - "ldrd r3, r4, [%[a], #320]\n\t" - "ldrd r5, r6, [%[a], #328]\n\t" - "ldrd r7, r8, [%[b], #320]\n\t" - "ldrd r9, r10, [%[b], #328]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #320]\n\t" - "strd r5, r6, [%[r], #328]\n\t" - "ldrd r3, r4, [%[a], #336]\n\t" - "ldrd r5, r6, [%[a], #344]\n\t" - "ldrd r7, r8, [%[b], #336]\n\t" - "ldrd r9, r10, [%[b], #344]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #336]\n\t" - "strd r5, r6, [%[r], #344]\n\t" - "ldrd r3, r4, [%[a], #352]\n\t" - "ldrd r5, r6, [%[a], #360]\n\t" - "ldrd r7, r8, [%[b], #352]\n\t" - "ldrd r9, r10, [%[b], #360]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #352]\n\t" - "strd r5, r6, [%[r], #360]\n\t" - "ldrd r3, r4, [%[a], #368]\n\t" - "ldrd r5, r6, [%[a], #376]\n\t" - "ldrd r7, r8, [%[b], #368]\n\t" - "ldrd r9, r10, [%[b], #376]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "strd r3, r4, [%[r], #368]\n\t" - "strd r5, r6, [%[r], #376]\n\t" - "adc %[c], r14, r14\n\t" + "sbc %[c], %[c], #0\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return c; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<48; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 48; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[96]; - sp_digit a1[48]; - sp_digit b1[48]; - sp_digit z2[96]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_48(a1, a, &a[48]); - cb = sp_3072_add_48(b1, b, &b[48]); - u = ca & cb; - sp_3072_mul_48(z1, a1, b1); - sp_3072_mul_48(z2, &a[48], &b[48]); - sp_3072_mul_48(z0, a, b); - sp_3072_mask_48(r + 96, a1, 0 - cb); - sp_3072_mask_48(b1, b1, 0 - ca); - u += sp_3072_add_48(r + 96, r + 96, b1); - u += sp_3072_sub_in_place_96(z1, z2); - u += sp_3072_sub_in_place_96(z1, z0); - u += sp_3072_add_96(r + 48, r + 48, z1); - r[144] = u; - XMEMSET(r + 144 + 1, 0, sizeof(sp_digit) * (48 - 1)); - (void)sp_3072_add_96(r + 96, r + 96, z2); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -11363,23 +11961,32 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[96]; + sp_digit* z2 = r + 96; sp_digit z1[96]; - sp_digit a1[48]; + sp_digit* a1 = z1; + sp_digit zero[48]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 48); + + mask = sp_3072_sub_48(a1, a, &a[48]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_48(a1, p1, p2); - u = sp_3072_add_48(a1, a, &a[48]); - sp_3072_sqr_48(z1, a1); sp_3072_sqr_48(z2, &a[48]); sp_3072_sqr_48(z0, a); - sp_3072_mask_48(r + 96, a1, 0 - u); - u += sp_3072_add_48(r + 96, r + 96, r + 96); - u += sp_3072_sub_in_place_96(z1, z2); - u += sp_3072_sub_in_place_96(z1, z0); - u += sp_3072_add_96(r + 48, r + 48, z1); - r[144] = u; - XMEMSET(r + 144 + 1, 0, sizeof(sp_digit) * (48 - 1)); - (void)sp_3072_add_96(r + 96, r + 96, z2); + sp_3072_sqr_48(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_96(z1, z2); + u -= sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(r + 48, z1); + zero[0] = u; + (void)sp_3072_add_48(r + 144, r + 144, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -13262,7 +13869,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_48(r, a, b); @@ -13276,7 +13883,7 @@ static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_48(r, a); @@ -13664,11 +14271,11 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -14325,7 +14932,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { + for (i = 47; i >= 0; i--) { sp_digit hi = t1[48 + i] - (t1[48 + i] == div); r1 = div_3072_word_48(hi, t1[48 + i - 1], div); @@ -15992,7 +16599,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_96(r, a, b); @@ -16006,7 +16613,7 @@ static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_96(r, a); @@ -16313,11 +16920,11 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -16395,9 +17002,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + for (i = 95; i >= 0; i--) { + if (t1[96 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_3072_word_96(t1[96 + i], t1[96 + i - 1], div); + } sp_3072_mul_d_96(t2, d, r1); t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); @@ -17594,7 +18205,7 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { + for (i = 95; i >= 0; i--) { sp_digit hi = t1[96 + i] - (t1[96 + i] == div); r1 = div_3072_word_96(hi, t1[96 + i - 1], div); @@ -17981,9 +18592,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 96; r = a + 96 * 2; m = r + 96 * 2; - ah = a + 96; sp_3072_from_bin(ah, 96, in, inLen); #if DIGIT_BIT >= 32 @@ -18001,7 +18612,38 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_3072_from_mp(m, 96, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 96); + err = sp_3072_mod_96_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_96(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_96(r, r, ah, m, mp); + + for (i = 95; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_3072_sub_in_place_96(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_3072_sqr_96(r, ah); err = sp_3072_mod_96_cond(r, r, m); @@ -18029,7 +18671,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 96); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_3072_mont_sqr_96(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_3072_mont_mul_96(r, r, a, m, mp); @@ -18064,6 +18706,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -18077,29 +18720,46 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_dig { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #192\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #192\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -18268,15 +18928,15 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_dig "adcs r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "strd r4, r5, [%[r], #184]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ /* RSA private key operation. * @@ -20272,7 +20932,7 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, sp_digit z1[128]; sp_digit a1[64]; sp_digit b1[64]; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit u; sp_digit ca; sp_digit cb; @@ -20280,18 +20940,22 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, ca = sp_2048_add_64(a1, a, &a[64]); cb = sp_2048_add_64(b1, b, &b[64]); u = ca & cb; - sp_2048_mul_64(z1, a1, b1); + sp_2048_mul_64(z2, &a[64], &b[64]); sp_2048_mul_64(z0, a, b); - sp_2048_mask_64(r + 128, a1, 0 - cb); - sp_2048_mask_64(b1, b1, 0 - ca); - u += sp_2048_add_64(r + 128, r + 128, b1); - u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mul_64(z1, a1, b1); + u += sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mask_64(a1, a1, 0 - cb); + u += sp_2048_add_64(z1 + 64, z1 + 64, a1); + sp_2048_mask_64(b1, b1, 0 - ca); + u += sp_2048_add_64(z1 + 64, z1 + 64, b1); + u += sp_4096_add_128(r + 64, r + 64, z1); - r[192] = u; - XMEMSET(r + 192 + 1, 0, sizeof(sp_digit) * (64 - 1)); - (void)sp_4096_add_128(r + 128, r + 128, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (64 - 1)); + a1[0] = u; + (void)sp_4096_add_64(r + 192, r + 192, a1); } /* Square a and put result in r. 
(r = a * a) @@ -20302,23 +20966,32 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit z1[128]; - sp_digit a1[64]; + sp_digit* a1 = z1; + sp_digit zero[64]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 64); + + mask = sp_2048_sub_64(a1, a, &a[64]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_64(a1, p1, p2); - u = sp_2048_add_64(a1, a, &a[64]); - sp_2048_sqr_64(z1, a1); sp_2048_sqr_64(z2, &a[64]); sp_2048_sqr_64(z0, a); - sp_2048_mask_64(r + 128, a1, 0 - u); - u += sp_2048_add_64(r + 128, r + 128, r + 128); - u += sp_4096_sub_in_place_128(z1, z2); - u += sp_4096_sub_in_place_128(z1, z0); - u += sp_4096_add_128(r + 64, r + 64, z1); - r[192] = u; - XMEMSET(r + 192 + 1, 0, sizeof(sp_digit) * (64 - 1)); - (void)sp_4096_add_128(r + 128, r + 128, z2); + sp_2048_sqr_64(z1, a1); + + u = 0; + u -= sp_4096_sub_in_place_128(z1, z2); + u -= sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(r + 64, z1); + zero[0] = u; + (void)sp_2048_add_64(r + 192, r + 192, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -23197,7 +23870,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_128(r, a, b); @@ -23211,7 +23884,7 @@ static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_128(r, a); @@ -23598,11 +24271,11 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
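The reworked sp_3072_sqr_96() and sp_4096_sqr_128() above form the middle Karatsuba term from the square of |a_low - a_high|, selecting the operands of the final subtraction through an all-ones/zero mask rather than a branch. A word-level sketch of that branch-free absolute difference (the word size and the way the mask is derived are illustrative; the real code operates on whole 48- and 64-word arrays and takes the mask from the borrow returned by the assembly subtract):

#include <stdint.h>
#include <stdio.h>

/* Branch-free absolute difference of two words.  "mask" plays the role of
 * the borrow returned by sp_2048_sub_64()/sp_3072_sub_48(): all ones when
 * x < y, zero otherwise. */
static uint64_t absdiff_ct(uint64_t x, uint64_t y)
{
    uint64_t d = x - y;
    uint64_t mask = (uint64_t)0 - (uint64_t)(x < y);  /* 0 or all ones */
    /* keep d when mask == 0, take -d (= ~d + 1) when mask == all ones */
    return (d ^ mask) - mask;
}

int main(void)
{
    printf("%llu %llu\n",
           (unsigned long long)absdiff_ct(7, 19),    /* 12 */
           (unsigned long long)absdiff_ct(19, 7));   /* 12 */
    return 0;
}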
@@ -23680,9 +24353,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + for (i = 127; i >= 0; i--) { + if (t1[128 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_4096_word_128(t1[128 + i], t1[128 + i - 1], div); + } sp_4096_mul_d_128(t2, d, r1); t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); @@ -25231,7 +25908,7 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { + for (i = 127; i >= 0; i--) { sp_digit hi = t1[128 + i] - (t1[128 + i] == div); r1 = div_4096_word_128(hi, t1[128 + i - 1], div); @@ -25618,9 +26295,9 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 128; r = a + 128 * 2; m = r + 128 * 2; - ah = a + 128; sp_4096_from_bin(ah, 128, in, inLen); #if DIGIT_BIT >= 32 @@ -25638,7 +26315,38 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_4096_from_mp(m, 128, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 128); + err = sp_4096_mod_128_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_128(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_128(r, r, ah, m, mp); + + for (i = 127; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_4096_sub_in_place_128(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_4096_sqr_128(r, ah); err = sp_4096_mod_128_cond(r, r, m); @@ -25666,7 +26374,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 128); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_4096_mont_sqr_128(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_4096_mont_mul_128(r, r, a, m, mp); @@ -25701,6 +26409,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -25714,29 +26423,46 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_dig { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #256\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #256\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. 
+ * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -25961,15 +26687,15 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_dig "adcs r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "strd r4, r5, [%[r], #248]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ /* RSA private key operation. * @@ -34761,11 +35487,11 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -34869,7 +35595,7 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit div = d[7]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 8); - for (i=7; i>=0; i--) { + for (i = 7; i >= 0; i--) { sp_digit hi = t1[8 + i] - (t1[8 + i] == div); r1 = div_256_word_8(hi, t1[8 + i - 1], div); @@ -39270,7 +39996,7 @@ SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_12(r, a, b); @@ -39284,7 +40010,7 @@ static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_12(r, a); @@ -39681,6 +40407,7 @@ static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a, const sp_digit* m sp_384_cond_sub_12(r, r, m, 0 - o); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -39694,29 +40421,46 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #48\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #48\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -39759,15 +40503,15 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi "adcs r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "strd r4, r5, [%[r], #40]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -43876,11 +44620,11 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -43988,7 +44732,7 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi div = d[11]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 12); - for (i=11; i>=0; i--) { + for (i = 11; i >= 0; i--) { sp_digit hi = t1[12 + i] - (t1[12 + i] == div); r1 = div_384_word_12(hi, t1[12 + i - 1], div); @@ -50272,7 +51016,7 @@ SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_521_mul_17(r, a, b); @@ -50286,7 +51030,7 @@ static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_521_sqr_17(r, a); @@ -50967,6 +51711,7 @@ static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a, const sp_digit* m ); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -50980,29 +51725,46 @@ static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a, const sp_digi { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #68\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #68\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -51064,15 +51826,15 @@ static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a, const sp_digi "and r6, r6, %[m]\n\t" "adcs r4, r4, r6\n\t" "str r4, [%[r], #64]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -56376,11 +57138,11 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
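As the comment above says, div_521_word_17() and the other div_*_word_* helpers return an estimate that can be one larger than the true quotient digit; the callers multiply the estimate back and let the resulting borrow drive a correction step. A toy version of that fix-up with 8-bit digits (purely illustrative; the overshoot is forced by hand to show the worst case the comment allows):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t rem = 0x01fd;          /* current remainder window (d1|d0) */
    uint8_t  div = 0x55;            /* divisor digit                    */
    uint8_t  q   = rem / div + 1;   /* pretend the estimate overshot    */

    int32_t t = (int32_t)rem - (int32_t)q * div;
    if (t < 0) {                    /* overshoot shows up as a borrow   */
        t += div;                   /* ... so add the divisor back      */
        q--;                        /* and drop the estimate by one     */
    }
    printf("q = 0x%02x, remainder = 0x%02x\n", q, (uint8_t)t);
    return 0;
}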
@@ -56498,7 +57260,7 @@ static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digi sp_521_lshift_17(sd, d, 23); sp_521_lshift_34(t1, t1, 23); - for (i=16; i>=0; i--) { + for (i = 16; i >= 0; i--) { sp_digit hi = t1[17 + i] - (t1[17 + i] == div); r1 = div_521_word_17(hi, t1[17 + i - 1], div); @@ -61640,7 +62402,7 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit u; sp_digit ca; sp_digit cb; @@ -61648,18 +62410,83 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, ca = sp_1024_add_16(a1, a, &a[16]); cb = sp_1024_add_16(b1, b, &b[16]); u = ca & cb; - sp_1024_mul_16(z1, a1, b1); + sp_1024_mul_16(z2, &a[16], &b[16]); sp_1024_mul_16(z0, a, b); - sp_1024_mask_16(r + 32, a1, 0 - cb); - sp_1024_mask_16(b1, b1, 0 - ca); - u += sp_1024_add_16(r + 32, r + 32, b1); - u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mul_16(z1, a1, b1); + u += sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mask_16(a1, a1, 0 - cb); + u += sp_1024_add_16(z1 + 16, z1 + 16, a1); + sp_1024_mask_16(b1, b1, 0 - ca); + u += sp_1024_add_16(z1 + 16, z1 + 16, b1); + u += sp_1024_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_1024_add_32(r + 32, r + 32, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (16 - 1)); + a1[0] = u; + (void)sp_1024_add_16(r + 48, r + 48, a1); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldrd r3, r4, [%[a], #0]\n\t" + "ldrd r5, r6, [%[a], #8]\n\t" + "ldrd r7, r8, [%[b], #0]\n\t" + "ldrd r9, r10, [%[b], #8]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #0]\n\t" + "strd r5, r6, [%[r], #8]\n\t" + "ldrd r3, r4, [%[a], #16]\n\t" + "ldrd r5, r6, [%[a], #24]\n\t" + "ldrd r7, r8, [%[b], #16]\n\t" + "ldrd r9, r10, [%[b], #24]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #16]\n\t" + "strd r5, r6, [%[r], #24]\n\t" + "ldrd r3, r4, [%[a], #32]\n\t" + "ldrd r5, r6, [%[a], #40]\n\t" + "ldrd r7, r8, [%[b], #32]\n\t" + "ldrd r9, r10, [%[b], #40]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #40]\n\t" + "ldrd r3, r4, [%[a], #48]\n\t" + "ldrd r5, r6, [%[a], #56]\n\t" + "ldrd r7, r8, [%[b], #48]\n\t" + "ldrd r9, r10, [%[b], #56]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "strd r3, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #56]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; } /* Square a and put result in r. 
(r = a * a) @@ -61670,23 +62497,32 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit z1[32]; - sp_digit a1[16]; + sp_digit* a1 = z1; + sp_digit zero[16]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_1024_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_1024_sub_16(a1, p1, p2); - u = sp_1024_add_16(a1, a, &a[16]); - sp_1024_sqr_16(z1, a1); sp_1024_sqr_16(z2, &a[16]); sp_1024_sqr_16(z0, a); - sp_1024_mask_16(r + 32, a1, 0 - u); - u += sp_1024_add_16(r + 32, r + 32, r + 32); - u += sp_1024_sub_in_place_32(z1, z2); - u += sp_1024_sub_in_place_32(z1, z0); - u += sp_1024_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_1024_add_32(r + 32, r + 32, z2); + sp_1024_sqr_16(z1, a1); + + u = 0; + u -= sp_1024_sub_in_place_32(z1, z2); + u -= sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(r + 16, z1); + zero[0] = u; + (void)sp_1024_add_16(r + 48, r + 48, zero); } #else @@ -62410,11 +63246,11 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -62925,7 +63761,7 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_1024_word_32(hi, t1[32 + i - 1], div); @@ -63559,7 +64395,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_32(r, a, b); @@ -63573,7 +64409,7 @@ static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_32(r, a); @@ -64481,6 +65317,7 @@ static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* ); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -64494,29 +65331,46 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig { sp_digit c = 0; -#ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov r9, #0\n\t" - "mov r8, #0\n\t" + "mov r7, #0\n\t" + "mov r6, #0\n\t" "1:\n\t" "adds %[c], %[c], #-1\n\t" - "ldr r4, [%[a], r8]\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adcs r4, r4, r6\n\t" - "adc %[c], r9, r9\n\t" - "str r4, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, #128\n\t" + "ldr r4, [%[a], r6]\n\t" + "ldr r5, [%[b], r6]\n\t" + "and r5, r5, %[m]\n\t" + "adcs r4, r4, r5\n\t" + "adc %[c], r7, r7\n\t" + "str r4, [%[r], r6]\n\t" + "add r6, r6, #4\n\t" + "cmp r6, #128\n\t" "blt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7" ); -#else + + return c; +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + sp_digit c = 0; + __asm__ __volatile__ ( - "mov r9, #0\n\t" + "mov r8, #0\n\t" "ldrd r4, r5, [%[a], #0]\n\t" "ldrd r6, r7, [%[b], #0]\n\t" "and r6, r6, %[m]\n\t" @@ -64629,15 +65483,15 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "adcs r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "strd r4, r5, [%[r], #120]\n\t" - "adc %[c], r9, r9\n\t" + "adc %[c], r8, r8\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8" ); -#endif /* WOLFSSL_SP_SMALL */ return c; } +#endif /* !WOLFSSL_SP_SMALL */ static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a) { diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index bda5f232b..89d1bab2c 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -48,19 +48,19 @@ #include #ifdef WOLFSSL_SP_ARM64_ASM -#define SP_PRINT_NUM(var, name, total, words, bits) \ - do { \ - int ii; \ - fprintf(stderr, name "=0x"); \ - for (ii = words - 1; ii >= 0; ii--) \ - fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ - fprintf(stderr, "\n"); \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + fprintf(stderr, name "=0x"); \ + for (ii = ((bits + 63) / 64) - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ } while (0) -#define SP_PRINT_VAL(var, name) \ +#define SP_PRINT_VAL(var, name) \ fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) -#define SP_PRINT_INT(var, name) \ +#define SP_PRINT_INT(var, name) \ fprintf(stderr, name "=%d\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) @@ -74,41 +74,74 @@ */ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 
- 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Convert an mp_int to an array of sp_digit. @@ -207,15 +240,15 @@ static void sp_2048_to_bin_32(sp_digit* r, byte* a) int i; int j = 0; - for (i = 31; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 31; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -651,249 +684,6 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) ); } -/* Square a and put result in r. (r = a * a) - * - * All registers version. - * - * r A single precision integer. - * a A single precision integer. 
- */ -static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldp x21, x22, [%[a], 0]\n\t" - "ldp x23, x24, [%[a], 16]\n\t" - "ldp x25, x26, [%[a], 32]\n\t" - "ldp x27, x28, [%[a], 48]\n\t" - "# A[0] * A[1]\n\t" - "mul x6, x21, x22\n\t" - "umulh x7, x21, x22\n\t" - "# A[0] * A[2]\n\t" - "mul x4, x21, x23\n\t" - "umulh x5, x21, x23\n\t" - "adds x7, x7, x4\n\t" - "# A[0] * A[3]\n\t" - "mul x4, x21, x24\n\t" - "adc x8, xzr, x5\n\t" - "umulh x5, x21, x24\n\t" - "adds x8, x8, x4\n\t" - "# A[1] * A[2]\n\t" - "mul x4, x22, x23\n\t" - "adc x9, xzr, x5\n\t" - "umulh x5, x22, x23\n\t" - "adds x8, x8, x4\n\t" - "# A[0] * A[4]\n\t" - "mul x4, x21, x25\n\t" - "adcs x9, x9, x5\n\t" - "umulh x5, x21, x25\n\t" - "adc x10, xzr, xzr\n\t" - "adds x9, x9, x4\n\t" - "# A[1] * A[3]\n\t" - "mul x4, x22, x24\n\t" - "adc x10, x10, x5\n\t" - "umulh x5, x22, x24\n\t" - "adds x9, x9, x4\n\t" - "# A[0] * A[5]\n\t" - "mul x4, x21, x26\n\t" - "adcs x10, x10, x5\n\t" - "umulh x5, x21, x26\n\t" - "adc x11, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[1] * A[4]\n\t" - "mul x4, x22, x25\n\t" - "adc x11, x11, x5\n\t" - "umulh x5, x22, x25\n\t" - "adds x10, x10, x4\n\t" - "# A[2] * A[3]\n\t" - "mul x4, x23, x24\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x23, x24\n\t" - "adc x12, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[0] * A[6]\n\t" - "mul x4, x21, x27\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x21, x27\n\t" - "adc x12, x12, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * A[5]\n\t" - "mul x4, x22, x26\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x22, x26\n\t" - "adc x13, xzr, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[2] * A[4]\n\t" - "mul x4, x23, x25\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x23, x25\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[0] * A[7]\n\t" - "mul x4, x21, x28\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x21, x28\n\t" - "adc x13, x13, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[1] * A[6]\n\t" - "mul x4, x22, x27\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x22, x27\n\t" - "adc x14, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * A[5]\n\t" - "mul x4, x23, x26\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x23, x26\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[3] * A[4]\n\t" - "mul x4, x24, x25\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x24, x25\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[1] * A[7]\n\t" - "mul x4, x22, x28\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x22, x28\n\t" - "adc x14, x14, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[2] * A[6]\n\t" - "mul x4, x23, x27\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x23, x27\n\t" - "adc x15, xzr, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[3] * A[5]\n\t" - "mul x4, x24, x26\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x24, x26\n\t" - "adc x15, x15, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[2] * A[7]\n\t" - "mul x4, x23, x28\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x23, x28\n\t" - "adc x15, x15, xzr\n\t" - "adds x14, x14, x4\n\t" - "# A[3] * A[6]\n\t" - "mul x4, x24, x27\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x24, x27\n\t" - "adc x16, xzr, xzr\n\t" - "adds x14, x14, x4\n\t" - "# A[4] * A[5]\n\t" - "mul x4, x25, x26\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x25, x26\n\t" - "adc x16, x16, xzr\n\t" - "adds x14, x14, x4\n\t" - "# A[3] * A[7]\n\t" - "mul x4, x24, x28\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x24, x28\n\t" - "adc x16, x16, xzr\n\t" - "adds x15, x15, x4\n\t" - "# A[4] * A[6]\n\t" - "mul x4, x25, x27\n\t" - "adcs x16, x16, x5\n\t" - "umulh x5, 
x25, x27\n\t" - "adc x17, xzr, xzr\n\t" - "adds x15, x15, x4\n\t" - "# A[4] * A[7]\n\t" - "mul x4, x25, x28\n\t" - "adcs x16, x16, x5\n\t" - "umulh x5, x25, x28\n\t" - "adc x17, x17, xzr\n\t" - "adds x16, x16, x4\n\t" - "# A[5] * A[6]\n\t" - "mul x4, x26, x27\n\t" - "adcs x17, x17, x5\n\t" - "umulh x5, x26, x27\n\t" - "adc x19, xzr, xzr\n\t" - "adds x16, x16, x4\n\t" - "# A[5] * A[7]\n\t" - "mul x4, x26, x28\n\t" - "adcs x17, x17, x5\n\t" - "umulh x5, x26, x28\n\t" - "adc x19, x19, xzr\n\t" - "adds x17, x17, x4\n\t" - "# A[6] * A[7]\n\t" - "mul x4, x27, x28\n\t" - "adcs x19, x19, x5\n\t" - "umulh x5, x27, x28\n\t" - "adc x20, xzr, xzr\n\t" - "adds x19, x19, x4\n\t" - "adc x20, x20, x5\n\t" - "# Double\n\t" - "adds x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adcs x13, x13, x13\n\t" - "adcs x14, x14, x14\n\t" - "adcs x15, x15, x15\n\t" - "adcs x16, x16, x16\n\t" - "adcs x17, x17, x17\n\t" - "adcs x19, x19, x19\n\t" - "# A[0] * A[0]\n\t" - "mul x5, x21, x21\n\t" - "adcs x20, x20, x20\n\t" - "umulh x2, x21, x21\n\t" - "cset x21, cs\n\t" - "# A[1] * A[1]\n\t" - "mul x3, x22, x22\n\t" - "adds x6, x6, x2\n\t" - "umulh x4, x22, x22\n\t" - "adcs x7, x7, x3\n\t" - "# A[2] * A[2]\n\t" - "mul x2, x23, x23\n\t" - "adcs x8, x8, x4\n\t" - "umulh x3, x23, x23\n\t" - "adcs x9, x9, x2\n\t" - "# A[3] * A[3]\n\t" - "mul x4, x24, x24\n\t" - "adcs x10, x10, x3\n\t" - "umulh x2, x24, x24\n\t" - "adcs x11, x11, x4\n\t" - "# A[4] * A[4]\n\t" - "mul x3, x25, x25\n\t" - "adcs x12, x12, x2\n\t" - "umulh x4, x25, x25\n\t" - "adcs x13, x13, x3\n\t" - "# A[5] * A[5]\n\t" - "mul x2, x26, x26\n\t" - "adcs x14, x14, x4\n\t" - "umulh x3, x26, x26\n\t" - "adcs x15, x15, x2\n\t" - "# A[6] * A[6]\n\t" - "mul x4, x27, x27\n\t" - "adcs x16, x16, x3\n\t" - "umulh x2, x27, x27\n\t" - "adcs x17, x17, x4\n\t" - "# A[7] * A[7]\n\t" - "mul x3, x28, x28\n\t" - "adcs x19, x19, x2\n\t" - "umulh x4, x28, x28\n\t" - "adcs x20, x20, x3\n\t" - "stp x5, x6, [%[r], 0]\n\t" - "adc x21, x21, x4\n\t" - "stp x7, x8, [%[r], 16]\n\t" - "stp x9, x10, [%[r], 32]\n\t" - "stp x11, x12, [%[r], 48]\n\t" - "stp x13, x14, [%[r], 64]\n\t" - "stp x15, x16, [%[r], 80]\n\t" - "stp x17, x19, [%[r], 96]\n\t" - "stp x20, x21, [%[r], 112]\n\t" - : - : [r] "r" (r), [a] "r" (a) - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -933,6 +723,38 @@ static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, return (sp_digit)r; } +/* Add digit to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static void sp_2048_add_word_8(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 48]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6" + ); +} + /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -1049,63 +871,57 @@ static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* AND m into each word of a and store in r. +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. */ -static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Add digit to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static void sp_2048_add_zero_8(sp_digit* r, const sp_digit* a, - const sp_digit d) +static sp_digit sp_2048_cond_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) { __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 48]\n\t" - : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) - : "memory", "x3", "x4", "x5", "x6" + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); + + return (sp_digit)r; } +#endif /* !WOLFSSL_SP_SMALL */ /* Multiply a and b into r. 
(r = a * b) * @@ -1120,119 +936,74 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, sp_digit z1[16]; sp_digit a1[8]; sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u, ca, cb; + sp_digit* z2 = r + 16; + sp_digit u; + sp_digit ca; + sp_digit cb; ca = sp_2048_add_8(a1, a, &a[8]); cb = sp_2048_add_8(b1, b, &b[8]); u = ca & cb; - sp_2048_mul_8(z1, a1, b1); + sp_2048_mul_8(z2, &a[8], &b[8]); sp_2048_mul_8(z0, a, b); - sp_2048_mask_8(r + 16, a1, 0 - cb); - sp_2048_mask_8(b1, b1, 0 - ca); - u += sp_2048_add_8(r + 16, r + 16, b1); - u += sp_2048_sub_in_place_16(z1, z2); + sp_2048_mul_8(z1, a1, b1); + u += sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(z1, z2); + u += sp_2048_cond_add_8(z1 + 8, z1 + 8, a1, 0 - cb); + u += sp_2048_cond_add_8(z1 + 8, z1 + 8, b1, 0 - ca); + u += sp_2048_add_16(r + 8, r + 8, z1); - u += sp_2048_add_8(r + 16, r + 16, z2); - sp_2048_add_zero_8(r + 24, z2 + 8, u); + (void)sp_2048_add_word_8(r + 24, r + 24, u); } -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. (r = a + a) +/* Add digit to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. + * b A single precision integer. */ -static sp_digit sp_2048_dbl_8(sp_digit* r, const sp_digit* a) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 64\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x3", "x4", "x5", "x6", "x11" - ); - - return c; -} - -#else -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -static sp_digit sp_2048_dbl_8(sp_digit* r, const sp_digit* a) +static void sp_2048_add_word_16(sp_digit* r, const sp_digit* a, + sp_digit b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 16]\n\t" "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 48]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a) + "ldp x3, x4, [%[a], 64]\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 64]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 80]\n\t" + "ldp x3, x4, [%[a], 96]\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 96]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 112]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6" ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_2048_add_8(a1, a, &a[8]); - sp_2048_sqr_8(z1, a1); - sp_2048_sqr_8(z2, &a[8]); - sp_2048_sqr_8(z0, a); - sp_2048_mask_8(r + 16, a1, 0 - u); - u += sp_2048_dbl_8(r + 16, r + 16); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); - u += sp_2048_add_16(r + 8, r + 8, z1); - u += sp_2048_add_8(r + 16, r + 16, z2); - sp_2048_add_zero_8(r + 24, z2 + 8, u); } /* Sub b from a into a. (a -= b) @@ -1431,83 +1202,85 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* AND m into each word of a and store in r. +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. */ -static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<16; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 16; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Add digit to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static void sp_2048_add_zero_16(sp_digit* r, const sp_digit* a, - const sp_digit d) +static sp_digit sp_2048_cond_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) { __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 112]\n\t" - : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) - : "memory", "x3", "x4", "x5", "x6" + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and 
x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "ldp x10, x11, [%[b], 80]\n\t" + "ldp x4, x5, [%[a], 64]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 64]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 80]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "ldp x10, x11, [%[b], 112]\n\t" + "ldp x4, x5, [%[a], 96]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 96]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 112]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); + + return (sp_digit)r; } +#endif /* !WOLFSSL_SP_SMALL */ /* Multiply a and b into r. (r = a * b) * @@ -1522,113 +1295,1094 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; - sp_digit u, ca, cb; + sp_digit* z2 = r + 32; + sp_digit u; + sp_digit ca; + sp_digit cb; ca = sp_2048_add_16(a1, a, &a[16]); cb = sp_2048_add_16(b1, b, &b[16]); u = ca & cb; - sp_2048_mul_16(z1, a1, b1); + sp_2048_mul_16(z2, &a[16], &b[16]); sp_2048_mul_16(z0, a, b); - sp_2048_mask_16(r + 32, a1, 0 - cb); - sp_2048_mask_16(b1, b1, 0 - ca); - u += sp_2048_add_16(r + 32, r + 32, b1); - u += sp_2048_sub_in_place_32(z1, z2); + sp_2048_mul_16(z1, a1, b1); + u += sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(z1, z2); + u += sp_2048_cond_add_16(z1 + 16, z1 + 16, a1, 0 - cb); + u += sp_2048_cond_add_16(z1 + 16, z1 + 16, b1, 0 - ca); + u += sp_2048_add_32(r + 16, r + 16, z1); - u += sp_2048_add_16(r + 32, r + 32, z2); - sp_2048_add_zero_16(r + 48, z2 + 16, u); + (void)sp_2048_add_word_16(r + 48, r + 48, u); } -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. (r = a + a) +/* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
*/ -static sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a) +static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) { - sp_digit c = 0; - __asm__ __volatile__ ( - "add x11, %[a], 128\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" + "ldp x10, x11, [%[a], 0]\n\t" + "ldp x12, x13, [%[a], 16]\n\t" + "ldp x14, x15, [%[a], 32]\n\t" + "ldp x16, x17, [%[a], 48]\n\t" + "ldp x19, x20, [%[a], 64]\n\t" + "ldp x21, x22, [%[a], 80]\n\t" + "ldp x23, x24, [%[a], 96]\n\t" + "ldp x25, x26, [%[a], 112]\n\t" + "# A[0] * A[0]\n\t" + "mul x2, x10, x10\n\t" + "umulh x3, x10, x10\n\t" + "str x2, [%[r]]\n\t" + "mov x4, xzr\n\t" + "# A[0] * A[1]\n\t" + "mul x8, x10, x11\n\t" + "umulh x9, x10, x11\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "str x3, [%[r], 8]\n\t" + "# A[0] * A[2]\n\t" + "mul x8, x10, x12\n\t" + "adcs x4, x4, x9\n\t" + "umulh x9, x10, x12\n\t" + "adc x2, x2, xzr\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x8\n\t" + "# A[1] * A[1]\n\t" + "mul x8, x11, x11\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x11, x11\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x8\n\t" + "str x4, [%[r], 16]\n\t" + "# A[0] * A[3]\n\t" + "mul x8, x10, x13\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x10, x13\n\t" + "adc x3, x3, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, xzr, xzr\n\t" + "adds x2, x2, x8\n\t" + "# A[1] * A[2]\n\t" + "mul x8, x11, x12\n\t" + "adcs x3, x3, x9\n\t" + "umulh x9, x11, x12\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "str x2, [%[r], 24]\n\t" + "# A[0] * A[4]\n\t" + "mul x8, x10, x14\n\t" + "adcs x3, x3, x9\n\t" + "umulh x9, x10, x14\n\t" + "adc x4, x4, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "# A[1] * A[3]\n\t" + "mul x8, x11, x13\n\t" + "adcs x4, x4, x9\n\t" + "umulh x9, x11, x13\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "# A[2] * A[2]\n\t" + "mul x8, x12, x12\n\t" + "adcs x4, x4, x9\n\t" + "umulh x9, x12, x12\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "str x3, [%[r], 32]\n\t" + "# A[0] * A[5]\n\t" + "mul x5, x10, x15\n\t" + "adcs x4, x4, x9\n\t" + "umulh x6, x10, x15\n\t" + "adc x2, x2, xzr\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[4]\n\t" + "mul x8, x11, x14\n\t" + "umulh x9, x11, x14\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[3]\n\t" + "mul x8, x12, x13\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x13\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 40]\n\t" + "# A[0] * A[6]\n\t" + "mul x5, x10, x16\n\t" + "umulh x6, x10, x16\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[5]\n\t" + "mul x8, x11, x15\n\t" + "umulh x9, x11, x15\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[4]\n\t" + "mul x8, x12, x14\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x14\n\t" + "adc x7, x7, xzr\n\t" + "adds 
x5, x5, x8\n\t" + "# A[3] * A[3]\n\t" + "mul x8, x13, x13\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x13\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 48]\n\t" + "# A[0] * A[7]\n\t" + "mul x5, x10, x17\n\t" + "umulh x6, x10, x17\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[6]\n\t" + "mul x8, x11, x16\n\t" + "umulh x9, x11, x16\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[5]\n\t" + "mul x8, x12, x15\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x15\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[4]\n\t" + "mul x8, x13, x14\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x14\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 56]\n\t" + "# A[0] * A[8]\n\t" + "mul x5, x10, x19\n\t" + "umulh x6, x10, x19\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[7]\n\t" + "mul x8, x11, x17\n\t" + "umulh x9, x11, x17\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[6]\n\t" + "mul x8, x12, x16\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x16\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[5]\n\t" + "mul x8, x13, x15\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x15\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[4]\n\t" + "mul x8, x14, x14\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x14\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 64]\n\t" + "# A[0] * A[9]\n\t" + "mul x5, x10, x20\n\t" + "umulh x6, x10, x20\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[8]\n\t" + "mul x8, x11, x19\n\t" + "umulh x9, x11, x19\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[7]\n\t" + "mul x8, x12, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[6]\n\t" + "mul x8, x13, x16\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x16\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[5]\n\t" + "mul x8, x14, x15\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x15\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 72]\n\t" + "# A[0] * A[10]\n\t" + "mul x5, x10, x21\n\t" + "umulh x6, x10, x21\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[9]\n\t" + "mul x8, x11, x20\n\t" + "umulh x9, x11, x20\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[8]\n\t" + "mul x8, x12, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[7]\n\t" + "mul x8, x13, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[6]\n\t" + "mul x8, x14, x16\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x16\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[5]\n\t" + "mul x8, x15, x15\n\t" + "adcs x6, x6, 
x9\n\t" + "umulh x9, x15, x15\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 80]\n\t" + "# A[0] * A[11]\n\t" + "mul x5, x10, x22\n\t" + "umulh x6, x10, x22\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[10]\n\t" + "mul x8, x11, x21\n\t" + "umulh x9, x11, x21\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[9]\n\t" + "mul x8, x12, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[8]\n\t" + "mul x8, x13, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[7]\n\t" + "mul x8, x14, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[6]\n\t" + "mul x8, x15, x16\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x16\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 88]\n\t" + "# A[0] * A[12]\n\t" + "mul x5, x10, x23\n\t" + "umulh x6, x10, x23\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[11]\n\t" + "mul x8, x11, x22\n\t" + "umulh x9, x11, x22\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[10]\n\t" + "mul x8, x12, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[9]\n\t" + "mul x8, x13, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[8]\n\t" + "mul x8, x14, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[7]\n\t" + "mul x8, x15, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[6]\n\t" + "mul x8, x16, x16\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x16\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 96]\n\t" + "# A[0] * A[13]\n\t" + "mul x5, x10, x24\n\t" + "umulh x6, x10, x24\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[12]\n\t" + "mul x8, x11, x23\n\t" + "umulh x9, x11, x23\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[11]\n\t" + "mul x8, x12, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[10]\n\t" + "mul x8, x13, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[9]\n\t" + "mul x8, x14, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[8]\n\t" + "mul x8, x15, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[7]\n\t" + "mul x8, x16, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, 
x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 104]\n\t" + "# A[0] * A[14]\n\t" + "mul x5, x10, x25\n\t" + "umulh x6, x10, x25\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[13]\n\t" + "mul x8, x11, x24\n\t" + "umulh x9, x11, x24\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[12]\n\t" + "mul x8, x12, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[11]\n\t" + "mul x8, x13, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[10]\n\t" + "mul x8, x14, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[9]\n\t" + "mul x8, x15, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[8]\n\t" + "mul x8, x16, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[7]\n\t" + "mul x8, x17, x17\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x17\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 112]\n\t" + "# A[0] * A[15]\n\t" + "mul x5, x10, x26\n\t" + "umulh x6, x10, x26\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[14]\n\t" + "mul x8, x11, x25\n\t" + "umulh x9, x11, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[2] * A[13]\n\t" + "mul x8, x12, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x12, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[12]\n\t" + "mul x8, x13, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[11]\n\t" + "mul x8, x14, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[10]\n\t" + "mul x8, x15, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[9]\n\t" + "mul x8, x16, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[8]\n\t" + "mul x8, x17, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 120]\n\t" + "# A[1] * A[15]\n\t" + "mul x5, x11, x26\n\t" + "umulh x6, x11, x26\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[2] * A[14]\n\t" + "mul x8, x12, x25\n\t" + "umulh x9, x12, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[3] * A[13]\n\t" + "mul x8, x13, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x13, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[12]\n\t" + "mul x8, x14, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[11]\n\t" + "mul x8, x15, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[10]\n\t" + "mul x8, x16, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[9]\n\t" + "mul x8, x17, 
x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[8]\n\t" + "mul x8, x19, x19\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x19\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 128]\n\t" + "# A[2] * A[15]\n\t" + "mul x5, x12, x26\n\t" + "umulh x6, x12, x26\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[3] * A[14]\n\t" + "mul x8, x13, x25\n\t" + "umulh x9, x13, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[4] * A[13]\n\t" + "mul x8, x14, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x14, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[12]\n\t" + "mul x8, x15, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[11]\n\t" + "mul x8, x16, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[10]\n\t" + "mul x8, x17, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[9]\n\t" + "mul x8, x19, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 136]\n\t" + "# A[3] * A[15]\n\t" + "mul x5, x13, x26\n\t" + "umulh x6, x13, x26\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[4] * A[14]\n\t" + "mul x8, x14, x25\n\t" + "umulh x9, x14, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[5] * A[13]\n\t" + "mul x8, x15, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x15, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[12]\n\t" + "mul x8, x16, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[11]\n\t" + "mul x8, x17, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[10]\n\t" + "mul x8, x19, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[9] * A[9]\n\t" + "mul x8, x20, x20\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x20, x20\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 144]\n\t" + "# A[4] * A[15]\n\t" + "mul x5, x14, x26\n\t" + "umulh x6, x14, x26\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[5] * A[14]\n\t" + "mul x8, x15, x25\n\t" + "umulh x9, x15, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[6] * A[13]\n\t" + "mul x8, x16, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x16, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[12]\n\t" + "mul x8, x17, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[11]\n\t" + "mul x8, x19, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[9] * A[10]\n\t" + "mul x8, x20, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x20, 
x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 152]\n\t" + "# A[5] * A[15]\n\t" + "mul x5, x15, x26\n\t" + "umulh x6, x15, x26\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[6] * A[14]\n\t" + "mul x8, x16, x25\n\t" + "umulh x9, x16, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[7] * A[13]\n\t" + "mul x8, x17, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x17, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[12]\n\t" + "mul x8, x19, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[9] * A[11]\n\t" + "mul x8, x20, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x20, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[10] * A[10]\n\t" + "mul x8, x21, x21\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x21, x21\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 160]\n\t" + "# A[6] * A[15]\n\t" + "mul x5, x16, x26\n\t" + "umulh x6, x16, x26\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[7] * A[14]\n\t" + "mul x8, x17, x25\n\t" + "umulh x9, x17, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[8] * A[13]\n\t" + "mul x8, x19, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x19, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[9] * A[12]\n\t" + "mul x8, x20, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x20, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[10] * A[11]\n\t" + "mul x8, x21, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x21, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 168]\n\t" + "# A[7] * A[15]\n\t" + "mul x5, x17, x26\n\t" + "umulh x6, x17, x26\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[8] * A[14]\n\t" + "mul x8, x19, x25\n\t" + "umulh x9, x19, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[9] * A[13]\n\t" + "mul x8, x20, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x20, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[10] * A[12]\n\t" + "mul x8, x21, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x21, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[11] * A[11]\n\t" + "mul x8, x22, x22\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x22, x22\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 176]\n\t" + "# A[8] * A[15]\n\t" + "mul x5, x19, x26\n\t" + "umulh x6, x19, x26\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[9] * A[14]\n\t" + "mul x8, x20, x25\n\t" + "umulh x9, x20, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[10] * A[13]\n\t" + "mul x8, x21, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x21, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[11] * A[12]\n\t" + "mul x8, x22, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x22, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" 
+ "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 184]\n\t" + "# A[9] * A[15]\n\t" + "mul x5, x20, x26\n\t" + "umulh x6, x20, x26\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[10] * A[14]\n\t" + "mul x8, x21, x25\n\t" + "umulh x9, x21, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[11] * A[13]\n\t" + "mul x8, x22, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x22, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "# A[12] * A[12]\n\t" + "mul x8, x23, x23\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x23, x23\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 192]\n\t" + "# A[10] * A[15]\n\t" + "mul x5, x21, x26\n\t" + "umulh x6, x21, x26\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[11] * A[14]\n\t" + "mul x8, x22, x25\n\t" + "umulh x9, x22, x25\n\t" + "adds x5, x5, x8\n\t" + "# A[12] * A[13]\n\t" + "mul x8, x23, x24\n\t" + "adcs x6, x6, x9\n\t" + "umulh x9, x23, x24\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 200]\n\t" + "# A[11] * A[15]\n\t" + "mul x8, x22, x26\n\t" + "umulh x9, x22, x26\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x8\n\t" + "# A[12] * A[14]\n\t" + "mul x8, x23, x25\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x23, x25\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x8\n\t" + "# A[13] * A[13]\n\t" + "mul x8, x24, x24\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x24, x24\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x8\n\t" + "str x4, [%[r], 208]\n\t" + "# A[12] * A[15]\n\t" + "mul x8, x23, x26\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x23, x26\n\t" + "adc x3, x3, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, xzr, xzr\n\t" + "adds x2, x2, x8\n\t" + "# A[13] * A[14]\n\t" + "mul x8, x24, x25\n\t" + "adcs x3, x3, x9\n\t" + "umulh x9, x24, x25\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "str x2, [%[r], 216]\n\t" + "# A[13] * A[15]\n\t" + "mul x8, x24, x26\n\t" + "adcs x3, x3, x9\n\t" + "umulh x9, x24, x26\n\t" + "adc x4, x4, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "# A[14] * A[14]\n\t" + "mul x8, x25, x25\n\t" + "adcs x4, x4, x9\n\t" + "umulh x9, x25, x25\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "str x3, [%[r], 224]\n\t" + "# A[14] * A[15]\n\t" + "mul x8, x25, x26\n\t" + "adcs x4, x4, x9\n\t" + "umulh x9, x25, x26\n\t" + "adc x2, x2, xzr\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x8\n\t" + "str x4, [%[r], 232]\n\t" + "# A[15] * A[15]\n\t" + "mul x8, x26, x26\n\t" + "adcs x2, x2, x9\n\t" + "umulh x9, x26, x26\n\t" + "adc x3, x3, xzr\n\t" + "adds x2, x2, x8\n\t" + "adc x3, x3, x9\n\t" + "stp x2, x3, [%[r], 240]\n\t" : - : "memory", "x3", "x4", "x5", "x6", "x11" + : [r] "r" (r), [a] "r" (a) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", 
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" ); - - return c; } -#else -/* Double a into r. (r = a + a) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. + * b A single precision integer. */ -static sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a) +static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 16]\n\t" "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 48]\n\t" "ldp x3, x4, [%[a], 64]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 80]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 88]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 64]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 80]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 80]\n\t" "ldp x3, x4, [%[a], 96]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 112]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 120]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 96]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 112]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 112]\n\t" - "cset %[r], cs\n\t" + "csetm %[r], cc\n\t" : [r] "+r" (r) - : [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6" + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); return (sp_digit)r; } -#endif /* WOLFSSL_SP_SMALL */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -1637,22 +2391,31 @@ static sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit z1[32]; - sp_digit a1[16]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 16; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_2048_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_16(a1, p1, p2); - u = sp_2048_add_16(a1, a, &a[16]); - sp_2048_sqr_16(z1, a1); sp_2048_sqr_16(z2, &a[16]); sp_2048_sqr_16(z0, a); - sp_2048_mask_16(r + 32, a1, 0 - u); - u += sp_2048_dbl_16(r + 32, r + 32); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); - u += sp_2048_add_32(r + 16, r + 16, z1); - u += sp_2048_add_16(r + 32, r + 32, z2); - sp_2048_add_zero_16(r + 48, z2 + 16, u); + sp_2048_sqr_16(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_32(z1, z2); + u -= sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(r + 16, z1); + sp_2048_add_word_16(r + 48, r + 48, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -1742,10 +2505,10 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[64]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 248\n\t" "csel x3, xzr, x3, cc\n\t" @@ -1791,10 +2554,10 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) sp_digit tmp[64]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 248\n\t" "csel x3, xzr, x3, cc\n\t" @@ -1849,23 +2612,6 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) #endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) -{ - int i; - - for (i=0; i<16; i++) { - r[i] = a[i] & m; - } -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * * r A single precision integer. 
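The reworked sp_2048_sqr_32 in the hunk above computes |a_lo - a_hi| without branching by selecting the operand order with the borrow mask, then uses a_lo^2 + a_hi^2 - (a_lo - a_hi)^2 = 2*a_lo*a_hi as the middle Karatsuba term, so no separate doubling pass is needed. Below is a one-word model of both steps, assuming unsigned __int128 (GCC/Clang); the C comparison is only a stand-in for the flag-derived mask the assembly produces, and the code is illustrative rather than the wolfSSL implementation.

#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

int main(void)
{
    uint64_t a  = 0x0123456789abcdefULL;
    uint64_t lo = a & 0xffffffffULL;
    uint64_t hi = a >> 32;

    /* Constant-time style |lo - hi|: all-ones mask on borrow, then a
     * masked negate, the one-word analogue of the p1/p2 selection. */
    uint64_t diff = lo - hi;
    uint64_t mask = (uint64_t)0 - (uint64_t)(lo < hi);
    uint64_t d    = (diff ^ mask) - mask;

    u128 z0  = (u128)lo * lo;
    u128 z2  = (u128)hi * hi;
    u128 mid = z0 + z2 - (u128)d * d;        /* equals 2*lo*hi */

    u128 sq = (z2 << 64) + (mid << 32) + z0;
    printf("identity %s\n", sq == (u128)a * a ? "holds" : "broken");
    return 0;
}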
@@ -1951,10 +2697,10 @@ static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[32]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 120\n\t" "csel x3, xzr, x3, cc\n\t" @@ -2000,10 +2746,10 @@ static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) sp_digit tmp[32]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 120\n\t" "csel x3, xzr, x3, cc\n\t" @@ -2094,9 +2840,9 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -2120,275 +2866,275 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * 
B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[16] * B\n\t" - "ldp x8, x9, [%[a], 128]\n\t" + "ldp x9, x10, [%[a], 128]\n\t" "str x3, [%[r], 120]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[17] * B\n\t" "str x4, [%[r], 128]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[18] * B\n\t" - "ldp x8, x9, [%[a], 144]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" "str x5, [%[r], 136]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[19] * B\n\t" "str x3, [%[r], 144]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[20] * B\n\t" - "ldp x8, x9, [%[a], 160]\n\t" + "ldp x9, x10, [%[a], 160]\n\t" "str x4, [%[r], 152]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[21] * B\n\t" "str x5, [%[r], 160]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" 
+ "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[22] * B\n\t" - "ldp x8, x9, [%[a], 176]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" "str x3, [%[r], 168]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[23] * B\n\t" "str x4, [%[r], 176]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[24] * B\n\t" - "ldp x8, x9, [%[a], 192]\n\t" + "ldp x9, x10, [%[a], 192]\n\t" "str x5, [%[r], 184]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[25] * B\n\t" "str x3, [%[r], 192]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[26] * B\n\t" - "ldp x8, x9, [%[a], 208]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" "str x4, [%[r], 200]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[27] * B\n\t" "str x5, [%[r], 208]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[28] * B\n\t" - "ldp x8, x9, [%[a], 224]\n\t" + "ldp x9, x10, [%[a], 224]\n\t" "str x3, [%[r], 216]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[29] * B\n\t" "str x4, [%[r], 224]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[30] * B\n\t" - "ldp x8, x9, [%[a], 240]\n\t" + "ldp x9, x10, [%[a], 240]\n\t" "str x5, [%[r], 232]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[31] * B\n\t" "str x3, [%[r], 240]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "adc x5, x5, x7\n\t" "stp x4, x5, [%[r], 248]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } @@ -2417,226 +3163,202 @@ static void sp_2048_mont_norm_16(sp_digit* r, const sp_digit* m) SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, 
x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" - "mov x3, xzr\n\t" - "# i = 16\n\t" - "mov x4, 16\n\t" "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "ldp x16, x17, [%[a], 32]\n\t" + "ldp x19, x20, [%[a], 48]\n\t" + "ldp x21, x22, [%[a], 64]\n\t" + "ldp x23, x24, [%[a], 80]\n\t" + "ldp x25, x26, [%[a], 96]\n\t" + "ldp x27, x28, [%[a], 112]\n\t" + "mov x3, xzr\n\t" + "# i = 0..15\n\t" + "mov x4, 16\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" "mul x9, %[mp], x12\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" + "ldp x10, x11, [%[m], 0]\n\t" + "mul x7, x10, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x12, x12, x7\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" + "mul x7, x11, x9\n\t" + "umulh x8, x11, x9\n\t" "adds x12, x13, x7\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" + "ldp x11, x10, [%[m], 16]\n\t" "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x13, x14, x7\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x14, x15, x7\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" + "ldp x11, x10, [%[m], 32]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" + "adds x14, x14, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x15, x16, x7\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" + "adds x15, x15, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x16, x17, x7\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" + "ldp x11, x10, [%[m], 48]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" + "adds x16, x16, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x17, x19, x7\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" + "adds x17, x17, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x19, x20, x7\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" + "ldp x11, x10, [%[m], 64]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" + "adds x19, x19, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x20, x21, x7\n\t" "# a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x24, x9\n\t" + "adds x20, x20, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x21, x22, x7\n\t" "# a[i+10] += m[10] * mu\n\t" 
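Each pass of the reduction loop above forms mu = a[i] * mp and then adds mu * m; with mp set up as -m[0]^-1 mod 2^64 (the usual Montgomery constant, computed elsewhere in this file), the low word cancels exactly, which is what lets the loop retire one word per iteration. A small stand-alone check of that invariant follows; the Newton iteration is only for the demo and is not how wolfSSL derives mp.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t m0 = 0xd8ff662211aabb01ULL;  /* any odd low modulus word */
    uint64_t a0 = 0x0123456789abcdefULL;  /* current low word of a    */

    /* inv = m0^-1 mod 2^64 by Newton iteration (odd m0 only). */
    uint64_t inv = m0;                    /* correct to 3 bits        */
    for (int i = 0; i < 5; i++)
        inv *= 2 - m0 * inv;              /* doubles the correct bits */

    uint64_t mp = (uint64_t)0 - inv;      /* -m0^-1 mod 2^64          */
    uint64_t mu = a0 * mp;

    /* a0 + mu*m0 is 0 mod 2^64, so this word can be retired. */
    printf("low word after step: 0x%016llx\n",
           (unsigned long long)(a0 + mu * m0));
    return 0;
}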
- "ldr x11, [%[a], 80]\n\t" + "ldp x11, x10, [%[m], 80]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" + "adds x21, x21, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x22, x23, x7\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" + "adds x22, x22, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x23, x24, x7\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" + "ldp x11, x10, [%[m], 96]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" + "adds x23, x23, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x24, x25, x7\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" + "adds x24, x24, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x25, x26, x7\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" + "ldp x11, x10, [%[m], 112]\n\t" "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" + "adds x25, x25, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x26, x27, x7\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" + "ldr x10, [%[m], 120]\n\t" "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" + "adds x26, x26, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "str x11, [%[a], 112]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 128]\n\t" - "str x10, [%[a], 120]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 128]\n\t" + "cset x3, cs\n\t" + "adds x27, x28, x6\n\t" + "ldr x28, [%[a], 128]\n\t" + "adcs x28, x28, x8\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" "# Create mask\n\t" "neg x3, x3\n\t" "mov x9, %[a]\n\t" "sub %[a], %[a], 128\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, [x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs 
x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "subs x12, x12, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x13, x13, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "sbcs x14, x14, x6\n\t" + "stp x12, x13, [%[a], 0]\n\t" + "sbcs x15, x15, x7\n\t" + "stp x14, x15, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x16, x16, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x17, x17, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x19, x19, x6\n\t" + "stp x16, x17, [%[a], 32]\n\t" + "sbcs x20, x20, x7\n\t" + "stp x19, x20, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x21, x21, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x22, x22, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x23, x23, x6\n\t" + "stp x21, x22, [%[a], 64]\n\t" + "sbcs x24, x24, x7\n\t" + "stp x23, x24, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x25, x25, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x26, x26, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x27, x27, x6\n\t" + "stp x25, x26, [%[a], 96]\n\t" + "sbcs x28, x28, x7\n\t" + "stp x27, x28, [%[a], 112]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } @@ -2650,7 +3372,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_16(r, a, b); @@ -2664,7 +3386,7 @@ static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
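sp_2048_mont_reduce_16 above now keeps the 16-word window in registers and reloads the modulus with ldp pairs, but the algorithm is the usual word-by-word Montgomery reduction finished by a mask-selected subtraction of the modulus. A compact reference of that structure is sketched below, assuming unsigned __int128 (GCC/Clang); like the assembly it can leave the result partially reduced (correct modulo m, not necessarily below m), and none of the names are wolfSSL APIs.

#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;

/* Reduce the 2n-word value a by m: a[0..n-1] = a / 2^(64n) mod m.
 * mp is -m[0]^-1 mod 2^64. */
static void mont_reduce(uint64_t* a, const uint64_t* m, uint64_t mp, size_t n)
{
    uint64_t over = 0;                       /* deferred top carry */

    for (size_t i = 0; i < n; i++) {
        uint64_t mu = a[i] * mp;             /* a[i] + mu*m[0] == 0 mod 2^64 */
        uint64_t carry = 0;
        for (size_t j = 0; j < n; j++) {
            u128 t = (u128)mu * m[j] + a[i + j] + carry;
            a[i + j] = (uint64_t)t;
            carry    = (uint64_t)(t >> 64);
        }
        u128 t = (u128)a[i + n] + carry + over;
        a[i + n] = (uint64_t)t;
        over     = (uint64_t)(t >> 64);
    }

    /* Subtract m once, keyed off the final carry, exactly like the
     * masked sbcs sequence; the result lands in a[0..n-1]. */
    uint64_t mask = (uint64_t)0 - over;
    uint64_t borrow = 0;
    for (size_t i = 0; i < n; i++) {
        u128 t = (u128)a[n + i] - (m[i] & mask) - borrow;
        a[i]   = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
}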
*/ -static void sp_2048_mont_sqr_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_16(r, a); @@ -2788,9 +3510,9 @@ static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -2814,175 +3536,178 @@ static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh 
x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "adc x4, x4, x7\n\t" "stp x3, x4, [%[r], 120]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. 
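The updated comment above spells out the precondition: the divisor has its top bit set. The assembly that follows builds the quotient from 32-bit estimates with corrections because AArch64 has no 128-by-64-bit divide instruction; the operation being implemented is simply the division sketched here with unsigned __int128 (GCC/Clang). This is a reference only: the quotient fits in 64 bits when d1 < div, which the callers arrange, and div_word_ref is not a wolfSSL routine.

#include <stdint.h>

/* (d1:d0) / div, assuming the quotient fits in 64 bits. */
static uint64_t div_word_ref(uint64_t d1, uint64_t d0, uint64_t div)
{
    unsigned __int128 n = ((unsigned __int128)d1 << 64) | d0;
    return (uint64_t)(n / div);
}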
*/ static sp_digit div_2048_word_16(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -2991,23 +3716,52 @@ static sp_digit div_2048_word_16(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<16; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 16; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif } /* Compare a with b in constant time. 
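The constant-time comparison rewritten in the next hunk scans from the most significant words down and folds each word's outcome into two running masks (result-so-far and still-equal) using csel/csetm rather than branches. A C model of that accumulation is below; the comparisons in C are only a stand-in for the flag tricks, and cmp_n is an illustrative name rather than the wolfSSL function.

#include <stdint.h>

/* Compare n-word a and b, most significant word first: -1, 0 or 1. */
static int64_t cmp_n(const uint64_t* a, const uint64_t* b, int n)
{
    uint64_t lt = 0;                    /* all-ones once a < b is decided     */
    uint64_t eq = (uint64_t)0 - 1;      /* all-ones while the prefix is equal */

    for (int i = n - 1; i >= 0; i--) {
        lt |= eq & ((uint64_t)0 - (uint64_t)(a[i] < b[i]));
        eq &= (uint64_t)0 - (uint64_t)(a[i] == b[i]);
    }
    return (int64_t)(lt | (uint64_t)(eq == 0));
}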
@@ -3021,147 +3775,139 @@ static sp_int64 sp_2048_cmp_16(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 120\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #16\n\t" + "add %[a], %[a], #112\n\t" + "add %[b], %[b], #112\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and 
x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -3188,7 +3934,7 @@ static WC_INLINE int sp_2048_div_16(const sp_digit* a, const sp_digit* d, sp_dig div = d[15]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 16); - for (i=15; i>=0; i--) { + for (i = 15; i >= 0; i--) { sp_digit hi = t1[16 + i] - (t1[16 + i] == div); r1 = div_2048_word_16(hi, t1[16 + i - 1], div); @@ -3575,442 +4321,408 @@ static void 
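The rewritten sp_2048_cmp_16 (and sp_2048_cmp_32 later in this patch) replaces the masked-subtract chain with a csetm/orr accumulator that remembers only the first differing word. A hedged portable sketch of the same logic is below; it is illustrative only, since plain C comparison operators are not guaranteed to compile branch-free the way the csel/csetm sequence is, and it assumes sp_digit is a 64-bit word as on this target.

#include <stdint.h>

/* Returns > 0 when a > b, 0 when equal, < 0 when a < b, scanning from the
 * most significant word down and keeping only the first difference, as the
 * new comparison assembly does. */
static int64_t cmp_words_sketch(const uint64_t* a, const uint64_t* b, int n)
{
    uint64_t still_eq = (uint64_t)-1; /* all ones while every word matched  */
    uint64_t saw_lt = 0;              /* all ones once a < b has been found */
    int i;

    for (i = n - 1; i >= 0; i--) {
        uint64_t lt = (uint64_t)0 - (uint64_t)(a[i] < b[i]);
        uint64_t eq = (uint64_t)0 - (uint64_t)(a[i] == b[i]);

        saw_lt |= lt & still_eq;   /* only counts while higher words matched */
        still_eq &= eq;            /* cleared for good once any word differs */
    }
    /* 1 if some word differed, merged with -1 when that difference was a < b */
    return (int64_t)((uint64_t)(still_eq == 0) | saw_lt);
}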
sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" + "ldp x11, x12, [%[a], 0]\n\t" + "ldp x13, x14, [%[a], 16]\n\t" + "ldp x15, x16, [%[a], 32]\n\t" + "ldp x17, x19, [%[a], 48]\n\t" + "ldp x20, x21, [%[a], 64]\n\t" + "ldp x22, x23, [%[a], 80]\n\t" + "# No carry yet\n\t" "mov x3, xzr\n\t" - "# i = 32\n\t" + "# i = 0..31\n\t" "mov x4, 32\n\t" - "ldp x12, x13, [%[a], 0]\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" - "mul x9, %[mp], x12\n\t" + "mul x10, %[mp], x11\n\t" + "ldp x24, x25, [%[m], 0]\n\t" + "ldp x26, x27, [%[m], 16]\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" - "adds x12, x12, x7\n\t" + "mul x5, x24, x10\n\t" + "umulh x6, x24, x10\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x12, x13, x7\n\t" + "adds x11, x11, x5\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x11, x12, x5\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" - "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" - "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x11, x11, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x12, x13, x5\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" - "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" - "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "adds x12, x12, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x13, x14, x5\n\t" + "ldp x24, x25, [%[m], 32]\n\t" + "ldp x26, x27, [%[m], 48]\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x13, x13, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x14, x15, x5\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "adds x14, x14, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x15, x16, x5\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x15, x15, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x16, x17, x5\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "adds x16, x16, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x17, x19, x5\n\t" + "ldp x24, 
x25, [%[m], 64]\n\t" + "ldp x26, x27, [%[m], 80]\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x17, x17, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x19, x20, x5\n\t" "# a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x24, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "adds x19, x19, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x20, x21, x5\n\t" "# a[i+10] += m[10] * mu\n\t" - "ldr x11, [%[a], 80]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x20, x20, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x21, x22, x5\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "adds x21, x21, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x22, x23, x5\n\t" + "ldp x24, x25, [%[m], 96]\n\t" + "ldp x26, x27, [%[m], 112]\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x22, x22, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "ldr x23, [%[a], 96]\n\t" + "umulh x6, x24, x10\n\t" + "adds x23, x23, x5\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "adds x23, x23, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "ldp x8, x9, [%[a], 104]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 104]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 112]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 112]\n\t" + "ldp x8, x9, [%[a], 120]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 128]\n\t" + "ldp x26, x27, [%[m], 144]\n\t" "# a[i+16] += m[16] * mu\n\t" - "ldr x11, [%[a], 128]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 128]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, 
xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 120]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 120]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+17] += m[17] * mu\n\t" - "ldr x10, [%[a], 136]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 136]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 128]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 128]\n\t" + "ldp x8, x9, [%[a], 136]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+18] += m[18] * mu\n\t" - "ldr x11, [%[a], 144]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 144]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 136]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 136]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+19] += m[19] * mu\n\t" - "ldr x10, [%[a], 152]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 152]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 144]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 144]\n\t" + "ldp x8, x9, [%[a], 152]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 160]\n\t" + "ldp x26, x27, [%[m], 176]\n\t" "# a[i+20] += m[20] * mu\n\t" - "ldr x11, [%[a], 160]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 160]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 152]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 152]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+21] += m[21] * mu\n\t" - "ldr x10, [%[a], 168]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 168]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 160]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 160]\n\t" + "ldp x8, x9, [%[a], 168]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+22] += m[22] * mu\n\t" - "ldr x11, [%[a], 176]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 176]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 168]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 168]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+23] += m[23] * mu\n\t" - "ldr x10, [%[a], 184]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 184]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 176]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 176]\n\t" + "ldp x8, x9, [%[a], 184]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 192]\n\t" + "ldp x26, x27, [%[m], 208]\n\t" "# a[i+24] += m[24] * mu\n\t" - "ldr x11, [%[a], 
192]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 192]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 184]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 184]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+25] += m[25] * mu\n\t" - "ldr x10, [%[a], 200]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 200]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 192]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 192]\n\t" + "ldp x8, x9, [%[a], 200]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+26] += m[26] * mu\n\t" - "ldr x11, [%[a], 208]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 208]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 200]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 200]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+27] += m[27] * mu\n\t" - "ldr x10, [%[a], 216]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 216]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 208]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 208]\n\t" + "ldp x8, x9, [%[a], 216]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 224]\n\t" + "ldp x26, x27, [%[m], 240]\n\t" "# a[i+28] += m[28] * mu\n\t" - "ldr x11, [%[a], 224]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 224]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 216]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 216]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+29] += m[29] * mu\n\t" - "ldr x10, [%[a], 232]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 232]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 224]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 224]\n\t" + "ldp x8, x9, [%[a], 232]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+30] += m[30] * mu\n\t" - "ldr x11, [%[a], 240]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 240]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 232]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 232]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+31] += m[31] * mu\n\t" - "ldr x10, [%[a], 248]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 248]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x8, x8, x3\n\t" - "str x11, [%[a], 240]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 256]\n\t" - "str x10, [%[a], 248]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 
256]\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 240]\n\t" + "umulh x7, x27, x10\n\t" + "ldp x8, x9, [%[a], 248]\n\t" + "adds x5, x5, x6\n\t" + "adcs x7, x7, x3\n\t" + "cset x3, cs\n\t" + "adds x8, x8, x5\n\t" + "str x8, [%[a], 248]\n\t" + "adcs x9, x9, x7\n\t" + "str x9, [%[a], 256]\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" - "mov x9, %[a]\n\t" + "mov %[mp], %[a]\n\t" "sub %[a], %[a], 256\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, [x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" - "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - "ldp x5, x6, [%[m], 128]\n\t" - "ldp x7, x8, [%[m], 144]\n\t" - "ldp x12, x13, [x9, 128]\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 144]\n\t" + "subs x11, x11, x4\n\t" "and x6, x6, x3\n\t" "sbcs x12, x12, x5\n\t" "and x7, x7, x3\n\t" "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 128]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 144]\n\t" - "ldp x5, x6, [%[m], 160]\n\t" - "ldp x7, x8, [%[m], 176]\n\t" - "ldp x12, x13, [x9, 160]\n\t" + "stp x11, x12, [%[a], 0]\n\t" + "sbcs x14, x14, x7\n\t" + "stp x13, x14, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 176]\n\t" + "sbcs x15, x15, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x16, x16, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 160]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 176]\n\t" - "ldp x5, x6, [%[m], 192]\n\t" - "ldp x7, x8, [%[m], 208]\n\t" - "ldp x12, x13, [x9, 192]\n\t" + "sbcs x17, x17, x6\n\t" + "stp x15, x16, [%[a], 32]\n\t" + "sbcs x19, x19, x7\n\t" + "stp x17, x19, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 208]\n\t" + "sbcs x20, x20, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x21, x21, 
x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 192]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 208]\n\t" - "ldp x5, x6, [%[m], 224]\n\t" - "ldp x7, x8, [%[m], 240]\n\t" - "ldp x12, x13, [x9, 224]\n\t" + "sbcs x22, x22, x6\n\t" + "stp x20, x21, [%[a], 64]\n\t" + "sbcs x23, x23, x7\n\t" + "stp x22, x23, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "ldp x8, x9, [%[mp], 96]\n\t" + "ldp x10, x11, [%[mp], 112]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 240]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 96]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 224]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 240]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "stp x10, x11, [%[a], 112]\n\t" + "ldp x4, x5, [%[m], 128]\n\t" + "ldp x6, x7, [%[m], 144]\n\t" + "ldp x8, x9, [%[mp], 128]\n\t" + "ldp x10, x11, [%[mp], 144]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 128]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 144]\n\t" + "ldp x4, x5, [%[m], 160]\n\t" + "ldp x6, x7, [%[m], 176]\n\t" + "ldp x8, x9, [%[mp], 160]\n\t" + "ldp x10, x11, [%[mp], 176]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 160]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 176]\n\t" + "ldp x4, x5, [%[m], 192]\n\t" + "ldp x6, x7, [%[m], 208]\n\t" + "ldp x8, x9, [%[mp], 192]\n\t" + "ldp x10, x11, [%[mp], 208]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 192]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 208]\n\t" + "ldp x4, x5, [%[m], 224]\n\t" + "ldp x6, x7, [%[m], 240]\n\t" + "ldp x8, x9, [%[mp], 224]\n\t" + "ldp x10, x11, [%[mp], 240]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 224]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 240]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x10", "x8", "x9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } @@ -4024,7 +4736,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_32(r, a, b); @@ -4038,7 +4750,7 @@ static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_32(r, a); @@ -4183,38 +4895,38 @@ static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. */ -static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +static sp_digit div_2048_word_32_cond(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "b.lt 1f\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + "1:\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -4223,23 +4935,25 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "cmp x3, x5\n\t" + "b.lt 2f\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + "2:\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); - return r; + return d1; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -4262,9 +4976,13 @@ static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, s div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { - sp_digit hi = t1[32 + i] - (t1[32 + i] == div); - r1 = div_2048_word_32(hi, t1[32 + i - 1], div); + for (i = 31; i >= 0; i--) { + if (t1[32 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_2048_word_32_cond(t1[32 + i], t1[32 + i - 1], div); + } sp_2048_mul_d_32(t2, d, r1); t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2); @@ -4461,6 +5179,67 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig #endif /* WOLFSSL_SP_SMALL */ } +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. 
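Both div_2048_word_32_cond above and the branch-free div_2048_word_32 added below return the same value; the difference is only whether the 32-bit quotient corrections are taken with branches or with csetm masks. A hedged reference model of that value, using the compiler's 128-bit arithmetic rather than the constant-time sequence:

#include <stdint.h>

/* Quotient of the double word (d1:d0) divided by div.  Assumes, as the real
 * routines do, that div has its most significant bit set and that d1 < div,
 * so the quotient fits in a single 64-bit word.  Reference only, not
 * constant time. */
static uint64_t div_word_ref(uint64_t d1, uint64_t d0, uint64_t div)
{
    unsigned __int128 d = ((unsigned __int128)d1 << 64) | d0;

    return (uint64_t)(d / div);
}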
+ */ +static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +{ + __asm__ __volatile__ ( + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" + + "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" + "lsl x6, x3, 32\n\t" + "mul x4, %[div], x6\n\t" + "umulh x3, %[div], x6\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "umulh x3, %[div], x3\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + + "udiv x3, %[d0], %[div]\n\t" + "add %[d1], x6, x3\n\t" + + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return d1; +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -4502,259 +5281,235 @@ static sp_int64 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 248\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #32\n\t" + "add %[a], %[a], #240\n\t" + "add %[b], %[b], #240\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "ldp x7, x8, [%[b], 240]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 240]\n\t" + "ldp x8, x9, [%[b], 240]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 224]\n\t" + "ldp x8, x9, [%[b], 224]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, 
x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "ldp x7, x8, [%[b], 208]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 208]\n\t" + "ldp x8, x9, [%[b], 208]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 192]\n\t" + "ldp x8, x9, [%[b], 192]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "ldp x7, x8, [%[b], 176]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "ldp x8, x9, [%[b], 176]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 160]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "ldp x7, x8, [%[b], 144]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "ldp x8, x9, [%[b], 144]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "and x6, x6, x4\n\t" - "and 
x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 128]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, 
x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -4781,7 +5536,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_2048_word_32(hi, t1[32 + i - 1], div); @@ -4832,9 +5587,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[8 * 64]; + sp_digit td[32 * 64]; #endif - sp_digit* t[8]; + sp_digit* t[32]; sp_digit* norm = NULL; sp_digit mp = 1; sp_digit n; @@ -4853,7 +5608,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 64), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 64), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -4862,7 +5617,7 @@ 
static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<8; i++) { + for (i=0; i<32; i++) { t[i] = td + i * 64; } @@ -4889,6 +5644,30 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp); sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp); sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp); + sp_2048_mont_sqr_32(t[ 8], t[ 4], m, mp); + sp_2048_mont_mul_32(t[ 9], t[ 5], t[ 4], m, mp); + sp_2048_mont_sqr_32(t[10], t[ 5], m, mp); + sp_2048_mont_mul_32(t[11], t[ 6], t[ 5], m, mp); + sp_2048_mont_sqr_32(t[12], t[ 6], m, mp); + sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp); + sp_2048_mont_sqr_32(t[14], t[ 7], m, mp); + sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_32(t[16], t[ 8], m, mp); + sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_32(t[18], t[ 9], m, mp); + sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_32(t[20], t[10], m, mp); + sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_32(t[22], t[11], m, mp); + sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_32(t[24], t[12], m, mp); + sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_32(t[26], t[13], m, mp); + sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_32(t[28], t[14], m, mp); + sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_32(t[30], t[15], m, mp); + sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -4896,9 +5675,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - c -= bits % 3; + c -= bits % 5; if (c == 64) { - c = 61; + c = 59; } if (c < 0) { /* Number of bits in top word is less than number needed. 
*/ @@ -4918,27 +5697,29 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=3; ) { + for (; i>=0 || c>=5; ) { if (c == 0) { n = e[i--]; - y = (byte)(n >> 61); - n <<= 3; - c = 61; + y = (byte)(n >> 59); + n <<= 5; + c = 59; } - else if (c < 3) { - y = (byte)(n >> 61); + else if (c < 5) { + y = (byte)(n >> 59); n = e[i--]; - c = 3 - c; + c = 5 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; } else { - y = (byte)((n >> 61) & 0x7); - n <<= 3; - c -= 3; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; } + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); @@ -4978,9 +5759,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[16 * 64]; + sp_digit td[64 * 64]; #endif - sp_digit* t[16]; + sp_digit* t[64]; sp_digit* norm = NULL; sp_digit mp = 1; sp_digit n; @@ -4999,7 +5780,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 64), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (64 * 64), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -5008,7 +5789,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<16; i++) { + for (i=0; i<64; i++) { t[i] = td + i * 64; } @@ -5043,6 +5824,54 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_32(t[14], t[ 7], m, mp); sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_32(t[16], t[ 8], m, mp); + sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_32(t[18], t[ 9], m, mp); + sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_32(t[20], t[10], m, mp); + sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_32(t[22], t[11], m, mp); + sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_32(t[24], t[12], m, mp); + sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_32(t[26], t[13], m, mp); + sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_32(t[28], t[14], m, mp); + sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_32(t[30], t[15], m, mp); + sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp); + sp_2048_mont_sqr_32(t[32], t[16], m, mp); + sp_2048_mont_mul_32(t[33], t[17], t[16], m, mp); + sp_2048_mont_sqr_32(t[34], t[17], m, mp); + sp_2048_mont_mul_32(t[35], t[18], t[17], m, mp); + sp_2048_mont_sqr_32(t[36], t[18], m, mp); + sp_2048_mont_mul_32(t[37], t[19], t[18], m, mp); + sp_2048_mont_sqr_32(t[38], t[19], m, mp); + sp_2048_mont_mul_32(t[39], t[20], t[19], m, mp); + sp_2048_mont_sqr_32(t[40], t[20], m, mp); + sp_2048_mont_mul_32(t[41], t[21], t[20], m, mp); + sp_2048_mont_sqr_32(t[42], t[21], m, mp); + sp_2048_mont_mul_32(t[43], t[22], t[21], m, mp); + sp_2048_mont_sqr_32(t[44], t[22], m, mp); + sp_2048_mont_mul_32(t[45], t[23], t[22], m, mp); + sp_2048_mont_sqr_32(t[46], t[23], m, mp); + sp_2048_mont_mul_32(t[47], t[24], t[23], m, mp); + sp_2048_mont_sqr_32(t[48], t[24], m, mp); + 
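The exponentiation loops now consume 5 bits per iteration with a 32-entry table (and 6 bits with 64 entries in the larger variant whose table is being built here), trading precomputation for fewer Montgomery multiplies. A hedged single-word illustration of the window bookkeeping, with plain modular arithmetic standing in for the sp_2048_mont_* calls and assuming unsigned __int128 support:

#include <stdint.h>

static uint64_t modexp_win5_sketch(uint64_t base, uint64_t exp, uint64_t mod)
{
    uint64_t t[32];
    uint64_t r = 1 % mod;
    int bits = 64;
    int i;

    t[0] = 1 % mod;                        /* t[k] = base^k mod mod          */
    for (i = 1; i < 32; i++) {
        t[i] = (uint64_t)((unsigned __int128)t[i - 1] * base % mod);
    }

    /* Skip leading zero bits so the first window is never padded. */
    while (bits > 0 && ((exp >> (bits - 1)) & 1) == 0) {
        bits--;
    }

    while (bits > 0) {
        int w = (bits >= 5) ? 5 : bits;                     /* window size   */
        uint64_t y = (exp >> (bits - w)) & ((1u << w) - 1); /* window value  */
        int j;

        for (j = 0; j < w; j++) {
            r = (uint64_t)((unsigned __int128)r * r % mod); /* w squarings   */
        }
        r = (uint64_t)((unsigned __int128)r * t[y] % mod);  /* one multiply  */
        bits -= w;
    }
    return r;
}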
sp_2048_mont_mul_32(t[49], t[25], t[24], m, mp); + sp_2048_mont_sqr_32(t[50], t[25], m, mp); + sp_2048_mont_mul_32(t[51], t[26], t[25], m, mp); + sp_2048_mont_sqr_32(t[52], t[26], m, mp); + sp_2048_mont_mul_32(t[53], t[27], t[26], m, mp); + sp_2048_mont_sqr_32(t[54], t[27], m, mp); + sp_2048_mont_mul_32(t[55], t[28], t[27], m, mp); + sp_2048_mont_sqr_32(t[56], t[28], m, mp); + sp_2048_mont_mul_32(t[57], t[29], t[28], m, mp); + sp_2048_mont_sqr_32(t[58], t[29], m, mp); + sp_2048_mont_mul_32(t[59], t[30], t[29], m, mp); + sp_2048_mont_sqr_32(t[60], t[30], m, mp); + sp_2048_mont_mul_32(t[61], t[31], t[30], m, mp); + sp_2048_mont_sqr_32(t[62], t[31], m, mp); + sp_2048_mont_mul_32(t[63], t[32], t[31], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -5050,9 +5879,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - c -= bits % 4; + c -= bits % 6; if (c == 64) { - c = 60; + c = 58; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -5072,31 +5901,33 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=4; ) { + for (; i>=0 || c>=6; ) { if (c == 0) { n = e[i--]; - y = (byte)(n >> 60); - n <<= 4; - c = 60; + y = (byte)(n >> 58); + n <<= 6; + c = 58; } - else if (c < 4) { - y = (byte)(n >> 60); + else if (c < 6) { + y = (byte)(n >> 58); n = e[i--]; - c = 4 - c; + c = 6 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; } else { - y = (byte)((n >> 60) & 0xf); - n <<= 4; - c -= 4; + y = (byte)((n >> 58) & 0x3f); + n <<= 6; + c -= 6; } sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); sp_2048_mont_mul_32(r, r, t[y], m, mp); } @@ -5167,9 +5998,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 32; r = a + 32 * 2; m = r + 32 * 2; - ah = a + 32; sp_2048_from_bin(ah, 32, in, inLen); #if DIGIT_BIT >= 64 @@ -5187,7 +6018,38 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_2048_from_mp(m, 32, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 32); + err = sp_2048_mod_32_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_32(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_32(r, r, ah, m, mp); + + for (i = 31; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_2048_sub_in_place_32(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_2048_sqr_32(r, ah); err = sp_2048_mod_32_cond(r, r, m); @@ -5215,7 +6077,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 32); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_2048_mont_sqr_32(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_2048_mont_mul_32(r, r, a, m, mp); @@ -5250,6 +6112,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. 
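The new public-exponent fast path in sp_RsaPublic_2048() relies on 0x10001 being 2^16 + 1: sixteen Montgomery squarings followed by one multiplication, where multiplying the Montgomery-form result by the untransformed base lets the final reduction cancel the Montgomery factor. A hedged single-word model of that schedule, with plain modular arithmetic standing in for the 2048-bit Montgomery operations:

#include <stdint.h>

/* a^0x10001 mod m as sixteen squarings and one multiply.  Assumes unsigned
 * __int128 is available; illustration only. */
static uint64_t pow_65537_sketch(uint64_t a, uint64_t m)
{
    unsigned __int128 r = a % m;
    int i;

    for (i = 0; i < 16; i++) {
        r = r * r % m;                   /* r = a^(2^16) mod m after the loop */
    }
    return (uint64_t)(r * (a % m) % m);  /* a^(2^16 + 1) mod m                */
}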
* m is -1 to add and 0 when not. * @@ -5261,7 +6124,6 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, static sp_digit sp_2048_cond_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -5279,78 +6141,12 @@ static sp_digit sp_2048_cond_add_16(sp_digit* r, const sp_digit* a, const sp_dig "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. 
* @@ -6123,41 +6919,74 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, */ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Convert an mp_int to an array of sp_digit. @@ -6256,15 +7085,15 @@ static void sp_3072_to_bin_48(sp_digit* r, byte* a) int i; int j = 0; - for (i = 47; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 47; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -6289,1670 +7118,355 @@ static void sp_3072_to_bin_48(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. 
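sp_3072_from_bin() is now written in assembly: REV-swapped 8-byte loads walking the input backwards, a byte-by-byte tail, then zero fill. A hedged portable model of the mapping it produces (big-endian byte string in, least-significant-word-first 64-bit array out), assuming n does not exceed size * 8:

#include <stdint.h>
#include <string.h>

static void from_bin_sketch(uint64_t* r, int size, const unsigned char* a, int n)
{
    int i;
    int j;

    memset(r, 0, (size_t)size * sizeof(*r));
    for (i = n - 1, j = 0; i >= 0; i--, j++) {
        r[j / 8] |= (uint64_t)a[i] << ((j % 8) * 8);  /* last byte is least
                                                       * significant         */
    }
}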
*/ -static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) +static void sp_3072_mul_6(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit tmp[12]; - __asm__ __volatile__ ( - "ldp x10, x11, [%[a], 0]\n\t" - "ldp x12, x13, [%[a], 16]\n\t" - "ldp x14, x15, [%[a], 32]\n\t" - "ldp x16, x17, [%[a], 48]\n\t" - "ldp x19, x20, [%[a], 64]\n\t" - "ldp x21, x22, [%[a], 80]\n\t" + "ldp x8, x9, [%[a], 0]\n\t" + "ldp x10, x11, [%[a], 16]\n\t" + "ldp x12, x13, [%[a], 32]\n\t" + "ldp x14, x15, [%[b], 0]\n\t" + "ldp x16, x17, [%[b], 16]\n\t" + "ldp x19, x20, [%[b], 32]\n\t" "# A[0] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "mul x4, x10, x9\n\t" - "umulh x5, x10, x9\n\t" - "mov x6, 0\n\t" - "str x4, [%[tmp]]\n\t" + "mul x3, x8, x14\n\t" + "umulh x4, x8, x14\n\t" + "str x3, [%[r]]\n\t" "# A[0] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x8, x15\n\t" + "umulh x7, x8, x15\n\t" + "adds x4, x4, x6\n\t" "# A[1] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[tmp], 8]\n\t" - "adc x4, x4, xzr\n\t" + "mul x6, x9, x14\n\t" + "adc x5, xzr, x7\n\t" + "umulh x7, x9, x14\n\t" + "adds x4, x4, x6\n\t" + "adcs x5, x5, x7\n\t" + "str x4, [%[r], 8]\n\t" + "adc x3, xzr, xzr\n\t" "# A[0] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x8, x16\n\t" + "umulh x7, x8, x16\n\t" + "adds x5, x5, x6\n\t" "# A[1] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x9, x15\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x9, x15\n\t" + "adc x4, xzr, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[2] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[tmp], 16]\n\t" - "adc x5, x5, xzr\n\t" + "mul x6, x10, x14\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x10, x14\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" + "adcs x3, x3, x7\n\t" + "str x5, [%[r], 16]\n\t" + "adc x4, x4, xzr\n\t" "# A[0] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x8, x17\n\t" + "umulh x7, x8, x17\n\t" + "adds x3, x3, x6\n\t" "# A[1] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x9, x16\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x9, x16\n\t" + "adc x5, xzr, xzr\n\t" + "adds x3, x3, x6\n\t" "# A[2] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x10, x15\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x10, x15\n\t" + "adc x5, x5, xzr\n\t" + "adds x3, x3, x6\n\t" "# A[3] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[tmp], 24]\n\t" - "adc x6, x6, xzr\n\t" + "mul x6, x11, x14\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x11, x14\n\t" + "adc x5, x5, xzr\n\t" + "adds x3, x3, x6\n\t" + "adcs x4, x4, x7\n\t" + "str x3, [%[r], 24]\n\t" + 
"adc x5, x5, xzr\n\t" "# A[0] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x8, x19\n\t" + "umulh x7, x8, x19\n\t" + "adds x4, x4, x6\n\t" "# A[1] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x9, x17\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x9, x17\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x6\n\t" "# A[2] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x10, x16\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x10, x16\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x6\n\t" "# A[3] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x11, x15\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x11, x15\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x6\n\t" "# A[4] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[tmp], 32]\n\t" - "adc x4, x4, xzr\n\t" + "mul x6, x12, x14\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x12, x14\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x6\n\t" + "adcs x5, x5, x7\n\t" + "str x4, [%[r], 32]\n\t" + "adc x3, x3, xzr\n\t" "# A[0] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x8, x20\n\t" + "umulh x7, x8, x20\n\t" + "adds x5, x5, x6\n\t" "# A[1] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x9, x19\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x9, x19\n\t" + "adc x4, xzr, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[2] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x10, x17\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x10, x17\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[3] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x11, x16\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x11, x16\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[4] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x12, x15\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x12, x15\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[5] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[tmp], 40]\n\t" - "adc x5, x5, xzr\n\t" - "# A[0] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x13, x14\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x13, x14\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" + "adcs x3, x3, x7\n\t" + "str x5, [%[r], 40]\n\t" + "adc x4, x4, xzr\n\t" "# A[1] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, 
x11, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x9, x20\n\t" + "umulh x7, x9, x20\n\t" + "adds x3, x3, x6\n\t" "# A[2] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x10, x19\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x10, x19\n\t" + "adc x5, xzr, xzr\n\t" + "adds x3, x3, x6\n\t" "# A[3] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x11, x17\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x11, x17\n\t" + "adc x5, x5, xzr\n\t" + "adds x3, x3, x6\n\t" "# A[4] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x12, x16\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x12, x16\n\t" + "adc x5, x5, xzr\n\t" + "adds x3, x3, x6\n\t" "# A[5] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[6] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[tmp], 48]\n\t" - "adc x6, x6, xzr\n\t" - "# A[0] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[1] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x13, x15\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x13, x15\n\t" + "adc x5, x5, xzr\n\t" + "adds x3, x3, x6\n\t" + "adcs x4, x4, x7\n\t" + "str x3, [%[r], 48]\n\t" + "adc x5, x5, xzr\n\t" "# A[2] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x10, x20\n\t" + "umulh x7, x10, x20\n\t" + "adds x4, x4, x6\n\t" "# A[3] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x11, x19\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x11, x19\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x6\n\t" "# A[4] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x12, x17\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x12, x17\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x6\n\t" "# A[5] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[6] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[7] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[tmp], 56]\n\t" - "adc x4, x4, xzr\n\t" - "# A[0] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[1] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x4, 
x4, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[2] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x13, x16\n\t" + "adcs x5, x5, x7\n\t" + "umulh x7, x13, x16\n\t" + "adc x3, x3, xzr\n\t" + "adds x4, x4, x6\n\t" + "adcs x5, x5, x7\n\t" + "str x4, [%[r], 56]\n\t" + "adc x3, x3, xzr\n\t" "# A[3] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x11, x20\n\t" + "umulh x7, x11, x20\n\t" + "adds x5, x5, x6\n\t" "# A[4] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x6, x6, x7\n\t" + "mul x6, x12, x19\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x12, x19\n\t" + "adc x4, xzr, xzr\n\t" + "adds x5, x5, x6\n\t" "# A[5] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[6] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[7] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[8] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[tmp], 64]\n\t" - "adc x5, x5, xzr\n\t" - "# A[0] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[1] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[2] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[3] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x13, x17\n\t" + "adcs x3, x3, x7\n\t" + "umulh x7, x13, x17\n\t" + "adc x4, x4, xzr\n\t" + "adds x5, x5, x6\n\t" + "adcs x3, x3, x7\n\t" + "str x5, [%[r], 64]\n\t" + "adc x4, x4, xzr\n\t" "# A[4] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x4, x4, x7\n\t" + "mul x6, x12, x20\n\t" + "umulh x7, x12, x20\n\t" + "adds x3, x3, x6\n\t" "# A[5] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[6] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[7] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[8] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[9] * 
B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[tmp], 72]\n\t" - "adc x6, x6, xzr\n\t" - "# A[0] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[1] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[2] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[3] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[4] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x5, x5, x7\n\t" + "mul x6, x13, x19\n\t" + "adcs x4, x4, x7\n\t" + "umulh x7, x13, x19\n\t" + "adc x5, xzr, xzr\n\t" + "adds x3, x3, x6\n\t" + "adcs x4, x4, x7\n\t" + "str x3, [%[r], 72]\n\t" + "adc x5, x5, xzr\n\t" "# A[5] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[6] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[7] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[8] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[9] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[10] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[tmp], 80]\n\t" - "adc x4, x4, xzr\n\t" - "# A[0] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x10, x9\n\t" - "umulh x8, x10, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[1] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x11, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x11, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[2] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[3] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[4] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[5] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[6] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[7] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs 
x4, x4, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[8] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[9] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[10] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[11] * B[0]\n\t" - "ldr x9, [%[b], 0]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[tmp], 88]\n\t" - "adc x5, x5, xzr\n\t" - "# A[1] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x11, x9\n\t" - "umulh x8, x11, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[2] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x12, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x12, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[3] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[4] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[5] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[6] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[7] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[8] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[9] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[10] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[11] * B[1]\n\t" - "ldr x9, [%[b], 8]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[r], 96]\n\t" - "adc x6, x6, xzr\n\t" - "# A[2] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x12, x9\n\t" - "umulh x8, x12, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[3] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x13, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x13, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[4] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[5] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[6] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x4, x4, xzr\n\t" - 
"umulh x8, x16, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[7] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[8] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[9] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[10] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[11] * B[2]\n\t" - "ldr x9, [%[b], 16]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[r], 104]\n\t" - "adc x4, x4, xzr\n\t" - "# A[3] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x13, x9\n\t" - "umulh x8, x13, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[4] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x14, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x14, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[5] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[6] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[7] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[8] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[9] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[10] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[11] * B[3]\n\t" - "ldr x9, [%[b], 24]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[r], 112]\n\t" - "adc x5, x5, xzr\n\t" - "# A[4] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[5] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x15, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[6] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[7] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[8] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[9] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[10] * 
B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[11] * B[4]\n\t" - "ldr x9, [%[b], 32]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[r], 120]\n\t" - "adc x6, x6, xzr\n\t" - "# A[5] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x15, x9\n\t" - "umulh x8, x15, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[6] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x16, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x16, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[7] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[8] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[9] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[10] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[11] * B[5]\n\t" - "ldr x9, [%[b], 40]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[r], 128]\n\t" - "adc x4, x4, xzr\n\t" - "# A[6] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x16, x9\n\t" - "umulh x8, x16, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[7] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x17, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x17, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[8] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[9] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[10] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[11] * B[6]\n\t" - "ldr x9, [%[b], 48]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[r], 136]\n\t" - "adc x5, x5, xzr\n\t" - "# A[7] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x17, x9\n\t" - "umulh x8, x17, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[8] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x19, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x19, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[9] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[10] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[11] * B[7]\n\t" - "ldr x9, [%[b], 56]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[r], 
144]\n\t" - "adc x6, x6, xzr\n\t" - "# A[8] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x19, x9\n\t" - "umulh x8, x19, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[9] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x20, x9\n\t" - "adc x4, xzr, xzr\n\t" - "umulh x8, x20, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[10] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x5, x5, x7\n\t" - "# A[11] * B[8]\n\t" - "ldr x9, [%[b], 64]\n\t" - "adcs x6, x6, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x4, x4, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x5, x5, x7\n\t" - "adcs x6, x6, x8\n\t" - "str x5, [%[r], 152]\n\t" - "adc x4, x4, xzr\n\t" - "# A[9] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x20, x9\n\t" - "umulh x8, x20, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[10] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x21, x9\n\t" - "adc x5, xzr, xzr\n\t" - "umulh x8, x21, x9\n\t" - "adds x6, x6, x7\n\t" - "# A[11] * B[9]\n\t" - "ldr x9, [%[b], 72]\n\t" - "adcs x4, x4, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x4, x4, x8\n\t" - "str x6, [%[r], 160]\n\t" - "adc x5, x5, xzr\n\t" - "# A[10] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x21, x9\n\t" - "umulh x8, x21, x9\n\t" - "adds x4, x4, x7\n\t" - "# A[11] * B[10]\n\t" - "ldr x9, [%[b], 80]\n\t" - "adcs x5, x5, x8\n\t" - "mul x7, x22, x9\n\t" - "adc x6, xzr, xzr\n\t" - "umulh x8, x22, x9\n\t" - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "str x4, [%[r], 168]\n\t" - "adc x6, x6, xzr\n\t" - "# A[11] * B[11]\n\t" - "ldr x9, [%[b], 88]\n\t" - "mul x7, x22, x9\n\t" - "umulh x8, x22, x9\n\t" - "adds x5, x5, x7\n\t" - "adc x6, x6, x8\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x10, x11, [%[tmp], 0]\n\t" - "ldp x12, x13, [%[tmp], 16]\n\t" - "ldp x14, x15, [%[tmp], 32]\n\t" - "ldp x16, x17, [%[tmp], 48]\n\t" - "ldp x19, x20, [%[tmp], 64]\n\t" - "ldp x21, x22, [%[tmp], 80]\n\t" - "stp x10, x11, [%[r], 0]\n\t" - "stp x12, x13, [%[r], 16]\n\t" - "stp x14, x15, [%[r], 32]\n\t" - "stp x16, x17, [%[r], 48]\n\t" - "stp x19, x20, [%[r], 64]\n\t" - "stp x21, x22, [%[r], 80]\n\t" + "mul x6, x13, x20\n\t" + "umulh x7, x13, x20\n\t" + "adds x4, x4, x6\n\t" + "adc x5, x5, x7\n\t" + "stp x4, x5, [%[r], 80]\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20" ); } -/* Square a and put result in r. (r = a * a) +/* Add b to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. + * b A single precision integer. 
*/ -static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) +static sp_digit sp_3072_add_6(sp_digit* r, const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( - "ldp x10, x11, [%[a], 0]\n\t" - "ldp x12, x13, [%[a], 16]\n\t" - "ldp x14, x15, [%[a], 32]\n\t" - "ldp x16, x17, [%[a], 48]\n\t" - "ldp x19, x20, [%[a], 64]\n\t" - "ldp x21, x22, [%[a], 80]\n\t" - "# A[0] * A[0]\n\t" - "mul x2, x10, x10\n\t" - "umulh x3, x10, x10\n\t" - "str x2, [%[r]]\n\t" - "mov x4, 0\n\t" - "# A[0] * A[1]\n\t" - "mul x8, x10, x11\n\t" - "umulh x9, x10, x11\n\t" - "adds x3, x3, x8\n\t" - "adcs x4, x4, x9\n\t" - "adc x2, xzr, xzr\n\t" - "adds x3, x3, x8\n\t" - "str x3, [%[r], 8]\n\t" - "# A[0] * A[2]\n\t" - "mul x8, x10, x12\n\t" - "adcs x4, x4, x9\n\t" - "umulh x9, x10, x12\n\t" - "adc x2, x2, xzr\n\t" - "adds x4, x4, x8\n\t" - "adcs x2, x2, x9\n\t" - "adc x3, xzr, xzr\n\t" - "adds x4, x4, x8\n\t" - "# A[1] * A[1]\n\t" - "mul x8, x11, x11\n\t" - "adcs x2, x2, x9\n\t" - "umulh x9, x11, x11\n\t" - "adc x3, x3, xzr\n\t" - "adds x4, x4, x8\n\t" - "str x4, [%[r], 16]\n\t" - "# A[0] * A[3]\n\t" - "mul x8, x10, x13\n\t" - "adcs x2, x2, x9\n\t" - "umulh x9, x10, x13\n\t" - "adc x3, x3, xzr\n\t" - "adds x2, x2, x8\n\t" - "adcs x3, x3, x9\n\t" - "adc x4, xzr, xzr\n\t" - "adds x2, x2, x8\n\t" - "# A[1] * A[2]\n\t" - "mul x8, x11, x12\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x11, x12\n\t" - "adc x4, x4, xzr\n\t" - "adds x2, x2, x8\n\t" - "adcs x3, x3, x9\n\t" - "adc x4, x4, xzr\n\t" - "adds x2, x2, x8\n\t" - "str x2, [%[r], 24]\n\t" - "# A[0] * A[4]\n\t" - "mul x8, x10, x14\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x10, x14\n\t" - "adc x4, x4, xzr\n\t" - "adds x3, x3, x8\n\t" - "adcs x4, x4, x9\n\t" - "adc x2, xzr, xzr\n\t" - "adds x3, x3, x8\n\t" - "# A[1] * A[3]\n\t" - "mul x8, x11, x13\n\t" - "adcs x4, x4, x9\n\t" - "umulh x9, x11, x13\n\t" - "adc x2, x2, xzr\n\t" - "adds x3, x3, x8\n\t" - "adcs x4, x4, x9\n\t" - "adc x2, x2, xzr\n\t" - "adds x3, x3, x8\n\t" - "# A[2] * A[2]\n\t" - "mul x8, x12, x12\n\t" - "adcs x4, x4, x9\n\t" - "umulh x9, x12, x12\n\t" - "adc x2, x2, xzr\n\t" - "adds x3, x3, x8\n\t" + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "adds x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "adcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "adcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldr x3, [%[a], 32]\n\t" + "ldr x4, [%[a], 40]\n\t" + "ldr x7, [%[b], 32]\n\t" + "ldr x8, [%[b], 40]\n\t" + "adcs x3, x3, x7\n\t" + "adcs x4, x4, x8\n\t" "str x3, [%[r], 32]\n\t" - "# A[0] * A[5]\n\t" - "mul x5, x10, x15\n\t" - "adcs x4, x4, x9\n\t" - "umulh x6, x10, x15\n\t" - "adc x2, x2, xzr\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[4]\n\t" - "mul x8, x11, x14\n\t" - "umulh x9, x11, x14\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[3]\n\t" - "mul x8, x12, x13\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x13\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x4, x4, x5\n\t" - "adcs x2, x2, x6\n\t" - "adc x3, x3, x7\n\t" "str x4, [%[r], 40]\n\t" - "# A[0] * A[6]\n\t" - "mul x5, x10, x16\n\t" - "umulh x6, x10, x16\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[5]\n\t" - "mul x8, x11, x15\n\t" - "umulh x9, x11, x15\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[4]\n\t" - "mul x8, x12, x14\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x14\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - 
"# A[3] * A[3]\n\t" - "mul x8, x13, x13\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x13\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x2, x2, x5\n\t" - "adcs x3, x3, x6\n\t" - "adc x4, x4, x7\n\t" - "str x2, [%[r], 48]\n\t" - "# A[0] * A[7]\n\t" - "mul x5, x10, x17\n\t" - "umulh x6, x10, x17\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[6]\n\t" - "mul x8, x11, x16\n\t" - "umulh x9, x11, x16\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[5]\n\t" - "mul x8, x12, x15\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x15\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[4]\n\t" - "mul x8, x13, x14\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x14\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x3, x3, x5\n\t" - "adcs x4, x4, x6\n\t" - "adc x2, x2, x7\n\t" - "str x3, [%[r], 56]\n\t" - "# A[0] * A[8]\n\t" - "mul x5, x10, x19\n\t" - "umulh x6, x10, x19\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[7]\n\t" - "mul x8, x11, x17\n\t" - "umulh x9, x11, x17\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[6]\n\t" - "mul x8, x12, x16\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x16\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[5]\n\t" - "mul x8, x13, x15\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x15\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[4]\n\t" - "mul x8, x14, x14\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x14\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x4, x4, x5\n\t" - "adcs x2, x2, x6\n\t" - "adc x3, x3, x7\n\t" - "str x4, [%[r], 64]\n\t" - "# A[0] * A[9]\n\t" - "mul x5, x10, x20\n\t" - "umulh x6, x10, x20\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[8]\n\t" - "mul x8, x11, x19\n\t" - "umulh x9, x11, x19\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[7]\n\t" - "mul x8, x12, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[6]\n\t" - "mul x8, x13, x16\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x16\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[5]\n\t" - "mul x8, x14, x15\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x15\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x2, x2, x5\n\t" - "adcs x3, x3, x6\n\t" - "adc x4, x4, x7\n\t" - "str x2, [%[r], 72]\n\t" - "# A[0] * A[10]\n\t" - "mul x5, x10, x21\n\t" - "umulh x6, x10, x21\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[9]\n\t" - "mul x8, x11, x20\n\t" - "umulh x9, x11, x20\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[8]\n\t" - "mul x8, x12, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[7]\n\t" - "mul x8, x13, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[6]\n\t" - "mul x8, x14, x16\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x16\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[5] * A[5]\n\t" - "mul x8, x15, x15\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x15, x15\n\t" - 
"adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x3, x3, x5\n\t" - "adcs x4, x4, x6\n\t" - "adc x2, x2, x7\n\t" - "str x3, [%[r], 80]\n\t" - "# A[0] * A[11]\n\t" - "mul x5, x10, x22\n\t" - "umulh x6, x10, x22\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" - "# A[1] * A[10]\n\t" - "mul x8, x11, x21\n\t" - "umulh x9, x11, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[2] * A[9]\n\t" - "mul x8, x12, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x12, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[8]\n\t" - "mul x8, x13, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[7]\n\t" - "mul x8, x14, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[5] * A[6]\n\t" - "mul x8, x15, x16\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x15, x16\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x4, x4, x5\n\t" - "adcs x2, x2, x6\n\t" - "adc x3, x3, x7\n\t" - "str x4, [%[r], 88]\n\t" - "# A[1] * A[11]\n\t" - "mul x5, x11, x22\n\t" - "umulh x6, x11, x22\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" - "# A[2] * A[10]\n\t" - "mul x8, x12, x21\n\t" - "umulh x9, x12, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[3] * A[9]\n\t" - "mul x8, x13, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x13, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[8]\n\t" - "mul x8, x14, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[5] * A[7]\n\t" - "mul x8, x15, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x15, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[6] * A[6]\n\t" - "mul x8, x16, x16\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x16, x16\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x2, x2, x5\n\t" - "adcs x3, x3, x6\n\t" - "adc x4, x4, x7\n\t" - "str x2, [%[r], 96]\n\t" - "# A[2] * A[11]\n\t" - "mul x5, x12, x22\n\t" - "umulh x6, x12, x22\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" - "# A[3] * A[10]\n\t" - "mul x8, x13, x21\n\t" - "umulh x9, x13, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[4] * A[9]\n\t" - "mul x8, x14, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x14, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[5] * A[8]\n\t" - "mul x8, x15, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x15, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[6] * A[7]\n\t" - "mul x8, x16, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x16, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x3, x3, x5\n\t" - "adcs x4, x4, x6\n\t" - "adc x2, x2, x7\n\t" - "str x3, [%[r], 104]\n\t" - "# A[3] * A[11]\n\t" - "mul x5, x13, x22\n\t" - "umulh x6, x13, x22\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" - "# A[4] * A[10]\n\t" - "mul x8, x14, x21\n\t" - "umulh x9, x14, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[5] * A[9]\n\t" - "mul x8, x15, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x15, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[6] * A[8]\n\t" - "mul x8, x16, x19\n\t" 
- "adcs x6, x6, x9\n\t" - "umulh x9, x16, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[7] * A[7]\n\t" - "mul x8, x17, x17\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x17, x17\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x4, x4, x5\n\t" - "adcs x2, x2, x6\n\t" - "adc x3, x3, x7\n\t" - "str x4, [%[r], 112]\n\t" - "# A[4] * A[11]\n\t" - "mul x5, x14, x22\n\t" - "umulh x6, x14, x22\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" - "# A[5] * A[10]\n\t" - "mul x8, x15, x21\n\t" - "umulh x9, x15, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[6] * A[9]\n\t" - "mul x8, x16, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x16, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[7] * A[8]\n\t" - "mul x8, x17, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x17, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x2, x2, x5\n\t" - "adcs x3, x3, x6\n\t" - "adc x4, x4, x7\n\t" - "str x2, [%[r], 120]\n\t" - "# A[5] * A[11]\n\t" - "mul x5, x15, x22\n\t" - "umulh x6, x15, x22\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" - "# A[6] * A[10]\n\t" - "mul x8, x16, x21\n\t" - "umulh x9, x16, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[7] * A[9]\n\t" - "mul x8, x17, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x17, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "# A[8] * A[8]\n\t" - "mul x8, x19, x19\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x19, x19\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x3, x3, x5\n\t" - "adcs x4, x4, x6\n\t" - "adc x2, x2, x7\n\t" - "str x3, [%[r], 128]\n\t" - "# A[6] * A[11]\n\t" - "mul x5, x16, x22\n\t" - "umulh x6, x16, x22\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" - "# A[7] * A[10]\n\t" - "mul x8, x17, x21\n\t" - "umulh x9, x17, x21\n\t" - "adds x5, x5, x8\n\t" - "# A[8] * A[9]\n\t" - "mul x8, x19, x20\n\t" - "adcs x6, x6, x9\n\t" - "umulh x9, x19, x20\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x7, x7, xzr\n\t" - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "adds x4, x4, x5\n\t" - "adcs x2, x2, x6\n\t" - "adc x3, x3, x7\n\t" - "str x4, [%[r], 136]\n\t" - "# A[7] * A[11]\n\t" - "mul x8, x17, x22\n\t" - "umulh x9, x17, x22\n\t" - "adds x2, x2, x8\n\t" - "adcs x3, x3, x9\n\t" - "adc x4, xzr, xzr\n\t" - "adds x2, x2, x8\n\t" - "# A[8] * A[10]\n\t" - "mul x8, x19, x21\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x19, x21\n\t" - "adc x4, x4, xzr\n\t" - "adds x2, x2, x8\n\t" - "adcs x3, x3, x9\n\t" - "adc x4, x4, xzr\n\t" - "adds x2, x2, x8\n\t" - "# A[9] * A[9]\n\t" - "mul x8, x20, x20\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x20, x20\n\t" - "adc x4, x4, xzr\n\t" - "adds x2, x2, x8\n\t" - "str x2, [%[r], 144]\n\t" - "# A[8] * A[11]\n\t" - "mul x8, x19, x22\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x19, x22\n\t" - "adc x4, x4, xzr\n\t" - "adds x3, x3, x8\n\t" - "adcs x4, x4, x9\n\t" - "adc x2, xzr, xzr\n\t" - "adds x3, x3, x8\n\t" - "# A[9] * A[10]\n\t" - "mul x8, x20, x21\n\t" - "adcs x4, x4, x9\n\t" - "umulh x9, x20, x21\n\t" - "adc x2, x2, xzr\n\t" - "adds x3, x3, x8\n\t" - "adcs x4, x4, x9\n\t" - "adc x2, x2, xzr\n\t" - "adds x3, x3, x8\n\t" - "str x3, [%[r], 152]\n\t" - "# A[9] * A[11]\n\t" - "mul x8, x20, 
x22\n\t" - "adcs x4, x4, x9\n\t" - "umulh x9, x20, x22\n\t" - "adc x2, x2, xzr\n\t" - "adds x4, x4, x8\n\t" - "adcs x2, x2, x9\n\t" - "adc x3, xzr, xzr\n\t" - "adds x4, x4, x8\n\t" - "# A[10] * A[10]\n\t" - "mul x8, x21, x21\n\t" - "adcs x2, x2, x9\n\t" - "umulh x9, x21, x21\n\t" - "adc x3, x3, xzr\n\t" - "adds x4, x4, x8\n\t" - "str x4, [%[r], 160]\n\t" - "# A[10] * A[11]\n\t" - "mul x8, x21, x22\n\t" - "adcs x2, x2, x9\n\t" - "umulh x9, x21, x22\n\t" - "adc x3, x3, xzr\n\t" - "adds x2, x2, x8\n\t" - "adcs x3, x3, x9\n\t" - "adc x4, xzr, xzr\n\t" - "adds x2, x2, x8\n\t" - "str x2, [%[r], 168]\n\t" - "# A[11] * A[11]\n\t" - "mul x8, x22, x22\n\t" - "adcs x3, x3, x9\n\t" - "umulh x9, x22, x22\n\t" - "adc x4, x4, xzr\n\t" - "adds x3, x3, x8\n\t" - "adc x4, x4, x9\n\t" - "stp x3, x4, [%[r], 176]\n\t" - : - : [r] "r" (r), [a] "r" (a) - : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); + + return (sp_digit)r; +} + +/* Add digit to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_3072_add_word_6(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldr x3, [%[a], 32]\n\t" + "ldr x4, [%[a], 40]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "str x3, [%[r], 32]\n\t" + "str x4, [%[r], 40]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6" + ); +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_in_place_12(sp_digit* a, const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldp x2, x3, [%[a], 0]\n\t" + "ldp x6, x7, [%[b], 0]\n\t" + "subs x2, x2, x6\n\t" + "ldp x4, x5, [%[a], 16]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "stp x2, x3, [%[a], 0]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x4, x5, [%[a], 16]\n\t" + "ldp x2, x3, [%[a], 32]\n\t" + "ldp x6, x7, [%[b], 32]\n\t" + "sbcs x2, x2, x6\n\t" + "ldp x4, x5, [%[a], 48]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "stp x2, x3, [%[a], 32]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x4, x5, [%[a], 48]\n\t" + "ldp x2, x3, [%[a], 64]\n\t" + "ldp x6, x7, [%[b], 64]\n\t" + "sbcs x2, x2, x6\n\t" + "ldp x4, x5, [%[a], 80]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "stp x2, x3, [%[a], 64]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x4, x5, [%[a], 80]\n\t" + "csetm %[a], cc\n\t" + : [a] "+r" (a) + : [b] "r" (b) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + ); + + return (sp_digit)a; } /* Add b to a into r. (r = a + b) @@ -8004,6 +7518,126 @@ static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, return (sp_digit)r; } +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static sp_digit sp_3072_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + __asm__ __volatile__ ( + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); + + return (sp_digit)r; +} +#endif /* !WOLFSSL_SP_SMALL */ + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[12]; + sp_digit a1[6]; + sp_digit b1[6]; + sp_digit* z2 = r + 12; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_6(a1, a, &a[6]); + cb = sp_3072_add_6(b1, b, &b[6]); + u = ca & cb; + + sp_3072_mul_6(z2, &a[6], &b[6]); + sp_3072_mul_6(z0, a, b); + sp_3072_mul_6(z1, a1, b1); + + u += sp_3072_sub_in_place_12(z1, z0); + u += sp_3072_sub_in_place_12(z1, z2); + u += sp_3072_cond_add_6(z1 + 6, z1 + 6, a1, 0 - cb); + u += sp_3072_cond_add_6(z1 + 6, z1 + 6, b1, 0 - ca); + + u += sp_3072_add_12(r + 6, r + 6, z1); + (void)sp_3072_add_word_6(r + 18, r + 18, u); +} + +/* Add digit to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_3072_add_word_12(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 48]\n\t" + "ldp x3, x4, [%[a], 64]\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 64]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 80]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6" + ); +} + /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -8160,34 +7794,105 @@ static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* AND m into each word of a and store in r. +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
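/*
 * (Illustrative note, not part of the patch.)  The reworked Karatsuba in
 * sp_3072_mul_12 above splits A = a_lo + a_hi*2^384 and B likewise and uses
 *     A*B = z0 + (A1*B1 - z0 - z2)*2^384 + z2*2^768
 * with z0 = a_lo*b_lo, z2 = a_hi*b_hi, A1 = a_lo + a_hi = a1 + ca*2^384 and
 * B1 = b_lo + b_hi = b1 + cb*2^384.  Only the 6-word a1 and b1 are multiplied;
 * the cross terms cb*a1 and ca*b1 are supplied by the two conditional adds
 * into the upper half of z1, and u starts as ca & cb (the ca*cb*2^768 part of
 * A1*B1) before collecting the borrows and carries that are finally folded in
 * at r + 18.  A portable sketch of the conditional add itself (invented name,
 * word count passed as a parameter) might look like:
 */
#include <stdint.h>

static uint64_t cond_add_ref(uint64_t* r, const uint64_t* a,
                             const uint64_t* b, uint64_t m, int n)
{
    uint64_t c = 0;     /* running carry, 0 or 1 */
    int i;

    for (i = 0; i < n; i++) {
        uint64_t t = a[i] + (b[i] & m);  /* m is 0 or all ones: adds b[i] or 0 */
        uint64_t c1 = (t < a[i]);        /* carry from the masked add */
        r[i] = t + c;
        c = c1 + (r[i] < t);             /* at most one of the two carries is set */
    }
    return c;
}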
+ */ +static sp_digit sp_3072_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + __asm__ __volatile__ ( + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "ldp x10, x11, [%[b], 80]\n\t" + "ldp x4, x5, [%[a], 64]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 64]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 80]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); + + return (sp_digit)r; +} +#endif /* !WOLFSSL_SP_SMALL */ + +/* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. - * m Mask to AND against each digit. + * b A single precision integer. */ -static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) { -#ifdef WOLFSSL_SP_SMALL - int i; + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit* z2 = r + 24; + sp_digit u; + sp_digit ca; + sp_digit cb; - for (i=0; i<12; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; - r[8] = a[8] & m; - r[9] = a[9] & m; - r[10] = a[10] & m; - r[11] = a[11] & m; -#endif + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mul_12(z1, a1, b1); + + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_cond_add_12(z1 + 12, z1 + 12, a1, 0 - cb); + u += sp_3072_cond_add_12(z1 + 12, z1 + 12, b1, 0 - ca); + + u += sp_3072_add_24(r + 12, r + 12, z1); + (void)sp_3072_add_word_12(r + 36, r + 36, u); } /* Add digit to a into r. (r = a + b) @@ -8196,13 +7901,13 @@ static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. 
*/ -static void sp_3072_add_zero_12(sp_digit* r, const sp_digit* a, - const sp_digit d) +static void sp_3072_add_word_24(sp_digit* r, const sp_digit* a, + sp_digit b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" + "adds x3, x3, %[b]\n\t" "adcs x4, x4, xzr\n\t" "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 0]\n\t" @@ -8224,149 +7929,36 @@ static void sp_3072_add_zero_12(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 64]\n\t" "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 80]\n\t" + "ldp x3, x4, [%[a], 96]\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 96]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 112]\n\t" + "ldp x3, x4, [%[a], 128]\n\t" + "ldp x5, x6, [%[a], 144]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 128]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 144]\n\t" + "ldp x3, x4, [%[a], 160]\n\t" + "ldp x5, x6, [%[a], 176]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 160]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 176]\n\t" : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6" ); } -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[24]; - sp_digit a1[12]; - sp_digit b1[12]; - sp_digit z2[24]; - sp_digit u, ca, cb; - - ca = sp_3072_add_12(a1, a, &a[12]); - cb = sp_3072_add_12(b1, b, &b[12]); - u = ca & cb; - sp_3072_mul_12(z1, a1, b1); - sp_3072_mul_12(z2, &a[12], &b[12]); - sp_3072_mul_12(z0, a, b); - sp_3072_mask_12(r + 24, a1, 0 - cb); - sp_3072_mask_12(b1, b1, 0 - ca); - u += sp_3072_add_12(r + 24, r + 24, b1); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - u += sp_3072_add_12(r + 24, r + 24, z2); - sp_3072_add_zero_12(r + 36, z2 + 12, u); -} - -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -static sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 96\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x3", "x4", "x5", "x6", "x11" - ); - - return c; -} - -#else -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -static sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 80]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 88]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6" - ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[24]; - sp_digit z1[24]; - sp_digit a1[12]; - sp_digit u; - - u = sp_3072_add_12(a1, a, &a[12]); - sp_3072_sqr_12(z1, a1); - sp_3072_sqr_12(z2, &a[12]); - sp_3072_sqr_12(z0, a); - sp_3072_mask_12(r + 24, a1, 0 - u); - u += sp_3072_dbl_12(r + 24, r + 24); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - u += sp_3072_add_12(r + 24, r + 24, z2); - sp_3072_add_zero_12(r + 36, z2 + 12, u); -} - /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -8643,99 +8235,113 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* AND m into each word of a and store in r. +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. */ -static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<24; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 24; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Add digit to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
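The mask-and-add pattern (sp_3072_mask_12/_24 followed by a separate add) is replaced by the sp_3072_cond_add_12/_24 routines: each digit of b is ANDed with the mask before being added with carry, and the final carry is returned. A C model of that behaviour (illustrative names, not part of the patch):

    #include <stdint.h>

    /* Conditionally add b to a: m is all ones to add, 0 to leave a unchanged.
     * n is 12 or 24 in the fixed-size assembly versions. */
    static uint64_t cond_add(uint64_t* r, const uint64_t* a, const uint64_t* b,
                             uint64_t m, int n)
    {
        uint64_t c = 0;
        int i;

        for (i = 0; i < n; i++) {
            uint64_t bm = b[i] & m;     /* masked addend                     */
            uint64_t t  = a[i] + bm;
            uint64_t c1 = (t < bm);     /* carry from a[i] + (b[i] & m)      */
            r[i] = t + c;
            c = c1 + (r[i] < t);        /* carry out of this digit (0 or 1)  */
        }
        return c;
    }

In the Karatsuba callers m is always 0 - ca or 0 - cb, so the addition takes effect exactly when the corresponding half-sum produced a carry.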
- */ -static void sp_3072_add_zero_24(sp_digit* r, const sp_digit* a, - const sp_digit d) +static sp_digit sp_3072_cond_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) { __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 176]\n\t" - : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) - : "memory", "x3", "x4", "x5", "x6" + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "ldp x10, x11, [%[b], 80]\n\t" + "ldp x4, x5, [%[a], 64]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 64]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 80]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "ldp x10, x11, [%[b], 112]\n\t" + "ldp x4, x5, [%[a], 96]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 96]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 112]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "ldp x10, x11, [%[b], 144]\n\t" + "ldp x4, x5, [%[a], 128]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 128]\n\t" 
+ "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 144]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "ldp x10, x11, [%[b], 176]\n\t" + "ldp x4, x5, [%[a], 160]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 160]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 176]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); + + return (sp_digit)r; } +#endif /* !WOLFSSL_SP_SMALL */ /* Multiply a and b into r. (r = a * b) * @@ -8750,131 +8356,2788 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, sp_digit z1[48]; sp_digit a1[24]; sp_digit b1[24]; - sp_digit z2[48]; - sp_digit u, ca, cb; + sp_digit* z2 = r + 48; + sp_digit u; + sp_digit ca; + sp_digit cb; ca = sp_3072_add_24(a1, a, &a[24]); cb = sp_3072_add_24(b1, b, &b[24]); u = ca & cb; - sp_3072_mul_24(z1, a1, b1); + sp_3072_mul_24(z2, &a[24], &b[24]); sp_3072_mul_24(z0, a, b); - sp_3072_mask_24(r + 48, a1, 0 - cb); - sp_3072_mask_24(b1, b1, 0 - ca); - u += sp_3072_add_24(r + 48, r + 48, b1); - u += sp_3072_sub_in_place_48(z1, z2); + sp_3072_mul_24(z1, a1, b1); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_cond_add_24(z1 + 24, z1 + 24, a1, 0 - cb); + u += sp_3072_cond_add_24(z1 + 24, z1 + 24, b1, 0 - ca); + u += sp_3072_add_48(r + 24, r + 24, z1); - u += sp_3072_add_24(r + 48, r + 48, z2); - sp_3072_add_zero_24(r + 72, z2 + 24, u); + (void)sp_3072_add_word_24(r + 72, r + 72, u); } -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. (r = a + a) +/* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
*/ -static sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a) +static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { - sp_digit c = 0; + sp_digit tmp[24]; __asm__ __volatile__ ( - "add x11, %[a], 192\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" + "# A[0] * A[0]\n\t" + "ldr x9, [%[a], 0]\n\t" + "mul x8, x9, x9\n\t" + "umulh x3, x9, x9\n\t" + "mov x4, xzr\n\t" + "str x8, [%[tmp]]\n\t" + "# A[0] * A[1]\n\t" + "ldr x9, [%[a], 8]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "str x3, [%[tmp], 8]\n\t" + "# A[0] * A[2]\n\t" + "ldr x9, [%[a], 16]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, x3, xzr\n\t" + "# A[1] * A[1]\n\t" + "ldr x9, [%[a], 8]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, x3, xzr\n\t" + "str x4, [%[tmp], 16]\n\t" + "# A[0] * A[3]\n\t" + "ldr x9, [%[a], 24]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, xzr, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "# A[1] * A[2]\n\t" + "ldr x9, [%[a], 16]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "str x2, [%[tmp], 24]\n\t" + "# A[0] * A[4]\n\t" + "ldr x9, [%[a], 32]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "# A[1] * A[3]\n\t" + "ldr x9, [%[a], 24]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "# A[2] * A[2]\n\t" + "ldr x9, [%[a], 16]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "str x3, [%[tmp], 32]\n\t" + "# A[0] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[4]\n\t" + "ldr x9, [%[a], 32]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[3]\n\t" + "ldr x9, [%[a], 24]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 40]\n\t" + "# A[0] * A[6]\n\t" + "ldr x9, [%[a], 
48]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[4]\n\t" + "ldr x9, [%[a], 32]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[3]\n\t" + "ldr x9, [%[a], 24]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 48]\n\t" + "# A[0] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[4]\n\t" + "ldr x9, [%[a], 32]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[tmp], 56]\n\t" + "# A[0] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[4]\n\t" + "ldr x9, [%[a], 32]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 64]\n\t" + "# A[0] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, 
x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 72]\n\t" + "# A[0] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[5]\n\t" + "ldr x9, [%[a], 40]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[tmp], 80]\n\t" + "# A[0] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 88]\n\t" + "# A[0] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 
16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[6]\n\t" + "ldr x9, [%[a], 48]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 96]\n\t" + "# A[0] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[tmp], 104]\n\t" + "# A[0] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc 
x7, x7, xzr\n\t" + "# A[5] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[7]\n\t" + "ldr x9, [%[a], 56]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 112]\n\t" + "# A[0] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 120]\n\t" + "# A[0] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, 
x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[8]\n\t" + "ldr x9, [%[a], 64]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[tmp], 128]\n\t" + "# A[0] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 136]\n\t" + "# A[0] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, 
xzr\n\t" + "# A[4] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[9]\n\t" + "ldr x9, [%[a], 72]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 144]\n\t" + "# A[0] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[tmp], 152]\n\t" + "# A[0] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr 
x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[10]\n\t" + "ldr x9, [%[a], 80]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 160]\n\t" + "# A[0] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, 
x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[tmp], 168]\n\t" + "# A[0] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[11]\n\t" + "ldr x9, [%[a], 88]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str 
x3, [%[tmp], 176]\n\t" + "# A[0] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 0]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[1] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[2] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[tmp], 184]\n\t" + "# A[1] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 8]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[2] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[3] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# 
A[6] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[12]\n\t" + "ldr x9, [%[a], 96]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 192]\n\t" + "# A[2] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 16]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[3] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[4] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[13]\n\t" + "ldr 
x9, [%[a], 104]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 200]\n\t" + "# A[3] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[4] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[5] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[13]\n\t" + "ldr x9, [%[a], 104]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 208]\n\t" + "# A[4] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 32]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[5] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[6] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, 
x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 216]\n\t" + "# A[5] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 40]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[6] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[7] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[14]\n\t" + "ldr x9, [%[a], 112]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 224]\n\t" + "# A[6] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 48]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, 
xzr\n\t" + "# A[7] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[8] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 232]\n\t" + "# A[7] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 56]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[8] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[9] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[15]\n\t" + "ldr x9, [%[a], 120]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + 
"adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 240]\n\t" + "# A[8] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 64]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[9] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[10] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 248]\n\t" + "# A[9] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 72]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[10] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[11] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[16]\n\t" + "ldr x9, [%[a], 128]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + 
"adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 256]\n\t" + "# A[10] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 80]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[11] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[12] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 264]\n\t" + "# A[11] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 88]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[12] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[13] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[14] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[17] * A[17]\n\t" + "ldr x9, [%[a], 136]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 272]\n\t" + "# A[12] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 96]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[13] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + 
"# A[14] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[17] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 280]\n\t" + "# A[13] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 104]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[14] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[15] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[17] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[18] * A[18]\n\t" + "ldr x9, [%[a], 144]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 288]\n\t" + "# A[14] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 112]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[15] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[16] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[17] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[18] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "ldr x10, [%[a], 144]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 296]\n\t" + "# A[15] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 120]\n\t" + "mul x5, x9, 
x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[16] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[17] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[18] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 144]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[19] * A[19]\n\t" + "ldr x9, [%[a], 152]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x4, x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 304]\n\t" + "# A[16] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 128]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" + "# A[17] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[18] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 144]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[19] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "ldr x10, [%[a], 152]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x2, x2, x5\n\t" + "adcs x3, x3, x6\n\t" + "adc x4, x4, x7\n\t" + "str x2, [%[r], 312]\n\t" + "# A[17] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 136]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" + "# A[18] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 144]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[19] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 152]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[20] * A[20]\n\t" + "ldr x9, [%[a], 160]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x3, x3, x5\n\t" + "adcs x4, x4, x6\n\t" + "adc x2, x2, x7\n\t" + "str x3, [%[r], 320]\n\t" + "# A[18] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 144]\n\t" + "mul x5, x9, x10\n\t" + "umulh x6, x9, x10\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" + "# A[19] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 152]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "# A[20] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "ldr x10, [%[a], 160]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, xzr\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "adds x4, 
x4, x5\n\t" + "adcs x2, x2, x6\n\t" + "adc x3, x3, x7\n\t" + "str x4, [%[r], 328]\n\t" + "# A[19] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 152]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, xzr, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "# A[20] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 160]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "# A[21] * A[21]\n\t" + "ldr x9, [%[a], 168]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "str x2, [%[r], 336]\n\t" + "# A[20] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 160]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, xzr, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "# A[21] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "ldr x10, [%[a], 168]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "adds x3, x3, x8\n\t" + "adcs x4, x4, x9\n\t" + "adc x2, x2, xzr\n\t" + "str x3, [%[r], 344]\n\t" + "# A[21] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 168]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, xzr, xzr\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, x3, xzr\n\t" + "# A[22] * A[22]\n\t" + "ldr x9, [%[a], 176]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x4, x4, x8\n\t" + "adcs x2, x2, x9\n\t" + "adc x3, x3, xzr\n\t" + "str x4, [%[r], 352]\n\t" + "# A[22] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "ldr x10, [%[a], 176]\n\t" + "mul x8, x9, x10\n\t" + "umulh x9, x9, x10\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, xzr, xzr\n\t" + "adds x2, x2, x8\n\t" + "adcs x3, x3, x9\n\t" + "adc x4, x4, xzr\n\t" + "str x2, [%[r], 360]\n\t" + "# A[23] * A[23]\n\t" + "ldr x9, [%[a], 184]\n\t" + "mul x8, x9, x9\n\t" + "umulh x9, x9, x9\n\t" + "adds x3, x3, x8\n\t" + "adc x4, x4, x9\n\t" + "stp x3, x4, [%[r], 368]\n\t" + "ldp x9, x10, [%[tmp], 0]\n\t" + "stp x9, x10, [%[r], 0]\n\t" + "ldp x9, x10, [%[tmp], 16]\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x9, x10, [%[tmp], 32]\n\t" + "stp x9, x10, [%[r], 32]\n\t" + "ldp x9, x10, [%[tmp], 48]\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x9, x10, [%[tmp], 64]\n\t" + "stp x9, x10, [%[r], 64]\n\t" + "ldp x9, x10, [%[tmp], 80]\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x9, x10, [%[tmp], 96]\n\t" + "stp x9, x10, [%[r], 96]\n\t" + "ldp x9, x10, [%[tmp], 112]\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "ldp x9, x10, [%[tmp], 128]\n\t" + "stp x9, x10, [%[r], 128]\n\t" + "ldp x9, x10, [%[tmp], 144]\n\t" + "stp x9, x10, [%[r], 144]\n\t" + "ldp x9, x10, [%[tmp], 160]\n\t" + "stp x9, x10, [%[r], 160]\n\t" + "ldp x9, x10, [%[tmp], 176]\n\t" + "stp x9, x10, [%[r], 176]\n\t" : - : "memory", "x3", "x4", "x5", "x6", "x11" + : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) + : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7" ); - - return c; } -#else -/* Double a into r. (r = a + a) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. + * b A single precision integer. 
*/ -static sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a) +static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 16]\n\t" "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 48]\n\t" "ldp x3, x4, [%[a], 64]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 80]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 88]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 64]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 80]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 80]\n\t" "ldp x3, x4, [%[a], 96]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 112]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 120]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 96]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 112]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 112]\n\t" "ldp x3, x4, [%[a], 128]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 144]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 152]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 128]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 144]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 144]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 128]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 144]\n\t" "ldp x3, x4, [%[a], 160]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 176]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 184]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 160]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 176]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 176]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 160]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 176]\n\t" - "cset %[r], cs\n\t" + "csetm %[r], cc\n\t" : [r] "+r" (r) - : [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6" + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); return (sp_digit)r; } -#endif /* WOLFSSL_SP_SMALL */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
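Note on the hunk above: the 24-word doubling helper sp_3072_dbl_24 is replaced by sp_3072_sub_24, which computes r = a - b across all 24 words and, via "csetm %[r], cc", returns an all-ones word when the subtraction borrows and zero otherwise. That borrow mask is what the reworked Karatsuba squaring in the next hunk keys off. A minimal generic-C sketch of the same contract, using uint64_t in place of this file's sp_digit and making no attempt to match the constant-time properties of the assembly, would be:

#include <stdint.h>

/* Sketch only: r = a - b over 24 words; return 0 when there is no final
 * borrow, or an all-ones mask when a < b (mirroring "csetm %[r], cc"). */
static uint64_t sketch_sub_24(uint64_t* r, const uint64_t* a,
    const uint64_t* b)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < 24; i++) {
        uint64_t d  = a[i] - b[i];
        uint64_t b1 = (uint64_t)(a[i] < b[i]);  /* borrow from a[i] - b[i]    */
        uint64_t b2 = (uint64_t)(d < borrow);   /* borrow from the borrow-in  */
        r[i]   = d - borrow;
        borrow = b1 | b2;
    }
    return (uint64_t)0 - borrow;  /* 0x0 or 0xffffffffffffffff */
}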
@@ -8883,22 +11146,31 @@ static sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[48]; + sp_digit* z2 = r + 48; sp_digit z1[48]; - sp_digit a1[24]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 24; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 24); + + mask = sp_3072_sub_24(a1, a, &a[24]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_24(a1, p1, p2); - u = sp_3072_add_24(a1, a, &a[24]); - sp_3072_sqr_24(z1, a1); sp_3072_sqr_24(z2, &a[24]); sp_3072_sqr_24(z0, a); - sp_3072_mask_24(r + 48, a1, 0 - u); - u += sp_3072_dbl_24(r + 48, r + 48); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - u += sp_3072_add_24(r + 48, r + 48, z2); - sp_3072_add_zero_24(r + 72, z2 + 24, u); + sp_3072_sqr_24(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_48(z1, z2); + u -= sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(r + 24, z1); + sp_3072_add_word_24(r + 72, r + 72, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -8988,10 +11260,10 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[96]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 376\n\t" "csel x3, xzr, x3, cc\n\t" @@ -9037,10 +11309,10 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) sp_digit tmp[96]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 376\n\t" "csel x3, xzr, x3, cc\n\t" @@ -9095,23 +11367,6 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) #endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) -{ - int i; - - for (i=0; i<24; i++) { - r[i] = a[i] & m; - } -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * * r A single precision integer. 
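Note on the sp_3072_sqr_48 hunk above: the Karatsuba split no longer squares (aL + aH) and then patches up the extra carry word; it squares the absolute difference instead, using 2*aL*aH = aL^2 + aH^2 - (aL - aH)^2, so the middle 48-word block becomes z0 + z2 - z1 with z0 = aL^2, z2 = aH^2 and z1 = (|aL - aH|)^2. z2 is also written straight into the top half of r rather than a separate buffer. The absolute difference is produced without branching: sp_3072_sub_24 returns an all-ones mask when aL < aH, and that mask selects which of {difference, zero} is subtracted from which, so the second subtraction always yields |aL - aH|. A rough sketch of just that selection step, assuming 64-bit pointers and using sketch-only names, is:

#include <stdint.h>

/* Sketch of the branch-free operand swap above: diff holds aL - aH
 * (mod 2^(24*64)) and mask is 0 or all-ones from the borrowing subtract.
 * After selecting p1/p2 this way, p1 - p2 equals |aL - aH| in both cases. */
static void sketch_select_abs_diff(const uint64_t* diff, const uint64_t* zero,
    uint64_t mask, const uint64_t** p1, const uint64_t** p2)
{
    uintptr_t d = (uintptr_t)diff;
    uintptr_t z = (uintptr_t)zero;

    /* mask == all-ones (aL <  aH): p1 = zero, p2 = diff -> 0 - (aL - aH). */
    /* mask == 0        (aL >= aH): p1 = diff, p2 = zero -> (aL - aH) - 0. */
    *p1 = (const uint64_t*)((z & (uintptr_t)mask) | (d & ~(uintptr_t)mask));
    *p2 = (const uint64_t*)((z & ~(uintptr_t)mask) | (d & (uintptr_t)mask));
}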
@@ -9197,10 +11452,10 @@ static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[48]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 184\n\t" "csel x3, xzr, x3, cc\n\t" @@ -9246,10 +11501,10 @@ static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) sp_digit tmp[48]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 184\n\t" "csel x3, xzr, x3, cc\n\t" @@ -9340,9 +11595,9 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -9366,411 +11621,412 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * 
B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[16] * B\n\t" - "ldp x8, x9, [%[a], 128]\n\t" + "ldp x9, x10, [%[a], 128]\n\t" "str x3, [%[r], 120]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[17] * B\n\t" "str x4, [%[r], 128]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[18] * B\n\t" - "ldp x8, x9, [%[a], 144]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" "str x5, [%[r], 136]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[19] * B\n\t" "str x3, [%[r], 144]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[20] * B\n\t" - "ldp x8, x9, [%[a], 160]\n\t" + "ldp x9, x10, [%[a], 160]\n\t" "str x4, [%[r], 152]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[21] * B\n\t" "str x5, [%[r], 160]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" 
+ "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[22] * B\n\t" - "ldp x8, x9, [%[a], 176]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" "str x3, [%[r], 168]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[23] * B\n\t" "str x4, [%[r], 176]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[24] * B\n\t" - "ldp x8, x9, [%[a], 192]\n\t" + "ldp x9, x10, [%[a], 192]\n\t" "str x5, [%[r], 184]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[25] * B\n\t" "str x3, [%[r], 192]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[26] * B\n\t" - "ldp x8, x9, [%[a], 208]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" "str x4, [%[r], 200]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[27] * B\n\t" "str x5, [%[r], 208]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[28] * B\n\t" - "ldp x8, x9, [%[a], 224]\n\t" + "ldp x9, x10, [%[a], 224]\n\t" "str x3, [%[r], 216]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[29] * B\n\t" "str x4, [%[r], 224]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[30] * B\n\t" - "ldp x8, x9, [%[a], 240]\n\t" + "ldp x9, x10, [%[a], 240]\n\t" "str x5, [%[r], 232]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[31] * B\n\t" "str x3, [%[r], 240]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[32] * B\n\t" - "ldp x8, x9, [%[a], 256]\n\t" + "ldp x9, x10, [%[a], 256]\n\t" "str x4, [%[r], 248]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[33] * B\n\t" "str x5, [%[r], 256]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, 
x6\n\t" "# A[34] * B\n\t" - "ldp x8, x9, [%[a], 272]\n\t" + "ldp x9, x10, [%[a], 272]\n\t" "str x3, [%[r], 264]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[35] * B\n\t" "str x4, [%[r], 272]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[36] * B\n\t" - "ldp x8, x9, [%[a], 288]\n\t" + "ldp x9, x10, [%[a], 288]\n\t" "str x5, [%[r], 280]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[37] * B\n\t" "str x3, [%[r], 288]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[38] * B\n\t" - "ldp x8, x9, [%[a], 304]\n\t" + "ldp x9, x10, [%[a], 304]\n\t" "str x4, [%[r], 296]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[39] * B\n\t" "str x5, [%[r], 304]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[40] * B\n\t" - "ldp x8, x9, [%[a], 320]\n\t" + "ldp x9, x10, [%[a], 320]\n\t" "str x3, [%[r], 312]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[41] * B\n\t" "str x4, [%[r], 320]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[42] * B\n\t" - "ldp x8, x9, [%[a], 336]\n\t" + "ldp x9, x10, [%[a], 336]\n\t" "str x5, [%[r], 328]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[43] * B\n\t" "str x3, [%[r], 336]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[44] * B\n\t" - "ldp x8, x9, [%[a], 352]\n\t" + "ldp x9, x10, [%[a], 352]\n\t" "str x4, [%[r], 344]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[45] * B\n\t" "str x5, [%[r], 352]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[46] * B\n\t" - "ldp x8, x9, [%[a], 368]\n\t" + "ldp x9, x10, [%[a], 368]\n\t" "str x3, [%[r], 360]\n\t" 
- "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[47] * B\n\t" "str x4, [%[r], 368]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "adc x3, x3, x7\n\t" - "stp x5, x3, [%[r], 376]\n\t" + "str x5, [%[r], 376]\n\t" + "str x3, [%[r], 384]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } @@ -9799,334 +12055,308 @@ static void sp_3072_mont_norm_24(sp_digit* r, const sp_digit* m) SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" + "ldp x11, x12, [%[a], 0]\n\t" + "ldp x13, x14, [%[a], 16]\n\t" + "ldp x15, x16, [%[a], 32]\n\t" + "ldp x17, x19, [%[a], 48]\n\t" + "ldp x20, x21, [%[a], 64]\n\t" + "ldp x22, x23, [%[a], 80]\n\t" + "# No carry yet\n\t" "mov x3, xzr\n\t" - "# i = 24\n\t" + "# i = 0..23\n\t" "mov x4, 24\n\t" - "ldp x12, x13, [%[a], 0]\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" - "mul x9, %[mp], x12\n\t" + "mul x10, %[mp], x11\n\t" + "ldp x24, x25, [%[m], 0]\n\t" + "ldp x26, x27, [%[m], 16]\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" - "adds x12, x12, x7\n\t" + "mul x5, x24, x10\n\t" + "umulh x6, x24, x10\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x12, x13, x7\n\t" + "adds x11, x11, x5\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x11, x12, x5\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" - "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" - "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x11, x11, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x12, x13, x5\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" - "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" - "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "adds x12, x12, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x13, x14, x5\n\t" + "ldp x24, x25, [%[m], 32]\n\t" + "ldp x26, x27, [%[m], 48]\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x13, x13, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x14, x15, x5\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "adds x14, x14, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x15, x16, x5\n\t" "# a[i+6] += 
m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x15, x15, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x16, x17, x5\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "adds x16, x16, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x17, x19, x5\n\t" + "ldp x24, x25, [%[m], 64]\n\t" + "ldp x26, x27, [%[m], 80]\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x17, x17, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x19, x20, x5\n\t" "# a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x24, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "adds x19, x19, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x20, x21, x5\n\t" "# a[i+10] += m[10] * mu\n\t" - "ldr x11, [%[a], 80]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x20, x20, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x21, x22, x5\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "adds x21, x21, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x22, x23, x5\n\t" + "ldp x24, x25, [%[m], 96]\n\t" + "ldp x26, x27, [%[m], 112]\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x22, x22, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "ldr x23, [%[a], 96]\n\t" + "umulh x6, x24, x10\n\t" + "adds x23, x23, x5\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "adds x23, x23, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "ldp x8, x9, [%[a], 104]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 
104]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 112]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 112]\n\t" + "ldp x8, x9, [%[a], 120]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 128]\n\t" + "ldp x26, x27, [%[m], 144]\n\t" "# a[i+16] += m[16] * mu\n\t" - "ldr x11, [%[a], 128]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 128]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 120]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 120]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+17] += m[17] * mu\n\t" - "ldr x10, [%[a], 136]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 136]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 128]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 128]\n\t" + "ldp x8, x9, [%[a], 136]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+18] += m[18] * mu\n\t" - "ldr x11, [%[a], 144]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 144]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 136]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 136]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+19] += m[19] * mu\n\t" - "ldr x10, [%[a], 152]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 152]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 144]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 144]\n\t" + "ldp x8, x9, [%[a], 152]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 160]\n\t" + "ldp x26, x27, [%[m], 176]\n\t" "# a[i+20] += m[20] * mu\n\t" - "ldr x11, [%[a], 160]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 160]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 152]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 152]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+21] += m[21] * mu\n\t" - "ldr x10, [%[a], 168]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 168]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 160]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 160]\n\t" + "ldp x8, x9, [%[a], 168]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+22] += m[22] * mu\n\t" - "ldr x11, [%[a], 176]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 176]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 168]\n\t" - "adds x11, 
x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 168]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+23] += m[23] * mu\n\t" - "ldr x10, [%[a], 184]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 184]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x8, x8, x3\n\t" - "str x11, [%[a], 176]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 192]\n\t" - "str x10, [%[a], 184]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 192]\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 176]\n\t" + "umulh x7, x27, x10\n\t" + "ldp x8, x9, [%[a], 184]\n\t" + "adds x5, x5, x6\n\t" + "adcs x7, x7, x3\n\t" + "cset x3, cs\n\t" + "adds x8, x8, x5\n\t" + "str x8, [%[a], 184]\n\t" + "adcs x9, x9, x7\n\t" + "str x9, [%[a], 192]\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" - "mov x9, %[a]\n\t" + "mov %[mp], %[a]\n\t" "sub %[a], %[a], 192\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, [x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" - "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - "ldp x5, x6, [%[m], 128]\n\t" - "ldp x7, x8, [%[m], 144]\n\t" - "ldp x12, x13, [x9, 128]\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 144]\n\t" + "subs x11, x11, x4\n\t" "and x6, x6, x3\n\t" "sbcs x12, x12, x5\n\t" "and x7, x7, x3\n\t" "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 128]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 144]\n\t" - "ldp x5, x6, [%[m], 160]\n\t" - "ldp x7, x8, [%[m], 176]\n\t" - "ldp x12, x13, [x9, 160]\n\t" + "stp x11, x12, [%[a], 0]\n\t" + "sbcs x14, x14, x7\n\t" + "stp x13, x14, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 176]\n\t" + "sbcs x15, x15, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x16, x16, x5\n\t" "and x7, x7, 
x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x17, x17, x6\n\t" + "stp x15, x16, [%[a], 32]\n\t" + "sbcs x19, x19, x7\n\t" + "stp x17, x19, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x20, x20, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x21, x21, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x22, x22, x6\n\t" + "stp x20, x21, [%[a], 64]\n\t" + "sbcs x23, x23, x7\n\t" + "stp x22, x23, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "ldp x8, x9, [%[mp], 96]\n\t" + "ldp x10, x11, [%[mp], 112]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 96]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 160]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 176]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "stp x10, x11, [%[a], 112]\n\t" + "ldp x4, x5, [%[m], 128]\n\t" + "ldp x6, x7, [%[m], 144]\n\t" + "ldp x8, x9, [%[mp], 128]\n\t" + "ldp x10, x11, [%[mp], 144]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 128]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 144]\n\t" + "ldp x4, x5, [%[m], 160]\n\t" + "ldp x6, x7, [%[m], 176]\n\t" + "ldp x8, x9, [%[mp], 160]\n\t" + "ldp x10, x11, [%[mp], 176]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 160]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 176]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x10", "x8", "x9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } @@ -10140,7 +12370,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_24(r, a, b); @@ -10154,7 +12384,7 @@ static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_3072_mont_sqr_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_24(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_24(r, a); @@ -10306,9 +12536,9 @@ static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -10332,243 +12562,247 @@ static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + 
"umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[16] * B\n\t" - "ldp x8, x9, [%[a], 128]\n\t" + "ldp x9, x10, [%[a], 128]\n\t" "str x3, [%[r], 120]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[17] * B\n\t" "str x4, [%[r], 128]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[18] * B\n\t" - "ldp x8, x9, [%[a], 144]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" "str x5, [%[r], 136]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[19] * B\n\t" "str x3, [%[r], 144]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[20] * B\n\t" - "ldp x8, x9, [%[a], 160]\n\t" + "ldp x9, x10, [%[a], 160]\n\t" "str x4, [%[r], 152]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[21] * B\n\t" "str x5, [%[r], 160]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[22] * B\n\t" - "ldp x8, x9, [%[a], 176]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" "str x3, [%[r], 168]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[23] * B\n\t" "str x4, [%[r], 176]\n\t" - "mul 
x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "adc x3, x3, x7\n\t" - "stp x5, x3, [%[r], 184]\n\t" + "str x5, [%[r], 184]\n\t" + "str x3, [%[r], 192]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. */ static sp_digit div_3072_word_24(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -10577,23 +12811,52 @@ static sp_digit div_3072_word_24(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif } /* Compare a with b in constant time. 
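The rewritten word-division helpers are easier to follow with a rough portable-C model of the same scheme. This is illustrative only and not part of the patch: the name div_word_model, the use of uint64_t in place of sp_digit, and unsigned __int128 as a stand-in for the 128-bit register pair are all assumptions of the sketch. It shows the idea behind div_3072_word_24(): estimate 32 quotient bits at a time by dividing by (div >> 32) + 1, apply one conditional correction in place of the second udiv used by the old code, and finish with a plain divide on the remaining word.

#include <stdint.h>

/* Sketch of the quotient estimation in div_3072_word_24() (not shipped code).
 * Assumes, like the assembly, that the divisor has its top bit set, and
 * additionally that d1 < div so the quotient fits in a single 64-bit word.
 * unsigned __int128 (a GCC/Clang extension) only models the register pair
 * that mul/umulh and subs/sbc operate on.
 */
static uint64_t div_word_model(uint64_t d1, uint64_t d0, uint64_t div)
{
    uint64_t divh = (div >> 32) + 1;              /* lsr + add            */
    unsigned __int128 d = ((unsigned __int128)d1 << 64) | d0;
    uint64_t q, est;

    est = d1 / divh;                              /* first udiv           */
    q = est << 32;
    d -= (unsigned __int128)div * (est << 32);    /* mul/umulh + subs/sbc */

    if ((uint64_t)(d >> 64) >= divh) {            /* cmp + cset/csetm     */
        q += (uint64_t)1 << 32;
        d -= (unsigned __int128)div << 32;
    }

    est = (uint64_t)(d >> 32) / divh;             /* extr + udiv          */
    q += est;
    d -= (unsigned __int128)div * est;

    est = (uint64_t)(d >> 32) / divh;             /* extr + udiv          */
    q += est;
    d -= (unsigned __int128)div * est;

    q += (uint64_t)d / div;                       /* final fix-up udiv    */
    return q;
}

In div_3072_word_24() the correction is branch-free (cset/csetm masks); the later div_3072_word_48_cond() takes the same step with a conditional branch instead. The model above corresponds to the branch-free form, but the arithmetic is the same in both.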
@@ -10607,203 +12870,187 @@ static sp_int64 sp_3072_cmp_24(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 184\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #24\n\t" + "add %[a], %[a], #176\n\t" + "add %[b], %[b], #176\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "ldp x7, x8, [%[b], 176]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "ldp x8, x9, [%[b], 176]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 160]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "ldp x7, x8, [%[b], 144]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "ldp x8, x9, [%[b], 144]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 128]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, 
x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, 
[%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -10830,7 +13077,7 @@ static WC_INLINE int sp_3072_div_24(const sp_digit* a, const sp_digit* d, sp_dig div = d[23]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 24); - for (i=23; i>=0; i--) { + for (i = 23; i >= 0; i--) { sp_digit hi = t1[24 + i] - (t1[24 + i] == div); r1 = div_3072_word_24(hi, t1[24 + i - 1], div); @@ -11217,658 +13464,608 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" + "ldp x11, x12, [%[a], 0]\n\t" + "ldp x13, x14, [%[a], 16]\n\t" + "ldp x15, x16, [%[a], 32]\n\t" + "ldp x17, x19, [%[a], 48]\n\t" + "ldp x20, x21, [%[a], 64]\n\t" + "ldp x22, x23, [%[a], 80]\n\t" + "# No carry yet\n\t" "mov x3, xzr\n\t" - "# i = 48\n\t" + "# i = 0..47\n\t" "mov x4, 48\n\t" - "ldp x12, x13, [%[a], 0]\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" - "mul x9, %[mp], x12\n\t" + "mul x10, %[mp], x11\n\t" + "ldp x24, x25, [%[m], 0]\n\t" + "ldp x26, x27, [%[m], 16]\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" 
- "adds x12, x12, x7\n\t" + "mul x5, x24, x10\n\t" + "umulh x6, x24, x10\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x12, x13, x7\n\t" + "adds x11, x11, x5\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x11, x12, x5\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" - "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" - "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x11, x11, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x12, x13, x5\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" - "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" - "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "adds x12, x12, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x13, x14, x5\n\t" + "ldp x24, x25, [%[m], 32]\n\t" + "ldp x26, x27, [%[m], 48]\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x13, x13, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x14, x15, x5\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "adds x14, x14, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x15, x16, x5\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x15, x15, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x16, x17, x5\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "adds x16, x16, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x17, x19, x5\n\t" + "ldp x24, x25, [%[m], 64]\n\t" + "ldp x26, x27, [%[m], 80]\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x17, x17, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x19, x20, x5\n\t" "# a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x24, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "adds x19, x19, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x20, x21, x5\n\t" "# a[i+10] += m[10] * mu\n\t" - "ldr x11, [%[a], 80]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" - "adc x5, x5, xzr\n\t" - 
"umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x20, x20, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x21, x22, x5\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "adds x21, x21, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x22, x23, x5\n\t" + "ldp x24, x25, [%[m], 96]\n\t" + "ldp x26, x27, [%[m], 112]\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x22, x22, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "ldr x23, [%[a], 96]\n\t" + "umulh x6, x24, x10\n\t" + "adds x23, x23, x5\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "adds x23, x23, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "ldp x8, x9, [%[a], 104]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 104]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 112]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 112]\n\t" + "ldp x8, x9, [%[a], 120]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 128]\n\t" + "ldp x26, x27, [%[m], 144]\n\t" "# a[i+16] += m[16] * mu\n\t" - "ldr x11, [%[a], 128]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 128]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 120]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 120]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+17] += m[17] * mu\n\t" - "ldr x10, [%[a], 136]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 136]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 128]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 128]\n\t" + "ldp x8, x9, [%[a], 136]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+18] += m[18] * mu\n\t" - "ldr x11, [%[a], 144]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 144]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 136]\n\t" - "adds 
x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 136]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+19] += m[19] * mu\n\t" - "ldr x10, [%[a], 152]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 152]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 144]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 144]\n\t" + "ldp x8, x9, [%[a], 152]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 160]\n\t" + "ldp x26, x27, [%[m], 176]\n\t" "# a[i+20] += m[20] * mu\n\t" - "ldr x11, [%[a], 160]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 160]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 152]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 152]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+21] += m[21] * mu\n\t" - "ldr x10, [%[a], 168]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 168]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 160]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 160]\n\t" + "ldp x8, x9, [%[a], 168]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+22] += m[22] * mu\n\t" - "ldr x11, [%[a], 176]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 176]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 168]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 168]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+23] += m[23] * mu\n\t" - "ldr x10, [%[a], 184]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 184]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 176]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 176]\n\t" + "ldp x8, x9, [%[a], 184]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 192]\n\t" + "ldp x26, x27, [%[m], 208]\n\t" "# a[i+24] += m[24] * mu\n\t" - "ldr x11, [%[a], 192]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 192]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 184]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 184]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+25] += m[25] * mu\n\t" - "ldr x10, [%[a], 200]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 200]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 192]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 192]\n\t" + "ldp x8, x9, [%[a], 200]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+26] += m[26] * mu\n\t" - "ldr x11, [%[a], 208]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 208]\n\t" - "adds 
x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 200]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 200]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+27] += m[27] * mu\n\t" - "ldr x10, [%[a], 216]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 216]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 208]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 208]\n\t" + "ldp x8, x9, [%[a], 216]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 224]\n\t" + "ldp x26, x27, [%[m], 240]\n\t" "# a[i+28] += m[28] * mu\n\t" - "ldr x11, [%[a], 224]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 224]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 216]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 216]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+29] += m[29] * mu\n\t" - "ldr x10, [%[a], 232]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 232]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 224]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 224]\n\t" + "ldp x8, x9, [%[a], 232]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+30] += m[30] * mu\n\t" - "ldr x11, [%[a], 240]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 240]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 232]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 232]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+31] += m[31] * mu\n\t" - "ldr x10, [%[a], 248]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 248]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 240]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 240]\n\t" + "ldp x8, x9, [%[a], 248]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 256]\n\t" + "ldp x26, x27, [%[m], 272]\n\t" "# a[i+32] += m[32] * mu\n\t" - "ldr x11, [%[a], 256]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 256]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 248]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 248]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+33] += m[33] * mu\n\t" - "ldr x10, [%[a], 264]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 264]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 256]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 256]\n\t" + "ldp x8, x9, [%[a], 264]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, 
x5\n\t" "# a[i+34] += m[34] * mu\n\t" - "ldr x11, [%[a], 272]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 272]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 264]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 264]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+35] += m[35] * mu\n\t" - "ldr x10, [%[a], 280]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 280]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 272]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 272]\n\t" + "ldp x8, x9, [%[a], 280]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 288]\n\t" + "ldp x26, x27, [%[m], 304]\n\t" "# a[i+36] += m[36] * mu\n\t" - "ldr x11, [%[a], 288]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 288]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 280]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 280]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+37] += m[37] * mu\n\t" - "ldr x10, [%[a], 296]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 296]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 288]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 288]\n\t" + "ldp x8, x9, [%[a], 296]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+38] += m[38] * mu\n\t" - "ldr x11, [%[a], 304]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 304]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 296]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 296]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+39] += m[39] * mu\n\t" - "ldr x10, [%[a], 312]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 312]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 304]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 304]\n\t" + "ldp x8, x9, [%[a], 312]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 320]\n\t" + "ldp x26, x27, [%[m], 336]\n\t" "# a[i+40] += m[40] * mu\n\t" - "ldr x11, [%[a], 320]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 320]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 312]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 312]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+41] += m[41] * mu\n\t" - "ldr x10, [%[a], 328]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 328]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 320]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + 
"adc x6, x6, xzr\n\t" + "str x9, [%[a], 320]\n\t" + "ldp x8, x9, [%[a], 328]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+42] += m[42] * mu\n\t" - "ldr x11, [%[a], 336]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 336]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 328]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 328]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+43] += m[43] * mu\n\t" - "ldr x10, [%[a], 344]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 344]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 336]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 336]\n\t" + "ldp x8, x9, [%[a], 344]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 352]\n\t" + "ldp x26, x27, [%[m], 368]\n\t" "# a[i+44] += m[44] * mu\n\t" - "ldr x11, [%[a], 352]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 352]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 344]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 344]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+45] += m[45] * mu\n\t" - "ldr x10, [%[a], 360]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 360]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 352]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 352]\n\t" + "ldp x8, x9, [%[a], 360]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+46] += m[46] * mu\n\t" - "ldr x11, [%[a], 368]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 368]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 360]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 360]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+47] += m[47] * mu\n\t" - "ldr x10, [%[a], 376]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 376]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x8, x8, x3\n\t" - "str x11, [%[a], 368]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 384]\n\t" - "str x10, [%[a], 376]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 384]\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 368]\n\t" + "umulh x7, x27, x10\n\t" + "ldp x8, x9, [%[a], 376]\n\t" + "adds x5, x5, x6\n\t" + "adcs x7, x7, x3\n\t" + "cset x3, cs\n\t" + "adds x8, x8, x5\n\t" + "str x8, [%[a], 376]\n\t" + "adcs x9, x9, x7\n\t" + "str x9, [%[a], 384]\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" - "mov x9, %[a]\n\t" + "mov %[mp], %[a]\n\t" "sub %[a], %[a], 384\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 
16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, [x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" - "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - "ldp x5, x6, [%[m], 128]\n\t" - "ldp x7, x8, [%[m], 144]\n\t" - "ldp x12, x13, [x9, 128]\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 144]\n\t" + "subs x11, x11, x4\n\t" "and x6, x6, x3\n\t" "sbcs x12, x12, x5\n\t" "and x7, x7, x3\n\t" "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 128]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 144]\n\t" - "ldp x5, x6, [%[m], 160]\n\t" - "ldp x7, x8, [%[m], 176]\n\t" - "ldp x12, x13, [x9, 160]\n\t" + "stp x11, x12, [%[a], 0]\n\t" + "sbcs x14, x14, x7\n\t" + "stp x13, x14, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 176]\n\t" + "sbcs x15, x15, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x16, x16, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 160]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 176]\n\t" - "ldp x5, x6, [%[m], 192]\n\t" - "ldp x7, x8, [%[m], 208]\n\t" - "ldp x12, x13, [x9, 192]\n\t" + "sbcs x17, x17, x6\n\t" + "stp x15, x16, [%[a], 32]\n\t" + "sbcs x19, x19, x7\n\t" + "stp x17, x19, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 208]\n\t" + "sbcs x20, x20, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x21, x21, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 192]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 208]\n\t" - "ldp x5, x6, [%[m], 224]\n\t" - "ldp x7, x8, [%[m], 240]\n\t" - "ldp x12, x13, [x9, 224]\n\t" + "sbcs x22, x22, x6\n\t" + "stp x20, x21, [%[a], 64]\n\t" + "sbcs x23, x23, x7\n\t" + "stp x22, x23, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "ldp x8, x9, [%[mp], 96]\n\t" + "ldp x10, x11, [%[mp], 112]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 240]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" 
"and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 96]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 224]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 240]\n\t" - "ldp x5, x6, [%[m], 256]\n\t" - "ldp x7, x8, [%[m], 272]\n\t" - "ldp x12, x13, [x9, 256]\n\t" + "stp x10, x11, [%[a], 112]\n\t" + "ldp x4, x5, [%[m], 128]\n\t" + "ldp x6, x7, [%[m], 144]\n\t" + "ldp x8, x9, [%[mp], 128]\n\t" + "ldp x10, x11, [%[mp], 144]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 272]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 128]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 256]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 272]\n\t" - "ldp x5, x6, [%[m], 288]\n\t" - "ldp x7, x8, [%[m], 304]\n\t" - "ldp x12, x13, [x9, 288]\n\t" + "stp x10, x11, [%[a], 144]\n\t" + "ldp x4, x5, [%[m], 160]\n\t" + "ldp x6, x7, [%[m], 176]\n\t" + "ldp x8, x9, [%[mp], 160]\n\t" + "ldp x10, x11, [%[mp], 176]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 304]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 160]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 288]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 304]\n\t" - "ldp x5, x6, [%[m], 320]\n\t" - "ldp x7, x8, [%[m], 336]\n\t" - "ldp x12, x13, [x9, 320]\n\t" + "stp x10, x11, [%[a], 176]\n\t" + "ldp x4, x5, [%[m], 192]\n\t" + "ldp x6, x7, [%[m], 208]\n\t" + "ldp x8, x9, [%[mp], 192]\n\t" + "ldp x10, x11, [%[mp], 208]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 336]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 192]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 320]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 336]\n\t" - "ldp x5, x6, [%[m], 352]\n\t" - "ldp x7, x8, [%[m], 368]\n\t" - "ldp x12, x13, [x9, 352]\n\t" + "stp x10, x11, [%[a], 208]\n\t" + "ldp x4, x5, [%[m], 224]\n\t" + "ldp x6, x7, [%[m], 240]\n\t" + "ldp x8, x9, [%[mp], 224]\n\t" + "ldp x10, x11, [%[mp], 240]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 368]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 224]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 352]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 368]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "stp x10, x11, [%[a], 240]\n\t" + "ldp x4, x5, [%[m], 256]\n\t" + "ldp x6, x7, [%[m], 272]\n\t" + "ldp x8, x9, [%[mp], 256]\n\t" + "ldp x10, x11, [%[mp], 272]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 256]\n\t" + 
"sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 272]\n\t" + "ldp x4, x5, [%[m], 288]\n\t" + "ldp x6, x7, [%[m], 304]\n\t" + "ldp x8, x9, [%[mp], 288]\n\t" + "ldp x10, x11, [%[mp], 304]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 288]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 304]\n\t" + "ldp x4, x5, [%[m], 320]\n\t" + "ldp x6, x7, [%[m], 336]\n\t" + "ldp x8, x9, [%[mp], 320]\n\t" + "ldp x10, x11, [%[mp], 336]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 320]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 336]\n\t" + "ldp x4, x5, [%[m], 352]\n\t" + "ldp x6, x7, [%[m], 368]\n\t" + "ldp x8, x9, [%[mp], 352]\n\t" + "ldp x10, x11, [%[mp], 368]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 352]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 368]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x10", "x8", "x9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } @@ -11882,7 +14079,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_48(r, a, b); @@ -11896,7 +14093,7 @@ static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_48(r, a); @@ -12081,38 +14278,38 @@ static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. 
*/ -static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +static sp_digit div_3072_word_48_cond(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "b.lt 1f\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + "1:\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -12121,23 +14318,25 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "cmp x3, x5\n\t" + "b.lt 2f\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + "2:\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); - return r; + return d1; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -12160,9 +14359,13 @@ static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, s div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { - sp_digit hi = t1[48 + i] - (t1[48 + i] == div); - r1 = div_3072_word_48(hi, t1[48 + i - 1], div); + for (i = 47; i >= 0; i--) { + if (t1[48 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_3072_word_48_cond(t1[48 + i], t1[48 + i - 1], div); + } sp_3072_mul_d_48(t2, d, r1); t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2); @@ -12415,6 +14618,67 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig #endif /* WOLFSSL_SP_SMALL */ } +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. 
+ */ +static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +{ + __asm__ __volatile__ ( + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" + + "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" + "lsl x6, x3, 32\n\t" + "mul x4, %[div], x6\n\t" + "umulh x3, %[div], x6\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "umulh x3, %[div], x3\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + + "udiv x3, %[d0], %[div]\n\t" + "add %[d1], x6, x3\n\t" + + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return d1; +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -12456,371 +14720,331 @@ static sp_int64 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 376\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #48\n\t" + "add %[a], %[a], #368\n\t" + "add %[b], %[b], #368\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 368]\n\t" - "ldp x7, x8, [%[b], 368]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 368]\n\t" + "ldp x8, x9, [%[b], 368]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 352]\n\t" - "ldp x7, x8, [%[b], 352]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 352]\n\t" + "ldp x8, x9, [%[b], 352]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, 
x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 336]\n\t" - "ldp x7, x8, [%[b], 336]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 336]\n\t" + "ldp x8, x9, [%[b], 336]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 320]\n\t" - "ldp x7, x8, [%[b], 320]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 320]\n\t" + "ldp x8, x9, [%[b], 320]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 304]\n\t" - "ldp x7, x8, [%[b], 304]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 304]\n\t" + "ldp x8, x9, [%[b], 304]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 288]\n\t" - "ldp x7, x8, [%[b], 288]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 288]\n\t" + "ldp x8, x9, [%[b], 288]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 272]\n\t" - "ldp x7, x8, [%[b], 272]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 272]\n\t" + "ldp x8, x9, [%[b], 272]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 256]\n\t" - "ldp x7, x8, [%[b], 256]\n\t" - "and x6, x6, x4\n\t" - "and 
x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 256]\n\t" + "ldp x8, x9, [%[b], 256]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "ldp x7, x8, [%[b], 240]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 240]\n\t" + "ldp x8, x9, [%[b], 240]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 224]\n\t" + "ldp x8, x9, [%[b], 224]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "ldp x7, x8, [%[b], 208]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 208]\n\t" + "ldp x8, x9, [%[b], 208]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 192]\n\t" + "ldp x8, x9, [%[b], 192]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "ldp x7, x8, [%[b], 176]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "ldp x8, x9, [%[b], 176]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, 
x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 160]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "ldp x7, x8, [%[b], 144]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "ldp x8, x9, [%[b], 144]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 128]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 
80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" 
+ "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -12847,7 +15071,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { + for (i = 47; i >= 0; i--) { sp_digit hi = t1[48 + i] - (t1[48 + i] == div); r1 = div_3072_word_48(hi, t1[48 + i - 1], div); @@ -13233,9 +15457,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 48; r = a + 48 * 2; m = r + 48 * 2; - ah = a + 48; sp_3072_from_bin(ah, 48, in, inLen); #if DIGIT_BIT >= 64 @@ -13253,7 +15477,38 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_3072_from_mp(m, 48, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 48); + err = sp_3072_mod_48_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_48(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_48(r, r, ah, m, mp); + + for (i = 47; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_3072_sub_in_place_48(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_3072_sqr_48(r, ah); err = sp_3072_mod_48_cond(r, r, m); @@ -13281,7 +15536,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 48); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_3072_mont_sqr_48(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_3072_mont_mul_48(r, r, a, m, mp); @@ -13316,6 +15571,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -13327,7 +15583,6 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, static sp_digit sp_3072_cond_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -13345,106 +15600,12 @@ static sp_digit sp_3072_cond_add_24(sp_digit* r, const sp_digit* a, const sp_dig "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 128]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. 
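
The e[0] == 0x10001 fast path added above works because 0x10001 = 2^16 + 1: a^e mod m costs sixteen modular squarings plus one multiplication, which the patch performs with sp_3072_mont_sqr_48()/sp_3072_mont_mul_48() in Montgomery form. A minimal portable sketch of the same idea, using a toy 32-bit modulus so that plain 64-bit arithmetic suffices (illustration only, not the sp_3072_* code):

    #include <stdint.h>

    /* a^0x10001 mod m by sixteen squarings and one multiply.
     * Assumes m fits in 32 bits so every product fits in 64 bits. */
    static uint32_t exp_0x10001(uint32_t a, uint32_t m)
    {
        uint64_t r = a % m;
        int i;

        for (i = 0; i < 16; i++) {            /* r = a^(2^16) mod m */
            r = (r * r) % m;
        }
        return (uint32_t)((r * (a % m)) % m); /* a^(2^16 + 1) mod m */
    }
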
* @@ -14313,41 +16474,74 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, */ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Convert an mp_int to an array of sp_digit. @@ -14446,15 +16640,15 @@ static void sp_4096_to_bin_64(sp_digit* r, byte* a) int i; int j = 0; - for (i = 63; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 63; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -14473,103 +16667,84 @@ static void sp_4096_to_bin_64(sp_digit* r, byte* a) #define sp_4096_norm_64(a) #ifndef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) +/* Add digit to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
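
The sp_4096_from_bin()/sp_4096_to_bin_64() rewrites above replace the old per-byte C loops with eight-byte loads/stores plus a rev byte swap. The shape of that conversion in portable C, with a hypothetical helper name rather than the sp_* interface:

    #include <stdint.h>

    /* Convert n big-endian bytes at a[] into size little-endian 64-bit
     * words at r[], zero-padding the upper words. */
    static void be_bytes_to_words(uint64_t* r, int size, const uint8_t* a, int n)
    {
        int i = n;
        int j;

        for (j = 0; j < size; j++) {
            uint64_t w = 0;
            int b;
            /* Take up to 8 bytes from the tail (least significant end). */
            for (b = 0; b < 8 && i > 0; b++) {
                w |= (uint64_t)a[--i] << (8 * b);
            }
            r[j] = w;
        }
    }
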
*/ -static sp_digit sp_4096_add_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_4096_add_word_32(sp_digit* r, const sp_digit* a, + sp_digit b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "adds x3, x3, x7\n\t" "ldp x5, x6, [%[a], 16]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 16]\n\t" - "adcs x5, x5, x9\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 16]\n\t" "ldp x3, x4, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 48]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 48]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 48]\n\t" "ldp x3, x4, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 80]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 80]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 80]\n\t" "ldp x3, x4, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 112]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 112]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 112]\n\t" "ldp x3, x4, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 144]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 144]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 128]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 144]\n\t" "ldp x3, x4, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 176]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 176]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 160]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 176]\n\t" "ldp x3, x4, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 208]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 208]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 192]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 208]\n\t" "ldp x3, x4, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "adcs x3, x3, x7\n\t" "ldp x5, x6, [%[a], 240]\n\t" - "adcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 240]\n\t" - "adcs x5, x5, x9\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 224]\n\t" - "adcs x6, x6, x10\n\t" + "adcs x6, x6, xzr\n\t" "stp x5, x6, [%[r], 240]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6" ); - - return (sp_digit)r; } /* Sub b from a into a. 
(a -= b) @@ -14928,85 +17103,141 @@ static sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* Add digit to a into r. (r = a + b) +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. */ -static void sp_4096_add_zero_32(sp_digit* r, const sp_digit* a, - const sp_digit d) +static sp_digit sp_4096_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) { __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x3, x4, [%[a], 192]\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 192]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 208]\n\t" - "ldp x3, x4, [%[a], 224]\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 224]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 240]\n\t" - : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) - : "memory", "x3", "x4", "x5", "x6" + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "ldp x10, x11, [%[b], 80]\n\t" + "ldp x4, x5, [%[a], 64]\n\t" + "and x8, x8, 
%[m]\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 64]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 80]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "ldp x10, x11, [%[b], 112]\n\t" + "ldp x4, x5, [%[a], 96]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 96]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 112]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "ldp x10, x11, [%[b], 144]\n\t" + "ldp x4, x5, [%[a], 128]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 128]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 144]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "ldp x10, x11, [%[b], 176]\n\t" + "ldp x4, x5, [%[a], 160]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 160]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 176]\n\t" + "ldp x8, x9, [%[b], 192]\n\t" + "ldp x10, x11, [%[b], 208]\n\t" + "ldp x4, x5, [%[a], 192]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 208]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 192]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 208]\n\t" + "ldp x8, x9, [%[b], 224]\n\t" + "ldp x10, x11, [%[b], 240]\n\t" + "ldp x4, x5, [%[a], 224]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 240]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 224]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 240]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); + + return (sp_digit)r; } +#endif /* !WOLFSSL_SP_SMALL */ /* Multiply a and b into r. (r = a * b) * @@ -15021,149 +17252,28 @@ SP_NOINLINE static void sp_4096_mul_64(sp_digit* r, const sp_digit* a, sp_digit z1[64]; sp_digit a1[32]; sp_digit b1[32]; - sp_digit z2[64]; - sp_digit u, ca, cb; + sp_digit* z2 = r + 64; + sp_digit u; + sp_digit ca; + sp_digit cb; ca = sp_2048_add_32(a1, a, &a[32]); cb = sp_2048_add_32(b1, b, &b[32]); u = ca & cb; - sp_2048_mul_32(z1, a1, b1); + sp_2048_mul_32(z2, &a[32], &b[32]); sp_2048_mul_32(z0, a, b); - sp_2048_mask_32(r + 64, a1, 0 - cb); - sp_2048_mask_32(b1, b1, 0 - ca); - u += sp_2048_add_32(r + 64, r + 64, b1); - u += sp_4096_sub_in_place_64(z1, z2); + sp_2048_mul_32(z1, a1, b1); + u += sp_4096_sub_in_place_64(z1, z0); + u += sp_4096_sub_in_place_64(z1, z2); + u += sp_4096_cond_add_32(z1 + 32, z1 + 32, a1, 0 - cb); + u += sp_4096_cond_add_32(z1 + 32, z1 + 32, b1, 0 - ca); + u += sp_4096_add_64(r + 32, r + 32, z1); - u += sp_4096_add_32(r + 64, r + 64, z2); - sp_4096_add_zero_32(r + 96, z2 + 32, u); + (void)sp_4096_add_word_32(r + 96, r + 96, u); } -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. 
(r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -static sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 256\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x3", "x4", "x5", "x6", "x11" - ); - - return c; -} - -#else -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -static sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 80]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 88]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 112]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 120]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 144]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 152]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 176]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 184]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x3, x4, [%[a], 192]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 208]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 216]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 192]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 208]\n\t" - "ldp x3, x4, [%[a], 224]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 240]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 248]\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r], 224]\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r], 240]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6" - ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
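
The sp_4096_mul_64() rewrite above and the sp_4096_sqr_64() rewrite below keep the Karatsuba structure (three half-size products) but write z2 straight into the top half of r, handle the carries of the half sums with conditional adds, and, for squaring, build the middle term from (a_lo - a_hi)^2 so the sp_2048_dbl_32() doubling step removed here is no longer needed. A minimal sketch of the same identities on toy operand sizes, multiplying two 64-bit values from their 32-bit halves (unsigned __int128 is assumed available, as on AArch64 GCC/Clang):

    #include <stdint.h>

    /* Karatsuba on toy sizes: a*b as a 128-bit hi:lo pair using three
     * 32x32->64 multiplies.
     *   a = a1*2^32 + a0, b = b1*2^32 + b0
     *   a*b = z2*2^64 + ((a0 + a1)*(b0 + b1) - z2 - z0)*2^32 + z0
     * For squaring, z1 = (a0 - a1)^2 and the middle term is
     * z0 + z2 - z1, which needs no doubling. */
    static void karatsuba_mul64(uint64_t a, uint64_t b,
                                uint64_t* hi, uint64_t* lo)
    {
        uint64_t a0 = (uint32_t)a, a1 = a >> 32;
        uint64_t b0 = (uint32_t)b, b1 = b >> 32;
        uint64_t z0 = a0 * b0;
        uint64_t z2 = a1 * b1;
        unsigned __int128 z1 =
            (unsigned __int128)(a0 + a1) * (b0 + b1) - z0 - z2;
        unsigned __int128 p =
            ((unsigned __int128)z2 << 64) + (z1 << 32) + z0;

        *hi = (uint64_t)(p >> 64);
        *lo = (uint64_t)p;
    }
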
@@ -15172,22 +17282,31 @@ static sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_4096_sqr_64(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit z1[64]; - sp_digit a1[32]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 32; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 32); + + mask = sp_2048_sub_32(a1, a, &a[32]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_32(a1, p1, p2); - u = sp_2048_add_32(a1, a, &a[32]); - sp_2048_sqr_32(z1, a1); sp_2048_sqr_32(z2, &a[32]); sp_2048_sqr_32(z0, a); - sp_2048_mask_32(r + 64, a1, 0 - u); - u += sp_2048_dbl_32(r + 64, r + 64); - u += sp_4096_sub_in_place_64(z1, z2); - u += sp_4096_sub_in_place_64(z1, z0); - u += sp_4096_add_64(r + 32, r + 32, z1); - u += sp_4096_add_32(r + 64, r + 64, z2); - sp_4096_add_zero_32(r + 96, z2 + 32, u); + sp_2048_sqr_32(z1, a1); + + u = 0; + u -= sp_4096_sub_in_place_64(z1, z2); + u -= sp_4096_sub_in_place_64(z1, z0); + u += sp_4096_sub_in_place_64(r + 32, z1); + sp_4096_add_word_32(r + 96, r + 96, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -15277,10 +17396,10 @@ static void sp_4096_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[128]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 504\n\t" "csel x3, xzr, x3, cc\n\t" @@ -15326,10 +17445,10 @@ static void sp_4096_sqr_64(sp_digit* r, const sp_digit* a) sp_digit tmp[128]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 504\n\t" "csel x3, xzr, x3, cc\n\t" @@ -15418,9 +17537,9 @@ static void sp_4096_mul_d_64(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -15444,547 +17563,547 @@ static void sp_4096_mul_d_64(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - 
"umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[16] * B\n\t" - "ldp x8, x9, [%[a], 128]\n\t" + "ldp x9, x10, [%[a], 128]\n\t" "str x3, [%[r], 120]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, 
x6\n\t" "# A[17] * B\n\t" "str x4, [%[r], 128]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[18] * B\n\t" - "ldp x8, x9, [%[a], 144]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" "str x5, [%[r], 136]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[19] * B\n\t" "str x3, [%[r], 144]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[20] * B\n\t" - "ldp x8, x9, [%[a], 160]\n\t" + "ldp x9, x10, [%[a], 160]\n\t" "str x4, [%[r], 152]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[21] * B\n\t" "str x5, [%[r], 160]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[22] * B\n\t" - "ldp x8, x9, [%[a], 176]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" "str x3, [%[r], 168]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[23] * B\n\t" "str x4, [%[r], 176]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[24] * B\n\t" - "ldp x8, x9, [%[a], 192]\n\t" + "ldp x9, x10, [%[a], 192]\n\t" "str x5, [%[r], 184]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[25] * B\n\t" "str x3, [%[r], 192]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[26] * B\n\t" - "ldp x8, x9, [%[a], 208]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" "str x4, [%[r], 200]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[27] * B\n\t" "str x5, [%[r], 208]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[28] * B\n\t" - "ldp x8, x9, [%[a], 224]\n\t" + "ldp x9, x10, [%[a], 224]\n\t" "str x3, [%[r], 216]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[29] * B\n\t" "str x4, [%[r], 224]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" 
- "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[30] * B\n\t" - "ldp x8, x9, [%[a], 240]\n\t" + "ldp x9, x10, [%[a], 240]\n\t" "str x5, [%[r], 232]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[31] * B\n\t" "str x3, [%[r], 240]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[32] * B\n\t" - "ldp x8, x9, [%[a], 256]\n\t" + "ldp x9, x10, [%[a], 256]\n\t" "str x4, [%[r], 248]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[33] * B\n\t" "str x5, [%[r], 256]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[34] * B\n\t" - "ldp x8, x9, [%[a], 272]\n\t" + "ldp x9, x10, [%[a], 272]\n\t" "str x3, [%[r], 264]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[35] * B\n\t" "str x4, [%[r], 272]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[36] * B\n\t" - "ldp x8, x9, [%[a], 288]\n\t" + "ldp x9, x10, [%[a], 288]\n\t" "str x5, [%[r], 280]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[37] * B\n\t" "str x3, [%[r], 288]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[38] * B\n\t" - "ldp x8, x9, [%[a], 304]\n\t" + "ldp x9, x10, [%[a], 304]\n\t" "str x4, [%[r], 296]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[39] * B\n\t" "str x5, [%[r], 304]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[40] * B\n\t" - "ldp x8, x9, [%[a], 320]\n\t" + "ldp x9, x10, [%[a], 320]\n\t" "str x3, [%[r], 312]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[41] * B\n\t" "str x4, [%[r], 320]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, 
xzr\n\t" "adds x5, x5, x6\n\t" "# A[42] * B\n\t" - "ldp x8, x9, [%[a], 336]\n\t" + "ldp x9, x10, [%[a], 336]\n\t" "str x5, [%[r], 328]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[43] * B\n\t" "str x3, [%[r], 336]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[44] * B\n\t" - "ldp x8, x9, [%[a], 352]\n\t" + "ldp x9, x10, [%[a], 352]\n\t" "str x4, [%[r], 344]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[45] * B\n\t" "str x5, [%[r], 352]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[46] * B\n\t" - "ldp x8, x9, [%[a], 368]\n\t" + "ldp x9, x10, [%[a], 368]\n\t" "str x3, [%[r], 360]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[47] * B\n\t" "str x4, [%[r], 368]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[48] * B\n\t" - "ldp x8, x9, [%[a], 384]\n\t" + "ldp x9, x10, [%[a], 384]\n\t" "str x5, [%[r], 376]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[49] * B\n\t" "str x3, [%[r], 384]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[50] * B\n\t" - "ldp x8, x9, [%[a], 400]\n\t" + "ldp x9, x10, [%[a], 400]\n\t" "str x4, [%[r], 392]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[51] * B\n\t" "str x5, [%[r], 400]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[52] * B\n\t" - "ldp x8, x9, [%[a], 416]\n\t" + "ldp x9, x10, [%[a], 416]\n\t" "str x3, [%[r], 408]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[53] * B\n\t" "str x4, [%[r], 416]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[54] * B\n\t" - "ldp x8, x9, [%[a], 432]\n\t" + "ldp x9, x10, [%[a], 432]\n\t" 
"str x5, [%[r], 424]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[55] * B\n\t" "str x3, [%[r], 432]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[56] * B\n\t" - "ldp x8, x9, [%[a], 448]\n\t" + "ldp x9, x10, [%[a], 448]\n\t" "str x4, [%[r], 440]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[57] * B\n\t" "str x5, [%[r], 448]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[58] * B\n\t" - "ldp x8, x9, [%[a], 464]\n\t" + "ldp x9, x10, [%[a], 464]\n\t" "str x3, [%[r], 456]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[59] * B\n\t" "str x4, [%[r], 464]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[60] * B\n\t" - "ldp x8, x9, [%[a], 480]\n\t" + "ldp x9, x10, [%[a], 480]\n\t" "str x5, [%[r], 472]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[61] * B\n\t" "str x3, [%[r], 480]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[62] * B\n\t" - "ldp x8, x9, [%[a], 496]\n\t" + "ldp x9, x10, [%[a], 496]\n\t" "str x4, [%[r], 488]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[63] * B\n\t" "str x5, [%[r], 496]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "adc x4, x4, x7\n\t" "stp x3, x4, [%[r], 504]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } @@ -16014,874 +18133,808 @@ static void sp_4096_mont_norm_64(sp_digit* r, const sp_digit* m) SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" + "ldp x11, x12, [%[a], 0]\n\t" + "ldp x13, x14, [%[a], 16]\n\t" + "ldp x15, x16, [%[a], 32]\n\t" + "ldp x17, x19, [%[a], 48]\n\t" + "ldp x20, 
x21, [%[a], 64]\n\t" + "ldp x22, x23, [%[a], 80]\n\t" + "# No carry yet\n\t" "mov x3, xzr\n\t" - "# i = 64\n\t" + "# i = 0..63\n\t" "mov x4, 64\n\t" - "ldp x12, x13, [%[a], 0]\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" - "mul x9, %[mp], x12\n\t" + "mul x10, %[mp], x11\n\t" + "ldp x24, x25, [%[m], 0]\n\t" + "ldp x26, x27, [%[m], 16]\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" - "adds x12, x12, x7\n\t" + "mul x5, x24, x10\n\t" + "umulh x6, x24, x10\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" - "adds x12, x13, x7\n\t" + "adds x11, x11, x5\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x11, x12, x5\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" - "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" - "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" - "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x11, x11, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x12, x13, x5\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" - "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" - "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "adds x12, x12, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x13, x14, x5\n\t" + "ldp x24, x25, [%[m], 32]\n\t" + "ldp x26, x27, [%[m], 48]\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x13, x13, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x14, x15, x5\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "adds x14, x14, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x15, x16, x5\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x15, x15, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x16, x17, x5\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "adds x16, x16, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x17, x19, x5\n\t" + "ldp x24, x25, [%[m], 64]\n\t" + "ldp x26, x27, [%[m], 80]\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x17, x17, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x24, x10\n\t" + "adds x19, x20, x5\n\t" "# a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - 
"mul x7, x24, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "adds x19, x19, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x25, x10\n\t" + "adds x20, x21, x5\n\t" "# a[i+10] += m[10] * mu\n\t" - "ldr x11, [%[a], 80]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x20, x20, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "umulh x6, x26, x10\n\t" + "adds x21, x22, x5\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "adds x21, x21, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "umulh x7, x27, x10\n\t" + "adds x22, x23, x5\n\t" + "ldp x24, x25, [%[m], 96]\n\t" + "ldp x26, x27, [%[m], 112]\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" - "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x22, x22, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "ldr x23, [%[a], 96]\n\t" + "umulh x6, x24, x10\n\t" + "adds x23, x23, x5\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" - "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "adds x23, x23, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "ldp x8, x9, [%[a], 104]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 104]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 112]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 112]\n\t" + "ldp x8, x9, [%[a], 120]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 128]\n\t" + "ldp x26, x27, [%[m], 144]\n\t" "# a[i+16] += m[16] * mu\n\t" - "ldr x11, [%[a], 128]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 128]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 120]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 120]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+17] += m[17] * mu\n\t" - "ldr x10, [%[a], 136]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 136]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 128]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, 
x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 128]\n\t" + "ldp x8, x9, [%[a], 136]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+18] += m[18] * mu\n\t" - "ldr x11, [%[a], 144]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 144]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 136]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 136]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+19] += m[19] * mu\n\t" - "ldr x10, [%[a], 152]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 152]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 144]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 144]\n\t" + "ldp x8, x9, [%[a], 152]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 160]\n\t" + "ldp x26, x27, [%[m], 176]\n\t" "# a[i+20] += m[20] * mu\n\t" - "ldr x11, [%[a], 160]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 160]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 152]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 152]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+21] += m[21] * mu\n\t" - "ldr x10, [%[a], 168]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 168]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 160]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 160]\n\t" + "ldp x8, x9, [%[a], 168]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+22] += m[22] * mu\n\t" - "ldr x11, [%[a], 176]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 176]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 168]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 168]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+23] += m[23] * mu\n\t" - "ldr x10, [%[a], 184]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 184]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 176]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 176]\n\t" + "ldp x8, x9, [%[a], 184]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 192]\n\t" + "ldp x26, x27, [%[m], 208]\n\t" "# a[i+24] += m[24] * mu\n\t" - "ldr x11, [%[a], 192]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 192]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 184]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 184]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+25] += m[25] * mu\n\t" - "ldr x10, [%[a], 200]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 200]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, 
x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 192]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 192]\n\t" + "ldp x8, x9, [%[a], 200]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+26] += m[26] * mu\n\t" - "ldr x11, [%[a], 208]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 208]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 200]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 200]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+27] += m[27] * mu\n\t" - "ldr x10, [%[a], 216]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 216]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 208]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 208]\n\t" + "ldp x8, x9, [%[a], 216]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 224]\n\t" + "ldp x26, x27, [%[m], 240]\n\t" "# a[i+28] += m[28] * mu\n\t" - "ldr x11, [%[a], 224]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 224]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 216]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 216]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+29] += m[29] * mu\n\t" - "ldr x10, [%[a], 232]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 232]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 224]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 224]\n\t" + "ldp x8, x9, [%[a], 232]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+30] += m[30] * mu\n\t" - "ldr x11, [%[a], 240]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 240]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 232]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 232]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+31] += m[31] * mu\n\t" - "ldr x10, [%[a], 248]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 248]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 240]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 240]\n\t" + "ldp x8, x9, [%[a], 248]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 256]\n\t" + "ldp x26, x27, [%[m], 272]\n\t" "# a[i+32] += m[32] * mu\n\t" - "ldr x11, [%[a], 256]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 256]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 248]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 248]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+33] += m[33] * 
mu\n\t" - "ldr x10, [%[a], 264]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 264]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 256]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 256]\n\t" + "ldp x8, x9, [%[a], 264]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+34] += m[34] * mu\n\t" - "ldr x11, [%[a], 272]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 272]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 264]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 264]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+35] += m[35] * mu\n\t" - "ldr x10, [%[a], 280]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 280]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 272]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 272]\n\t" + "ldp x8, x9, [%[a], 280]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 288]\n\t" + "ldp x26, x27, [%[m], 304]\n\t" "# a[i+36] += m[36] * mu\n\t" - "ldr x11, [%[a], 288]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 288]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 280]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 280]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+37] += m[37] * mu\n\t" - "ldr x10, [%[a], 296]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 296]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 288]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 288]\n\t" + "ldp x8, x9, [%[a], 296]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+38] += m[38] * mu\n\t" - "ldr x11, [%[a], 304]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 304]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 296]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 296]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+39] += m[39] * mu\n\t" - "ldr x10, [%[a], 312]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 312]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 304]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 304]\n\t" + "ldp x8, x9, [%[a], 312]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 320]\n\t" + "ldp x26, x27, [%[m], 336]\n\t" "# a[i+40] += m[40] * mu\n\t" - "ldr x11, [%[a], 320]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 320]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 312]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" 
+ "adc x7, x7, xzr\n\t" + "str x8, [%[a], 312]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+41] += m[41] * mu\n\t" - "ldr x10, [%[a], 328]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 328]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 320]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 320]\n\t" + "ldp x8, x9, [%[a], 328]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+42] += m[42] * mu\n\t" - "ldr x11, [%[a], 336]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 336]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 328]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 328]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+43] += m[43] * mu\n\t" - "ldr x10, [%[a], 344]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 344]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 336]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 336]\n\t" + "ldp x8, x9, [%[a], 344]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 352]\n\t" + "ldp x26, x27, [%[m], 368]\n\t" "# a[i+44] += m[44] * mu\n\t" - "ldr x11, [%[a], 352]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 352]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 344]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 344]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+45] += m[45] * mu\n\t" - "ldr x10, [%[a], 360]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 360]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 352]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 352]\n\t" + "ldp x8, x9, [%[a], 360]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+46] += m[46] * mu\n\t" - "ldr x11, [%[a], 368]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 368]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 360]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 360]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+47] += m[47] * mu\n\t" - "ldr x10, [%[a], 376]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 376]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 368]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 368]\n\t" + "ldp x8, x9, [%[a], 376]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 384]\n\t" + "ldp x26, x27, [%[m], 400]\n\t" "# a[i+48] += m[48] * mu\n\t" - "ldr x11, [%[a], 384]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 384]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - 
"str x10, [%[a], 376]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 376]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+49] += m[49] * mu\n\t" - "ldr x10, [%[a], 392]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 392]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 384]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 384]\n\t" + "ldp x8, x9, [%[a], 392]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+50] += m[50] * mu\n\t" - "ldr x11, [%[a], 400]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 400]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 392]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 392]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+51] += m[51] * mu\n\t" - "ldr x10, [%[a], 408]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 408]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 400]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 400]\n\t" + "ldp x8, x9, [%[a], 408]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 416]\n\t" + "ldp x26, x27, [%[m], 432]\n\t" "# a[i+52] += m[52] * mu\n\t" - "ldr x11, [%[a], 416]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 416]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 408]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 408]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+53] += m[53] * mu\n\t" - "ldr x10, [%[a], 424]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 424]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 416]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 416]\n\t" + "ldp x8, x9, [%[a], 424]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+54] += m[54] * mu\n\t" - "ldr x11, [%[a], 432]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 432]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 424]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 424]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+55] += m[55] * mu\n\t" - "ldr x10, [%[a], 440]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 440]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 432]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 432]\n\t" + "ldp x8, x9, [%[a], 440]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 448]\n\t" + "ldp x26, x27, [%[m], 464]\n\t" "# a[i+56] += m[56] * mu\n\t" - "ldr x11, [%[a], 448]\n\t" - "adc x5, x8, xzr\n\t" - 
"ldr x8, [%[m], 448]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 440]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 440]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+57] += m[57] * mu\n\t" - "ldr x10, [%[a], 456]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 456]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 448]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 448]\n\t" + "ldp x8, x9, [%[a], 456]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+58] += m[58] * mu\n\t" - "ldr x11, [%[a], 464]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 464]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 456]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 456]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+59] += m[59] * mu\n\t" - "ldr x10, [%[a], 472]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 472]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 464]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 464]\n\t" + "ldp x8, x9, [%[a], 472]\n\t" + "umulh x7, x27, x10\n\t" + "adds x8, x8, x5\n\t" + "ldp x24, x25, [%[m], 480]\n\t" + "ldp x26, x27, [%[m], 496]\n\t" "# a[i+60] += m[60] * mu\n\t" - "ldr x11, [%[a], 480]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 480]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 472]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x24, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 472]\n\t" + "umulh x6, x24, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+61] += m[61] * mu\n\t" - "ldr x10, [%[a], 488]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 488]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x11, [%[a], 480]\n\t" - "adds x10, x10, x7\n\t" + "adds x9, x9, x7\n\t" + "mul x5, x25, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 480]\n\t" + "ldp x8, x9, [%[a], 488]\n\t" + "umulh x7, x25, x10\n\t" + "adds x8, x8, x5\n\t" "# a[i+62] += m[62] * mu\n\t" - "ldr x11, [%[a], 496]\n\t" - "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 496]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" - "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 488]\n\t" - "adds x11, x11, x7\n\t" + "adc x7, x7, xzr\n\t" + "adds x8, x8, x6\n\t" + "mul x5, x26, x10\n\t" + "adc x7, x7, xzr\n\t" + "str x8, [%[a], 488]\n\t" + "umulh x6, x26, x10\n\t" + "adds x9, x9, x5\n\t" "# a[i+63] += m[63] * mu\n\t" - "ldr x10, [%[a], 504]\n\t" - "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 504]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" - "adds x6, x6, x7\n\t" - "adcs x8, x8, x3\n\t" - "str x11, [%[a], 496]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 512]\n\t" - "str x10, [%[a], 504]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 512]\n\t" + "adds x9, x9, x7\n\t" + "mul 
x5, x27, x10\n\t" + "adc x6, x6, xzr\n\t" + "str x9, [%[a], 496]\n\t" + "umulh x7, x27, x10\n\t" + "ldp x8, x9, [%[a], 504]\n\t" + "adds x5, x5, x6\n\t" + "adcs x7, x7, x3\n\t" + "cset x3, cs\n\t" + "adds x8, x8, x5\n\t" + "str x8, [%[a], 504]\n\t" + "adcs x9, x9, x7\n\t" + "str x9, [%[a], 512]\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" - "mov x9, %[a]\n\t" + "mov %[mp], %[a]\n\t" "sub %[a], %[a], 512\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, [x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" - "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - "ldp x5, x6, [%[m], 128]\n\t" - "ldp x7, x8, [%[m], 144]\n\t" - "ldp x12, x13, [x9, 128]\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 144]\n\t" + "subs x11, x11, x4\n\t" "and x6, x6, x3\n\t" "sbcs x12, x12, x5\n\t" "and x7, x7, x3\n\t" "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 128]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 144]\n\t" - "ldp x5, x6, [%[m], 160]\n\t" - "ldp x7, x8, [%[m], 176]\n\t" - "ldp x12, x13, [x9, 160]\n\t" + "stp x11, x12, [%[a], 0]\n\t" + "sbcs x14, x14, x7\n\t" + "stp x13, x14, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 176]\n\t" + "sbcs x15, x15, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x16, x16, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 160]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 176]\n\t" - "ldp x5, x6, [%[m], 192]\n\t" - "ldp x7, x8, [%[m], 208]\n\t" - "ldp x12, x13, [x9, 192]\n\t" + "sbcs x17, x17, x6\n\t" + "stp x15, x16, [%[a], 32]\n\t" + "sbcs x19, x19, x7\n\t" + "stp x17, x19, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 208]\n\t" + "sbcs x20, x20, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x21, x21, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, 
x13, x6\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 192]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 208]\n\t" - "ldp x5, x6, [%[m], 224]\n\t" - "ldp x7, x8, [%[m], 240]\n\t" - "ldp x12, x13, [x9, 224]\n\t" + "sbcs x22, x22, x6\n\t" + "stp x20, x21, [%[a], 64]\n\t" + "sbcs x23, x23, x7\n\t" + "stp x22, x23, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "ldp x8, x9, [%[mp], 96]\n\t" + "ldp x10, x11, [%[mp], 112]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 240]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 96]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 224]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 240]\n\t" - "ldp x5, x6, [%[m], 256]\n\t" - "ldp x7, x8, [%[m], 272]\n\t" - "ldp x12, x13, [x9, 256]\n\t" + "stp x10, x11, [%[a], 112]\n\t" + "ldp x4, x5, [%[m], 128]\n\t" + "ldp x6, x7, [%[m], 144]\n\t" + "ldp x8, x9, [%[mp], 128]\n\t" + "ldp x10, x11, [%[mp], 144]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 272]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 128]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 256]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 272]\n\t" - "ldp x5, x6, [%[m], 288]\n\t" - "ldp x7, x8, [%[m], 304]\n\t" - "ldp x12, x13, [x9, 288]\n\t" + "stp x10, x11, [%[a], 144]\n\t" + "ldp x4, x5, [%[m], 160]\n\t" + "ldp x6, x7, [%[m], 176]\n\t" + "ldp x8, x9, [%[mp], 160]\n\t" + "ldp x10, x11, [%[mp], 176]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 304]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 160]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 288]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 304]\n\t" - "ldp x5, x6, [%[m], 320]\n\t" - "ldp x7, x8, [%[m], 336]\n\t" - "ldp x12, x13, [x9, 320]\n\t" + "stp x10, x11, [%[a], 176]\n\t" + "ldp x4, x5, [%[m], 192]\n\t" + "ldp x6, x7, [%[m], 208]\n\t" + "ldp x8, x9, [%[mp], 192]\n\t" + "ldp x10, x11, [%[mp], 208]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 336]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 192]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 320]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 336]\n\t" - "ldp x5, x6, [%[m], 352]\n\t" - "ldp x7, x8, [%[m], 368]\n\t" - "ldp x12, x13, [x9, 352]\n\t" + "stp x10, x11, [%[a], 208]\n\t" + "ldp x4, x5, [%[m], 224]\n\t" + "ldp x6, x7, [%[m], 240]\n\t" + "ldp x8, x9, [%[mp], 224]\n\t" + "ldp x10, x11, [%[mp], 240]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 368]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 224]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, 
[%[a], 352]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 368]\n\t" - "ldp x5, x6, [%[m], 384]\n\t" - "ldp x7, x8, [%[m], 400]\n\t" - "ldp x12, x13, [x9, 384]\n\t" + "stp x10, x11, [%[a], 240]\n\t" + "ldp x4, x5, [%[m], 256]\n\t" + "ldp x6, x7, [%[m], 272]\n\t" + "ldp x8, x9, [%[mp], 256]\n\t" + "ldp x10, x11, [%[mp], 272]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 400]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 256]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 384]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 400]\n\t" - "ldp x5, x6, [%[m], 416]\n\t" - "ldp x7, x8, [%[m], 432]\n\t" - "ldp x12, x13, [x9, 416]\n\t" + "stp x10, x11, [%[a], 272]\n\t" + "ldp x4, x5, [%[m], 288]\n\t" + "ldp x6, x7, [%[m], 304]\n\t" + "ldp x8, x9, [%[mp], 288]\n\t" + "ldp x10, x11, [%[mp], 304]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 432]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 288]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 416]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 432]\n\t" - "ldp x5, x6, [%[m], 448]\n\t" - "ldp x7, x8, [%[m], 464]\n\t" - "ldp x12, x13, [x9, 448]\n\t" + "stp x10, x11, [%[a], 304]\n\t" + "ldp x4, x5, [%[m], 320]\n\t" + "ldp x6, x7, [%[m], 336]\n\t" + "ldp x8, x9, [%[mp], 320]\n\t" + "ldp x10, x11, [%[mp], 336]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 464]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 320]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 448]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 464]\n\t" - "ldp x5, x6, [%[m], 480]\n\t" - "ldp x7, x8, [%[m], 496]\n\t" - "ldp x12, x13, [x9, 480]\n\t" + "stp x10, x11, [%[a], 336]\n\t" + "ldp x4, x5, [%[m], 352]\n\t" + "ldp x6, x7, [%[m], 368]\n\t" + "ldp x8, x9, [%[mp], 352]\n\t" + "ldp x10, x11, [%[mp], 368]\n\t" + "and x4, x4, x3\n\t" "and x5, x5, x3\n\t" - "ldp x11, x10, [x9, 496]\n\t" + "sbcs x8, x8, x4\n\t" "and x6, x6, x3\n\t" - "sbcs x12, x12, x5\n\t" + "sbcs x9, x9, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x6\n\t" - "and x8, x8, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 352]\n\t" "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 480]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 496]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "stp x10, x11, [%[a], 368]\n\t" + "ldp x4, x5, [%[m], 384]\n\t" + "ldp x6, x7, [%[m], 400]\n\t" + "ldp x8, x9, [%[mp], 384]\n\t" + "ldp x10, x11, [%[mp], 400]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 384]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 400]\n\t" + "ldp x4, x5, [%[m], 416]\n\t" + "ldp x6, x7, [%[m], 432]\n\t" + "ldp x8, x9, [%[mp], 416]\n\t" + "ldp x10, x11, 
[%[mp], 432]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 416]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 432]\n\t" + "ldp x4, x5, [%[m], 448]\n\t" + "ldp x6, x7, [%[m], 464]\n\t" + "ldp x8, x9, [%[mp], 448]\n\t" + "ldp x10, x11, [%[mp], 464]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 448]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 464]\n\t" + "ldp x4, x5, [%[m], 480]\n\t" + "ldp x6, x7, [%[m], 496]\n\t" + "ldp x8, x9, [%[mp], 480]\n\t" + "ldp x10, x11, [%[mp], 496]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x8, x8, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x9, x9, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x10, x10, x6\n\t" + "stp x8, x9, [%[a], 480]\n\t" + "sbcs x11, x11, x7\n\t" + "stp x10, x11, [%[a], 496]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x10", "x8", "x9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" ); } @@ -16895,7 +18948,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_64(r, a, b); @@ -16909,7 +18962,7 @@ static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_64(r, a); @@ -17134,38 +19187,38 @@ static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. 
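+ *
+ * Estimation note (added commentary, not generated code): the quotient is
+ * built 32 bits at a time by dividing by ((div >> 32) + 1), which can only
+ * under-estimate, and the remainder is then fixed up with a conditional
+ * subtraction before the final udiv.  A rough C model of what this routine
+ * computes, assuming a compiler-provided unsigned __int128 (an illustrative
+ * sketch only; the assembly below avoids the wide division):
+ *
+ *     return (sp_digit)((((unsigned __int128)d1 << 64) | d0) / div);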
*/ -static sp_digit div_4096_word_64(sp_digit d1, sp_digit d0, sp_digit div) +static sp_digit div_4096_word_64_cond(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "b.lt 1f\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + "1:\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -17174,23 +19227,25 @@ static sp_digit div_4096_word_64(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "cmp x3, x5\n\t" + "b.lt 2f\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + "2:\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); - return r; + return d1; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -17213,9 +19268,13 @@ static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, s div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_4096_word_64(hi, t1[64 + i - 1], div); + for (i = 63; i >= 0; i--) { + if (t1[64 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_4096_word_64_cond(t1[64 + i], t1[64 + i - 1], div); + } sp_4096_mul_d_64(t2, d, r1); t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2); @@ -17524,6 +19583,67 @@ static sp_digit sp_4096_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig #endif /* WOLFSSL_SP_SMALL */ } +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. 
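+ *
+ * Note (added commentary): this performs the same 32-bit-at-a-time quotient
+ * estimation as div_4096_word_64_cond above, but the correction step is
+ * applied with cset/csetm masks instead of a conditional branch, so the
+ * instruction sequence is intended to execute in constant time.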
+ */ +static sp_digit div_4096_word_64(sp_digit d1, sp_digit d0, sp_digit div) +{ + __asm__ __volatile__ ( + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" + + "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" + "lsl x6, x3, 32\n\t" + "mul x4, %[div], x6\n\t" + "umulh x3, %[div], x6\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "umulh x3, %[div], x3\n\t" + "subs %[d0], %[d0], x4\n\t" + "sbc %[d1], %[d1], x3\n\t" + + "extr x3, %[d1], %[d0], 32\n\t" + + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" + + "udiv x3, %[d0], %[div]\n\t" + "add %[d1], x6, x3\n\t" + + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return d1; +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -17565,483 +19685,427 @@ static sp_int64 sp_4096_cmp_64(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 504\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #64\n\t" + "add %[a], %[a], #496\n\t" + "add %[b], %[b], #496\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 496]\n\t" - "ldp x7, x8, [%[b], 496]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 496]\n\t" + "ldp x8, x9, [%[b], 496]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 480]\n\t" - "ldp x7, x8, [%[b], 480]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 480]\n\t" + "ldp x8, x9, [%[b], 480]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, 
x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 464]\n\t" - "ldp x7, x8, [%[b], 464]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 464]\n\t" + "ldp x8, x9, [%[b], 464]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 448]\n\t" - "ldp x7, x8, [%[b], 448]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 448]\n\t" + "ldp x8, x9, [%[b], 448]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 432]\n\t" - "ldp x7, x8, [%[b], 432]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 432]\n\t" + "ldp x8, x9, [%[b], 432]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 416]\n\t" - "ldp x7, x8, [%[b], 416]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 416]\n\t" + "ldp x8, x9, [%[b], 416]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 400]\n\t" - "ldp x7, x8, [%[b], 400]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 400]\n\t" + "ldp x8, x9, [%[b], 400]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 384]\n\t" - "ldp x7, x8, [%[b], 384]\n\t" - "and x6, x6, x4\n\t" - "and 
x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 384]\n\t" + "ldp x8, x9, [%[b], 384]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 368]\n\t" - "ldp x7, x8, [%[b], 368]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 368]\n\t" + "ldp x8, x9, [%[b], 368]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 352]\n\t" - "ldp x7, x8, [%[b], 352]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 352]\n\t" + "ldp x8, x9, [%[b], 352]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 336]\n\t" - "ldp x7, x8, [%[b], 336]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 336]\n\t" + "ldp x8, x9, [%[b], 336]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 320]\n\t" - "ldp x7, x8, [%[b], 320]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 320]\n\t" + "ldp x8, x9, [%[b], 320]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 304]\n\t" - "ldp x7, x8, [%[b], 304]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 304]\n\t" + "ldp x8, x9, [%[b], 304]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, 
x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 288]\n\t" - "ldp x7, x8, [%[b], 288]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 288]\n\t" + "ldp x8, x9, [%[b], 288]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 272]\n\t" - "ldp x7, x8, [%[b], 272]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 272]\n\t" + "ldp x8, x9, [%[b], 272]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 256]\n\t" - "ldp x7, x8, [%[b], 256]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 256]\n\t" + "ldp x8, x9, [%[b], 256]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "ldp x7, x8, [%[b], 240]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 240]\n\t" + "ldp x8, x9, [%[b], 240]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 224]\n\t" + "ldp x8, x9, [%[b], 224]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 
208]\n\t" - "ldp x7, x8, [%[b], 208]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 208]\n\t" + "ldp x8, x9, [%[b], 208]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 192]\n\t" + "ldp x8, x9, [%[b], 192]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "ldp x7, x8, [%[b], 176]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 176]\n\t" + "ldp x8, x9, [%[b], 176]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 160]\n\t" + "ldp x8, x9, [%[b], 160]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "ldp x7, x8, [%[b], 144]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 144]\n\t" + "ldp x8, x9, [%[b], 144]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 128]\n\t" + "ldp x8, x9, [%[b], 128]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, 
lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - 
"csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -18068,7 +20132,7 @@ static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d, sp_dig div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { + for (i = 63; i >= 0; i--) { sp_digit hi = t1[64 + i] - (t1[64 + i] == div); r1 = div_4096_word_64(hi, t1[64 + i - 1], div); @@ -18454,9 +20518,9 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 64; r = a + 64 * 2; m = r + 64 * 2; - ah = a + 64; sp_4096_from_bin(ah, 64, in, inLen); #if DIGIT_BIT >= 64 @@ -18474,7 +20538,38 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_4096_from_mp(m, 64, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + + /* Convert to Montgomery form. 
*/ + XMEMSET(a, 0, sizeof(sp_digit) * 64); + err = sp_4096_mod_64_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_64(r, r, ah, m, mp); + + for (i = 63; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_4096_sub_in_place_64(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_4096_sqr_64(r, ah); err = sp_4096_mod_64_cond(r, r, m); @@ -18502,7 +20597,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 64); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_4096_mont_sqr_64(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_4096_mont_mul_64(r, r, a, m, mp); @@ -18537,6 +20632,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -18548,7 +20644,6 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, static sp_digit sp_4096_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -18566,134 +20661,12 @@ static sp_digit sp_4096_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, 
%[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 128]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "ldp x5, x7, [%[b], 192]\n\t" - "ldp x11, x12, [%[b], 208]\n\t" - "ldp x4, x6, [%[a], 192]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 208]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 192]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 208]\n\t" - "ldp x5, x7, [%[b], 224]\n\t" - "ldp x11, x12, [%[b], 240]\n\t" - "ldp x4, x6, [%[a], 224]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 240]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 224]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 240]\n\t" - "cset %[r], cs\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. * @@ -19692,10 +21665,10 @@ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[8]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 24\n\t" "csel x3, xzr, x3, cc\n\t" @@ -20398,12 +22371,10 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const "# - a[0] << 32 << 192\n\t" "# + (a[0] * 2) << 192\n\t" "# a[0]-a[2] << 32\n\t" - "lsl x10, x10, 32\n\t" + "extr x10, x10, x9, 32\n\t" "add x7, x11, x8\n\t" - "eor x10, x10, x9, lsr #32\n\t" - "lsl x9, x9, 32\n\t" + "extr x9, x9, x8, 32\n\t" "add x7, x7, x8\n\t" - "eor x9, x9, x8, lsr #32\n\t" "# + a[0]-a[2] << 32 << 64\n\t" "# - a[0] << 32 << 192\n\t" "adds x5, x5, x8, lsl #32\n\t" @@ -20425,16 +22396,13 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const "adcs x13, x13, x6\n\t" "lsr x16, x7, 32\n\t" "adcs x14, x14, x7\n\t" - "lsl x7, x7, 32\n\t" + "extr x7, x7, x6, 32\n\t" "adcs x15, x15, xzr\n\t" - "eor x7, x7, x6, lsr #32\n\t" + "extr x6, x6, x5, 32\n\t" "adc x8, x8, xzr\n\t" - "lsl x6, x6, 32\n\t" - "eor x6, x6, x5, lsr #32\n\t" "adds x11, x11, x6\n\t" - "lsl x5, x5, 32\n\t" + "extr x5, x5, x4, 32\n\t" "adcs x12, x12, x7\n\t" - "eor x5, x5, x4, lsr #32\n\t" "adcs x13, x13, x16\n\t" "lsl x4, x4, 32\n\t" "adcs x14, x14, xzr\n\t" @@ -20550,12 +22518,10 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const "# - a[0] << 32 << 192\n\t" "# + (a[0] * 2) << 192\n\t" "# a[0]-a[2] << 32\n\t" - "lsl x10, x10, 32\n\t" + "extr x10, x10, x9, 32\n\t" "add x6, x11, x8\n\t" - "eor x10, x10, x9, lsr #32\n\t" - "lsl x9, x9, 32\n\t" + "extr x9, x9, x8, 32\n\t" "add x6, x6, x8\n\t" - "eor x9, x9, x8, lsr #32\n\t" "# + 
a[0]-a[2] << 32 << 64\n\t" "# - a[0] << 32 << 192\n\t" "adds x4, x4, x8, lsl #32\n\t" @@ -20577,16 +22543,13 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const "adcs x13, x13, x5\n\t" "lsr x7, x6, 32\n\t" "adcs x14, x14, x6\n\t" - "lsl x6, x6, 32\n\t" + "extr x6, x6, x5, 32\n\t" "adcs x15, x15, xzr\n\t" - "eor x6, x6, x5, lsr #32\n\t" + "extr x5, x5, x4, 32\n\t" "adc x8, x8, xzr\n\t" - "lsl x5, x5, 32\n\t" - "eor x5, x5, x4, lsr #32\n\t" "adds x11, x11, x5\n\t" - "lsl x4, x4, 32\n\t" + "extr x4, x4, x3, 32\n\t" "adcs x12, x12, x6\n\t" - "eor x4, x4, x3, lsr #32\n\t" "adcs x13, x13, x7\n\t" "lsl x3, x3, 32\n\t" "adcs x14, x14, xzr\n\t" @@ -20728,63 +22691,67 @@ static sp_int64 sp_256_cmp_4(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 24\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #4\n\t" + "add %[a], %[a], #16\n\t" + "add %[b], %[b], #16\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[a], 16]\n\t" - "ldp x9, x10, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "and x8, x8, x4\n\t" - "and x12, x12, x4\n\t" - "subs x8, x8, x12\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x7, x7, x4\n\t" - "and x11, x11, x4\n\t" - "subs x7, x7, x11\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x6, x6, x4\n\t" - "and x10, x10, x4\n\t" - "subs x6, x6, x10\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x9, x9, x4\n\t" - "subs x5, x5, x9\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", 
"x9", "x10", "x11", "x12" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -20833,8 +22800,6 @@ static sp_digit sp_256_cond_sub_4(sp_digit* r, const sp_digit* a, const sp_digit return (sp_digit)r; } -#define sp_256_mont_reduce_order_4 sp_256_mont_reduce_4 - /* Reduce the number back to 256 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -20843,6 +22808,93 @@ static sp_digit sp_256_cond_sub_4(sp_digit* r, const sp_digit* a, const sp_digit */ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + __asm__ __volatile__ ( + "ldp x10, x11, [%[a], 0]\n\t" + "ldp x12, x13, [%[a], 16]\n\t" + "ldp x14, x15, [%[a], 32]\n\t" + "ldp x16, x17, [%[a], 48]\n\t" + "mov x3, x10\n\t" + "# Start Reduction\n\t" + "mov x4, x11\n\t" + "mov x5, x12\n\t" + "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t" + "# - a[0] << 32 << 192\n\t" + "# + (a[0] * 2) << 192\n\t" + "# a[0]-a[2] << 32\n\t" + "extr x12, x12, x11, 32\n\t" + "add x6, x13, x10\n\t" + "extr x11, x11, x10, 32\n\t" + "add x6, x6, x10\n\t" + "# + a[0]-a[2] << 32 << 64\n\t" + "# - a[0] << 32 << 192\n\t" + "adds x4, x4, x10, lsl #32\n\t" + "sub x6, x6, x10, lsl #32\n\t" + "adcs x5, x5, x11\n\t" + "adc x6, x6, x12\n\t" + "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" + "# a += mu << 256\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adcs x16, x16, x5\n\t" + "adcs x17, x17, x6\n\t" + "cset x10, cs\n\t" + "# a += mu << 192\n\t" + "# mu <<= 32\n\t" + "# a += (mu << 32) << 64\n\t" + "adds x13, x13, x3\n\t" + "adcs x14, x14, x4\n\t" + "adcs x15, x15, x5\n\t" + "lsr x7, x6, 32\n\t" + "adcs x16, x16, x6\n\t" + "extr x6, x6, x5, 32\n\t" + "adcs x17, x17, xzr\n\t" + "extr x5, x5, x4, 32\n\t" + "adc x10, x10, xzr\n\t" + "adds x13, x13, x5\n\t" + "extr x4, x4, x3, 32\n\t" + "adcs x14, x14, x6\n\t" + "adcs x15, x15, x7\n\t" + "lsl x3, x3, 32\n\t" + "adcs x16, x16, xzr\n\t" + "adcs x17, x17, xzr\n\t" + "adc x10, x10, xzr\n\t" + "# a -= (mu << 32) << 192\n\t" + "subs x13, x13, x3\n\t" + "sbcs x14, x14, x4\n\t" + "sbcs x15, x15, x5\n\t" + "sub x10, xzr, x10\n\t" + "sbcs x16, x16, x6\n\t" + "sub x10, x10, #1\n\t" + "sbcs x17, x17, x7\n\t" + "mov x9, 0xffffffff00000001\n\t" + "adc x10, x10, xzr\n\t" + "# mask m and sub from result if overflow\n\t" + "# m[0] = -1 & mask = mask\n\t" + "subs x14, x14, x10\n\t" + "# m[1] = 0xffffffff & mask = mask >> 32 as mask is all 1s or 0s\n\t" + "lsr x8, x10, 32\n\t" + "sbcs x15, x15, x8\n\t" + "and x9, x9, x10\n\t" + "# m[2] = 0 & mask = 0\n\t" + "sbcs x16, x16, xzr\n\t" + "stp x14, x15, [%[a], 0]\n\t" + "# m[3] = 0xffffffff00000001 & mask\n\t" + "sbc x17, x17, x9\n\t" + "stp x16, x17, [%[a], 16]\n\t" + : + : [a] "r" (a), [m] "r" (m), [mp] "r" (mp) + : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); +} +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +SP_NOINLINE static void sp_256_mont_reduce_order_4(sp_digit* a, const sp_digit* m, + sp_digit mp) { __asm__ __volatile__ ( "ldp x9, x10, [%[a], 0]\n\t" @@ -21200,18 +23252,14 @@ static void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m) "adds x3, x3, x10\n\t" "and x8, x10, 0xffffffff00000001\n\t" "adcs x4, x4, x7\n\t" - "lsr x3, x3, 1\n\t" "adcs x5, x5, xzr\n\t" - "lsr x7, x4, 1\n\t" + "extr x3, x4, x3, 1\n\t" "adcs x6, x6, x8\n\t" - "lsr x8, x5, 1\n\t" + "extr x4, x5, x4, 1\n\t" "cset x9, cs\n\t" - "lsr x10, x6, 1\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "orr x4, x7, x5, lsl 63\n\t" - "orr x5, x8, x6, lsl 63\n\t" + "extr x5, x6, x5, 1\n\t" + "extr x6, x9, x6, 1\n\t" "stp x3, x4, [%[r], 0]\n\t" - "orr x6, x10, x9, lsl 63\n\t" "stp x5, x6, [%[r], 16]\n\t" : : [r] "r" (r), [a] "r" (a), [m] "r" (m) @@ -37370,41 +39418,74 @@ static void sp_256_add_one_4(sp_digit* a) */ static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Generates a scalar that is in the range 1..order-1. 
@@ -37533,15 +39614,15 @@ static void sp_256_to_bin_4(sp_digit* r, byte* a) int i; int j = 0; - for (i = 3; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 3; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -37680,38 +39761,41 @@ static void sp_256_mul_d_4(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. */ static sp_digit div_256_word_4(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -37720,23 +39804,22 @@ static sp_digit div_256_word_4(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; } /* AND m into each word of a and store in r. 
@@ -37781,7 +39864,7 @@ static WC_INLINE int sp_256_div_4(const sp_digit* a, const sp_digit* d, sp_digit div = d[3]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 4); - for (i=3; i>=0; i--) { + for (i = 3; i >= 0; i--) { sp_digit hi = t1[4 + i] - (t1[4 + i] == div); r1 = div_256_word_4(hi, t1[4 + i - 1], div); @@ -38454,12 +40537,9 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "tst x7, 1\n\t" "b.ne 90f\n\t" "\n1:\n\t" - "lsr x7, x7, 1\n\t" - "lsr x26, x8, 1\n\t" - "lsr x27, x9, 1\n\t" - "orr x7, x7, x8, lsl 63\n\t" - "orr x8, x26, x9, lsl 63\n\t" - "orr x9, x27, x10, lsl 63\n\t" + "extr x7, x8, x7, 1\n\t" + "extr x8, x9, x8, 1\n\t" + "extr x9, x10, x9, 1\n\t" "lsr x10, x10, 1\n\t" "sub x24, x24, 1\n\t" "ands x25, x15, 1\n\t" @@ -38470,14 +40550,10 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "adcs x19, x19, %[m]\n\t" "cset x25, cs\n\t" "\n2:\n\t" - "lsr x15, x15, 1\n\t" - "lsr x26, x16, 1\n\t" - "lsr x27, x17, 1\n\t" - "lsr x28, x19, 1\n\t" - "orr x15, x15, x16, lsl 63\n\t" - "orr x16, x26, x17, lsl 63\n\t" - "orr x17, x27, x19, lsl 63\n\t" - "orr x19, x28, x25, lsl 63\n\t" + "extr x15, x16, x15, 1\n\t" + "extr x16, x17, x16, 1\n\t" + "extr x17, x19, x17, 1\n\t" + "extr x19, x25, x19, 1\n\t" "tst x7, 1\n\t" "b.eq 1b\n\t" "\n90:\n\t" @@ -38540,12 +40616,9 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "sub x23, x25, x23\n\t" "\n43:\n\t" "\n50:\n\t" - "lsr x3, x3, 1\n\t" - "lsr x26, x4, 1\n\t" - "lsr x27, x5, 1\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "orr x4, x26, x5, lsl 63\n\t" - "orr x5, x27, x6, lsl 63\n\t" + "extr x3, x4, x3, 1\n\t" + "extr x4, x5, x4, 1\n\t" + "extr x5, x6, x5, 1\n\t" "lsr x6, x6, 1\n\t" "sub x23, x23, 1\n\t" "ands x25, x11, 1\n\t" @@ -38556,14 +40629,10 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "adcs x14, x14, %[m]\n\t" "cset x25, cs\n\t" "\n51:\n\t" - "lsr x11, x11, 1\n\t" - "lsr x26, x12, 1\n\t" - "lsr x27, x13, 1\n\t" - "lsr x28, x14, 1\n\t" - "orr x11, x11, x12, lsl 63\n\t" - "orr x12, x26, x13, lsl 63\n\t" - "orr x13, x27, x14, lsl 63\n\t" - "orr x14, x28, x25, lsl 63\n\t" + "extr x11, x12, x11, 1\n\t" + "extr x12, x13, x12, 1\n\t" + "extr x13, x14, x13, 1\n\t" + "extr x14, x25, x14, 1\n\t" "tst x3, 1\n\t" "b.eq 50b\n\t" "b 90b\n\t" @@ -38608,12 +40677,9 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "sub x24, x25, x24\n\t" "\n73:\n\t" "\n80:\n\t" - "lsr x7, x7, 1\n\t" - "lsr x26, x8, 1\n\t" - "lsr x27, x9, 1\n\t" - "orr x7, x7, x8, lsl 63\n\t" - "orr x8, x26, x9, lsl 63\n\t" - "orr x9, x27, x10, lsl 63\n\t" + "extr x7, x8, x7, 1\n\t" + "extr x8, x9, x8, 1\n\t" + "extr x9, x10, x9, 1\n\t" "lsr x10, x10, 1\n\t" "sub x24, x24, 1\n\t" "ands x25, x15, 1\n\t" @@ -38624,14 +40690,10 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "adcs x19, x19, %[m]\n\t" "cset x25, cs\n\t" "\n81:\n\t" - "lsr x15, x15, 1\n\t" - "lsr x26, x16, 1\n\t" - "lsr x27, x17, 1\n\t" - "lsr x28, x19, 1\n\t" - "orr x15, x15, x16, lsl 63\n\t" - "orr x16, x26, x17, lsl 63\n\t" - "orr x17, x27, x19, lsl 63\n\t" - "orr x19, x28, x25, lsl 63\n\t" + "extr x15, x16, x15, 1\n\t" + "extr x16, x17, x16, 1\n\t" + "extr x17, x19, x17, 1\n\t" + "extr x19, x25, x19, 1\n\t" "tst x7, 1\n\t" "b.eq 80b\n\t" "b 90b\n\t" @@ -38649,7 +40711,7 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, "\n102:\n\t" : [m] "+r" (m) : [r] "r" (r), [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", 
"x28" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25" ); return MP_OKAY; @@ -39663,10 +41725,10 @@ static void sp_384_mul_6(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[12]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 40\n\t" "csel x3, xzr, x3, cc\n\t" @@ -39958,10 +42020,10 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) sp_digit tmp[12]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 40\n\t" "csel x3, xzr, x3, cc\n\t" @@ -40599,16 +42661,14 @@ SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m, "mov x6, xzr\n\t" "# a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)\n\t" "ldp x13, x14, [%[a], #48]\n\t" - "lsl x2, x8, 32\n\t" - "lsl x1, x7, 32\n\t" - "orr x2, x2, x7, lsr 32\n\t" + "extr x2, x8, x7, 32\n\t" + "extr x1, x7, xzr, 32\n\t" "adds x1, x1, x7\n\t" "adc x2, x2, x8\n\t" "add x2, x2, x7\n\t" - "lsl x3, x1, 32\n\t" - "lsl x4, x2, 32\n\t" - "orr x4, x4, x1, lsr 32\n\t" - "lsr x5, x2, 32\n\t" + "extr x5, xzr, x2, 32\n\t" + "extr x4, x2, x1, 32\n\t" + "extr x3, x1, xzr, 32\n\t" "adds x7, x7, x3\n\t" "adcs x8, x8, x4\n\t" "adcs x9, x9, x5\n\t" @@ -40631,16 +42691,14 @@ SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m, "sbc x6, x6, xzr\n\t" "# a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)\n\t" "ldp x7, x8, [%[a], #64]\n\t" - "lsl x2, x10, 32\n\t" - "lsl x1, x9, 32\n\t" - "orr x2, x2, x9, lsr 32\n\t" + "extr x2, x10, x9, 32\n\t" + "extr x1, x9, xzr, 32\n\t" "adds x1, x1, x9\n\t" "adc x2, x2, x10\n\t" "add x2, x2, x9\n\t" - "lsl x3, x1, 32\n\t" - "lsl x4, x2, 32\n\t" - "orr x4, x4, x1, lsr 32\n\t" - "lsr x5, x2, 32\n\t" + "extr x5, xzr, x2, 32\n\t" + "extr x4, x2, x1, 32\n\t" + "extr x3, x1, xzr, 32\n\t" "adds x7, x7, x6\n\t" "adcs x8, x8, xzr\n\t" "adc x6, xzr, xzr\n\t" @@ -40666,16 +42724,14 @@ SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m, "sbc x6, x6, xzr\n\t" "# a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)\n\t" "ldp x9, x10, [%[a], #80]\n\t" - "lsl x2, x12, 32\n\t" - "lsl x1, x11, 32\n\t" - "orr x2, x2, x11, lsr 32\n\t" + "extr x2, x12, x11, 32\n\t" + "extr x1, x11, xzr, 32\n\t" "adds x1, x1, x11\n\t" "adc x2, x2, x12\n\t" "add x2, x2, x11\n\t" - "lsl x3, x1, 32\n\t" - "lsl x4, x2, 32\n\t" - "orr x4, x4, x1, lsr 32\n\t" - "lsr x5, x2, 32\n\t" + "extr x5, xzr, x2, 32\n\t" + "extr x4, x2, x1, 32\n\t" + "extr x3, x1, xzr, 32\n\t" "adds x9, x9, x6\n\t" "adcs x10, x10, xzr\n\t" "adc x6, xzr, xzr\n\t" @@ -40729,99 +42785,92 @@ SP_NOINLINE static void sp_384_mont_reduce_6(sp_digit* a, const sp_digit* m, SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "mov x3, xzr\n\t" - "# i = 6\n\t" - "mov x4, 6\n\t" "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "ldp x16, x17, [%[a], 32]\n\t" + "mov x3, xzr\n\t" + "# i = 0..5\n\t" + "mov x4, 6\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" "mul x9, %[mp], x12\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, 
x9\n\t" + "ldp x10, x11, [%[m], 0]\n\t" + "mul x7, x10, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x12, x12, x7\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" + "mul x7, x11, x9\n\t" + "umulh x8, x11, x9\n\t" "adds x12, x13, x7\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" + "ldp x11, x10, [%[m], 16]\n\t" "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x13, x14, x7\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x14, x15, x7\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" + "ldp x11, x10, [%[m], 32]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" + "adds x14, x14, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x15, x16, x7\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" + "ldr x10, [%[m], 40]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" + "adds x15, x15, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "str x11, [%[a], 32]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 48]\n\t" - "str x10, [%[a], 40]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 48]\n\t" + "cset x3, cs\n\t" + "adds x16, x17, x6\n\t" + "ldr x17, [%[a], 48]\n\t" + "adcs x17, x17, x8\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" "# Create mask\n\t" "neg x3, x3\n\t" "mov x9, %[a]\n\t" "sub %[a], %[a], 48\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "subs x12, x12, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x13, x13, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x14, x14, x6\n\t" "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "sbcs x13, x13, x20\n\t" - "stp x12, x13, [%[a], 32]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20" + "sbcs x15, x15, x7\n\t" + "stp x14, x15, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x16, x16, x4\n\t" + "sbcs x17, x17, x5\n\t" + "stp x16, x17, [%[a], 32]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); } @@ -40835,7 +42884,7 @@ SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_6(r, a, b); @@ -40849,7 +42898,7 @@ static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_6(r, a); @@ -40979,77 +43028,79 @@ static sp_int64 sp_384_cmp_6(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 40\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #6\n\t" + "add %[a], %[a], #32\n\t" + "add %[b], %[b], #32\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[a], 16]\n\t" - "ldp x9, x10, [%[a], 32]\n\t" - "ldp x11, x12, [%[b], 0]\n\t" - "ldp x13, x14, [%[b], 16]\n\t" - "ldp x15, x16, [%[b], 32]\n\t" - "and x10, x10, x4\n\t" - "and x16, x16, x4\n\t" - "subs x10, x10, x16\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x9, x9, x4\n\t" - "and x15, x15, x4\n\t" - "subs x9, x9, x15\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x8, x8, x4\n\t" - "and x14, x14, x4\n\t" - "subs x8, x8, x14\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x7, x7, x4\n\t" - "and x13, x13, x4\n\t" - "subs x7, x7, x13\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x6, x6, x4\n\t" - "and x12, x12, x4\n\t" - "subs x6, x6, x12\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x11, x11, x4\n\t" - "subs x5, x5, x11\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, 
x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -41194,6 +43245,7 @@ static void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) sp_384_cond_sub_6(r, r, m, 0 - o); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -41205,7 +43257,6 @@ static void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) static sp_digit sp_384_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -41223,43 +43274,57 @@ static sp_digit sp_384_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_384_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ __asm__ __volatile__ ( - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" - "adcs x6, x6, x7\n\t" - "stp x4, x6, [%[r], 32]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "stp x4, x5, [%[r], 32]\n\t" "cset %[r], cs\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). 
* @@ -41283,23 +43348,18 @@ static void sp_384_rshift1_6(sp_digit* r, const sp_digit* a) "ldp x2, x3, [%[a]]\n\t" "ldp x4, x5, [%[a], 16]\n\t" "ldp x6, x7, [%[a], 32]\n\t" - "lsr x11, x6, 1\n\t" - "lsr x10, x5, 1\n\t" - "lsr x9, x4, 1\n\t" - "lsr x8, x3, 1\n\t" - "lsr x2, x2, 1\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "orr x3, x8, x4, lsl 63\n\t" - "orr x4, x9, x5, lsl 63\n\t" - "orr x5, x10, x6, lsl 63\n\t" - "orr x6, x11, x7, lsl 63\n\t" - "lsr x7, x7, 1\n\t" + "extr x2, x3, x2, #1\n\t" + "extr x3, x4, x3, #1\n\t" + "extr x4, x5, x4, #1\n\t" + "extr x5, x6, x5, #1\n\t" + "extr x6, x7, x6, #1\n\t" + "lsr x7, x7, #1\n\t" "stp x2, x3, [%[r]]\n\t" "stp x4, x5, [%[r], 16]\n\t" "stp x6, x7, [%[r], 32]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7" ); } @@ -63233,41 +65293,74 @@ static void sp_384_add_one_6(sp_digit* a) */ static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Generates a scalar that is in the range 1..order-1. 
@@ -63396,15 +65489,15 @@ static void sp_384_to_bin_6(sp_digit* r, byte* a) int i; int j = 0; - for (i = 5; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 5; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -63527,9 +65620,9 @@ static void sp_384_mul_d_6(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -63553,90 +65646,94 @@ static void sp_384_mul_d_6(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "adc x3, x3, x7\n\t" - "stp x5, x3, [%[r], 40]\n\t" + "str x5, [%[r], 40]\n\t" + "str x3, [%[r], 48]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. 
*/ static sp_digit div_384_word_6(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -63645,23 +65742,22 @@ static sp_digit div_384_word_6(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; } /* AND m into each word of a and store in r. @@ -63708,7 +65804,7 @@ static WC_INLINE int sp_384_div_6(const sp_digit* a, const sp_digit* d, sp_digit div = d[5]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 6); - for (i=5; i>=0; i--) { + for (i = 5; i >= 0; i--) { sp_digit hi = t1[6 + i] - (t1[6 + i] == div); r1 = div_384_word_6(hi, t1[6 + i - 1], div); @@ -64278,46 +66374,40 @@ static void sp_384_div2_mod_6(sp_digit* r, const sp_digit* a, const sp_digit* m) { __asm__ __volatile__ ( - "ldr x3, [%[a], 0]\n\t" - "ldr x4, [%[a], 8]\n\t" - "ldr x5, [%[a], 16]\n\t" - "ldr x6, [%[a], 24]\n\t" - "ldr x7, [%[a], 32]\n\t" - "ldr x8, [%[a], 40]\n\t" - "ldr x9, [%[m], 0]\n\t" - "ldr x10, [%[m], 8]\n\t" - "ldr x11, [%[m], 16]\n\t" - "ldr x12, [%[m], 24]\n\t" - "ldr x13, [%[m], 32]\n\t" - "ldr x14, [%[m], 40]\n\t" - "ands x15, x3, 1\n\t" - "b.eq 1f\n\t" - "adds x3, x3, x9\n\t" - "adcs x4, x4, x10\n\t" - "adcs x5, x5, x11\n\t" - "adcs x6, x6, x12\n\t" - "adcs x7, x7, x13\n\t" - "adcs x8, x8, x14\n\t" - "cset x15, cs\n\t" + "ldr x3, [%[a], 0]\n\t" + "ldr x4, [%[a], 8]\n\t" + "ldr x5, [%[a], 16]\n\t" + "ldr x6, [%[a], 24]\n\t" + "ldr x7, [%[a], 32]\n\t" + "ldr x8, [%[a], 40]\n\t" + "ldr x9, [%[m], 0]\n\t" + "ldr x10, [%[m], 8]\n\t" + "ldr x11, [%[m], 16]\n\t" + "ldr x12, [%[m], 24]\n\t" + "ldr x13, [%[m], 32]\n\t" + "ldr x14, [%[m], 40]\n\t" + "ands x15, x3, 1\n\t" + "b.eq 1f\n\t" + "adds x3, x3, x9\n\t" + "adcs x4, x4, x10\n\t" + "adcs x5, x5, x11\n\t" + "adcs x6, x6, x12\n\t" + "adcs x7, x7, x13\n\t" + "adcs x8, x8, x14\n\t" + "cset x15, cs\n\t" "\n1:\n\t" - "lsr x3, x3, 1\n\t" - "lsr x10, x4, 1\n\t" - "lsr x11, x5, 1\n\t" - "lsr x12, x6, 1\n\t" - "lsr x13, x7, 1\n\t" - "lsr x14, x8, 1\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "orr x4, x10, x5, lsl 63\n\t" - "orr x5, x11, x6, lsl 63\n\t" - "orr x6, x12, x7, lsl 63\n\t" - "orr 
x7, x13, x8, lsl 63\n\t" - "orr x8, x14, x15, lsl 63\n\t" - "str x3, [%[r], 0]\n\t" - "str x4, [%[r], 8]\n\t" - "str x5, [%[r], 16]\n\t" - "str x6, [%[r], 24]\n\t" - "str x7, [%[r], 32]\n\t" - "str x8, [%[r], 40]\n\t" + "extr x3, x4, x3, 1\n\t" + "extr x4, x5, x4, 1\n\t" + "extr x5, x6, x5, 1\n\t" + "extr x6, x7, x6, 1\n\t" + "extr x7, x8, x7, 1\n\t" + "extr x8, x15, x8, 1\n\t" + "str x3, [%[r], 0]\n\t" + "str x4, [%[r], 8]\n\t" + "str x5, [%[r], 16]\n\t" + "str x6, [%[r], 24]\n\t" + "str x7, [%[r], 32]\n\t" + "str x8, [%[r], 40]\n\t" : : [r] "r" (r), [a] "r" (a), [m] "r" (m) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15" @@ -64337,7 +66427,7 @@ static int sp_384_num_bits_64_6(sp_digit n) : "x1" ); - return r + 1; + return (int)(r + 1); } static int sp_384_num_bits_6(const sp_digit* a) @@ -65487,10 +67577,10 @@ static void sp_521_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[18]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 64\n\t" "csel x3, xzr, x3, cc\n\t" @@ -66062,10 +68152,10 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) sp_digit tmp[18]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 64\n\t" "csel x3, xzr, x3, cc\n\t" @@ -66135,7 +68225,7 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "mul x2, x10, x10\n\t" "umulh x3, x10, x10\n\t" "str x2, [%[r]]\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "# A[0] * A[1]\n\t" "mul x8, x10, x11\n\t" "umulh x9, x10, x11\n\t" @@ -66209,8 +68299,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "adcs x4, x4, x9\n\t" "umulh x6, x10, x15\n\t" "adc x2, x2, xzr\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[4]\n\t" "mul x8, x11, x14\n\t" "umulh x9, x11, x14\n\t" @@ -66233,8 +68323,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[0] * A[6]\n\t" "mul x5, x10, x16\n\t" "umulh x6, x10, x16\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[5]\n\t" "mul x8, x11, x15\n\t" "umulh x9, x11, x15\n\t" @@ -66263,8 +68353,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[0] * A[7]\n\t" "mul x5, x10, x17\n\t" "umulh x6, x10, x17\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[6]\n\t" "mul x8, x11, x16\n\t" "umulh x9, x11, x16\n\t" @@ -66293,8 +68383,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[0] * A[8]\n\t" "mul x5, x10, x19\n\t" "umulh x6, x10, x19\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[7]\n\t" "mul x8, x11, x17\n\t" "umulh x9, x11, x17\n\t" @@ -66329,8 +68419,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[1] * A[8]\n\t" "mul x5, x11, x19\n\t" "umulh x6, x11, x19\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" "# A[2] * A[7]\n\t" "mul x8, x12, x17\n\t" "umulh x9, x12, x17\n\t" @@ -66359,8 +68449,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[2] * A[8]\n\t" "mul x5, x12, x19\n\t" "umulh x6, x12, x19\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" "# A[3] * A[7]\n\t" "mul x8, x13, x17\n\t" "umulh x9, x13, x17\n\t" @@ 
-66389,8 +68479,8 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "# A[3] * A[8]\n\t" "mul x5, x13, x19\n\t" "umulh x6, x13, x19\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[4] * A[7]\n\t" "mul x8, x14, x17\n\t" "umulh x9, x14, x17\n\t" @@ -66488,7 +68578,7 @@ static void sp_521_sqr_9(sp_digit* r, const sp_digit* a) "stp x3, x4, [%[r], 128]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19" ); } @@ -66855,6 +68945,7 @@ static void sp_521_lshift_18(sp_digit* r, const sp_digit* a, byte n) static void sp_521_rshift_9(sp_digit* r, const sp_digit* a, byte n) { + sp_uint64 nl = n; __asm__ __volatile__ ( "mov x6, 64\n\t" "sub x6, x6, %[n]\n\t" @@ -66900,7 +68991,7 @@ static void sp_521_rshift_9(sp_digit* r, const sp_digit* a, byte n) "orr x3, x3, x5\n\t" "stp x3, x4, [%[r], 56]\n\t" : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : [r] "r" (r), [a] "r" (a), [n] "r" (nl) : "memory", "x2", "x3", "x4", "x5", "x6" ); } @@ -67083,9 +69174,9 @@ static void sp_521_mul_d_9(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -67109,116 +69200,120 @@ static void sp_521_mul_d_9(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldr x8, [%[a], 48]\n\t" + "ldr x9, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" - "ldp x8, x9, [%[a], 56]\n\t" + "ldp x9, x10, [%[a], 
56]\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" "str x4, [%[r], 56]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "adc x3, x3, x7\n\t" - "stp x5, x3, [%[r], 64]\n\t" + "str x5, [%[r], 64]\n\t" + "str x3, [%[r], 72]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. */ static sp_digit div_521_word_9(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -67227,23 +69322,22 @@ static sp_digit div_521_word_9(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; } /* AND m into each word of a and store in r. 
@@ -67284,99 +69378,105 @@ static sp_int64 sp_521_cmp_9(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 64\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #8\n\t" + "add %[a], %[a], #56\n\t" + "add %[b], %[b], #56\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "ldr x6, [%[a], 8]\n\t" + "ldr x8, [%[b], 8]\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[a], 16]\n\t" - "ldp x9, x10, [%[a], 32]\n\t" - "ldp x11, x12, [%[a], 48]\n\t" - "ldr x13, [%[a], 64]\n\t" - "ldp x14, x15, [%[b], 0]\n\t" - "ldp x16, x17, [%[b], 16]\n\t" - "ldp x18, x19, [%[b], 32]\n\t" - "ldp x20, x21, [%[b], 48]\n\t" - "ldr x22, [%[b], 64]\n\t" - "and x13, x13, x4\n\t" - "and x22, x22, x4\n\t" - "subs x13, x13, x22\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x12, x12, x4\n\t" - "and x21, x21, x4\n\t" - "subs x12, x12, x21\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x11, x11, x4\n\t" - "and x20, x20, x4\n\t" - "subs x11, x11, x20\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x10, x10, x4\n\t" - "and x19, x19, x4\n\t" - "subs x10, x10, x19\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x9, x9, x4\n\t" - "and x18, x18, x4\n\t" - "subs x9, x9, x18\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x8, x8, x4\n\t" - "and x17, x17, x4\n\t" - "subs x8, x8, x17\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x7, x7, x4\n\t" - "and x16, x16, x4\n\t" - "subs x7, x7, x16\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x6, x6, x4\n\t" - "and x15, x15, x4\n\t" - "subs x6, x6, x15\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x14, x14, x4\n\t" - "subs x5, x5, x14\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 56]\n\t" + "ldp x8, x9, [%[b], 56]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" 
+ "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 40]\n\t" + "ldp x8, x9, [%[b], 40]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 24]\n\t" + "ldp x8, x9, [%[b], 24]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 8]\n\t" + "ldp x8, x9, [%[b], 8]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldr x6, [%[a]]\n\t" + "ldr x8, [%[b]]\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -67412,7 +69512,7 @@ static WC_INLINE int sp_521_div_9(const sp_digit* a, const sp_digit* d, sp_digit sp_521_lshift_9(sd, d, 55); sp_521_lshift_18(t1, t1, 55); - for (i=8; i>=0; i--) { + for (i = 8; i >= 0; i--) { sp_digit hi = t1[9 + i] - (t1[9 + i] == div); r1 = div_521_word_9(hi, t1[9 + i - 1], div); @@ -67668,35 +69768,35 @@ static void sp_521_cond_copy_9(sp_digit* r, const sp_digit* a, sp_digit m) "ldp x12, x13, [%[a], 0]\n\t" "ldp x14, x15, [%[a], 16]\n\t" "ldp x16, x17, [%[a], 32]\n\t" - "ldp x18, x19, [%[a], 48]\n\t" - "ldr x20, [%[a], 64]\n\t" + "ldp x19, x20, [%[a], 48]\n\t" + "ldr x21, [%[a], 64]\n\t" "eor x12, x12, x3\n\t" "eor x13, x13, x4\n\t" "eor x14, x14, x5\n\t" "eor x15, x15, x6\n\t" "eor x16, x16, x7\n\t" "eor x17, x17, x8\n\t" - "eor x18, x18, x9\n\t" - "eor x19, x19, x10\n\t" - "eor x20, x20, x11\n\t" + "eor x19, x19, x9\n\t" + "eor x20, x20, x10\n\t" + "eor x21, x21, x11\n\t" "and x12, x12, %[m]\n\t" "and x13, x13, %[m]\n\t" "and x14, x14, %[m]\n\t" "and x15, x15, %[m]\n\t" "and x16, x16, %[m]\n\t" "and x17, x17, %[m]\n\t" - "and x18, x18, %[m]\n\t" "and x19, x19, %[m]\n\t" "and x20, x20, %[m]\n\t" + "and x21, x21, %[m]\n\t" "eor x3, x3, x12\n\t" "eor x4, x4, x13\n\t" "eor x5, x5, x14\n\t" "eor x6, x6, x15\n\t" "eor x7, x7, x16\n\t" "eor x8, x8, x17\n\t" - "eor x9, x9, x18\n\t" - "eor x10, x10, x19\n\t" - "eor x11, x11, x20\n\t" + "eor x9, x9, x19\n\t" + "eor x10, x10, x20\n\t" + "eor x11, x11, x21\n\t" "stp x3, x4, [%[r], 0]\n\t" "stp x5, x6, [%[r], 16]\n\t" "stp x7, x8, [%[r], 32]\n\t" @@ -67704,7 +69804,7 @@ static void sp_521_cond_copy_9(sp_digit* r, const sp_digit* a, sp_digit m) "str x11, [%[r], 64]\n\t" : : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21" ); } @@ -68318,7 +70418,7 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const 
sp_digit* a, const "mul x2, x10, x10\n\t" "umulh x3, x10, x10\n\t" "str x2, [x29]\n\t" - "mov x4, 0\n\t" + "mov x4, xzr\n\t" "# A[0] * A[1]\n\t" "mul x8, x10, x11\n\t" "umulh x9, x10, x11\n\t" @@ -68392,8 +70492,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "adcs x4, x4, x9\n\t" "umulh x6, x10, x15\n\t" "adc x2, x2, xzr\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[4]\n\t" "mul x8, x11, x14\n\t" "umulh x9, x11, x14\n\t" @@ -68416,8 +70516,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[0] * A[6]\n\t" "mul x5, x10, x16\n\t" "umulh x6, x10, x16\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[5]\n\t" "mul x8, x11, x15\n\t" "umulh x9, x11, x15\n\t" @@ -68446,8 +70546,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[0] * A[7]\n\t" "mul x5, x10, x17\n\t" "umulh x6, x10, x17\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[6]\n\t" "mul x8, x11, x16\n\t" "umulh x9, x11, x16\n\t" @@ -68476,8 +70576,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[0] * A[8]\n\t" "mul x5, x10, x19\n\t" "umulh x6, x10, x19\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[1] * A[7]\n\t" "mul x8, x11, x17\n\t" "umulh x9, x11, x17\n\t" @@ -68512,8 +70612,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[1] * A[8]\n\t" "mul x5, x11, x19\n\t" "umulh x6, x11, x19\n\t" - "mov x4, 0\n\t" - "mov x7, 0\n\t" + "mov x4, xzr\n\t" + "mov x7, xzr\n\t" "# A[2] * A[7]\n\t" "mul x8, x12, x17\n\t" "umulh x9, x12, x17\n\t" @@ -68542,8 +70642,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[2] * A[8]\n\t" "mul x5, x12, x19\n\t" "umulh x6, x12, x19\n\t" - "mov x2, 0\n\t" - "mov x7, 0\n\t" + "mov x2, xzr\n\t" + "mov x7, xzr\n\t" "# A[3] * A[7]\n\t" "mul x8, x13, x17\n\t" "umulh x9, x13, x17\n\t" @@ -68572,8 +70672,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "# A[3] * A[8]\n\t" "mul x5, x13, x19\n\t" "umulh x6, x13, x19\n\t" - "mov x3, 0\n\t" - "mov x7, 0\n\t" + "mov x3, xzr\n\t" + "mov x7, xzr\n\t" "# A[4] * A[7]\n\t" "mul x8, x14, x17\n\t" "umulh x9, x14, x17\n\t" @@ -68724,7 +70824,7 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const "ldp x29, x30, [sp], #0xa0\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20" ); } @@ -68854,178 +70954,138 @@ static void sp_521_mont_inv_9(sp_digit* r, const sp_digit* a, sp_digit* td) SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldr x23, [%[m], 64]\n\t" + "ldp x13, x14, [%[a], 0]\n\t" + "ldp x15, x16, [%[a], 16]\n\t" + "ldp x17, x19, [%[a], 32]\n\t" + "ldp x20, x21, [%[a], 48]\n\t" + "ldr x22, [%[a], 64]\n\t" "mov x3, xzr\n\t" - "# i = 9\n\t" + "# i = 0..8\n\t" "mov x4, 9\n\t" - "ldp x12, x13, [%[a], 0]\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" - "mul x9, %[mp], x12\n\t" - "cmp 
x4, 1\n\t" + "mul x9, %[mp], x13\n\t" + "cmp x4, #1\n\t" "bne L_521_mont_reduce_9_nomask\n\t" - "and x9, x9, 0x1ff\n\t" + "and x9, x9, #0x1ff\n\t" "L_521_mont_reduce_9_nomask:\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" - "adds x12, x12, x7\n\t" - "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" - "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" - "str x12, [%[a], 0]\n\t" + "ldp x10, x11, [%[m], 0]\n\t" + "mul x7, x10, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x12, x13, x7\n\t" + "# a[i+1] += m[1] * mu\n\t" + "adc x6, x8, xzr\n\t" + "mul x7, x11, x9\n\t" + "umulh x8, x11, x9\n\t" + "adds x13, x14, x7\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" + "ldp x11, x10, [%[m], 16]\n\t" "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" - "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" + "adds x13, x13, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x14, x15, x7\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" - "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" + "adds x14, x14, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x15, x16, x7\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" + "ldp x11, x10, [%[m], 32]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" + "adds x15, x15, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x16, x17, x7\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" + "adds x16, x16, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x17, x19, x7\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" + "ldp x11, x10, [%[m], 48]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" + "adds x17, x17, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x19, x20, x7\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" + "adds x19, x19, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x20, x21, x7\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" + "ldr x11, [%[m], 64]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" + "adds x20, x20, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" + "umulh x8, x11, x9\n\t" "adds x5, x5, x7\n\t" "adcs x8, x8, x3\n\t" - "str x10, [%[a], 56]\n\t" - "cset x3, cs\n\t" - "adds x11, x11, x5\n\t" - "ldr x10, [%[a], 72]\n\t" - "str x11, [%[a], 64]\n\t" - "adcs x10, x10, x8\n\t" - "str x10, [%[a], 72]\n\t" + "cset x3, cs\n\t" + "adds x21, x22, x5\n\t" + "ldr x22, [%[a], 72]\n\t" + "adcs x22, x22, x8\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" "bne 1b\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sub x9, %[a], 8\n\t" + "extr x12, x13, x12, 9\n\t" + "extr x13, x14, x13, 9\n\t" + "extr x14, x15, x14, 9\n\t" + "extr x15, x16, x15, 9\n\t" + 
"extr x16, x17, x16, 9\n\t" + "extr x17, x19, x17, 9\n\t" + "extr x19, x20, x19, 9\n\t" + "extr x20, x21, x20, 9\n\t" + "lsr x21, x21, 9\n\t" + "lsr x3, x21, 9\n\t" "sub %[a], %[a], 72\n\t" - "ldr x4, [x9]\n\t" - "ldr x5, [x9, 8]\n\t" - "lsr x4, x4, 9\n\t" - "orr x4, x4, x5, lsl 55\n\t" - "lsr x5, x5, 9\n\t" - "ldr x6, [x9, 16]\n\t" - "str x4, [%[a], 0]\n\t" - "orr x5, x5, x6, lsl 55\n\t" - "lsr x6, x6, 9\n\t" - "ldr x4, [x9, 24]\n\t" - "str x5, [%[a], 8]\n\t" - "orr x6, x6, x4, lsl 55\n\t" - "lsr x4, x4, 9\n\t" - "ldr x5, [x9, 32]\n\t" - "str x6, [%[a], 16]\n\t" - "orr x4, x4, x5, lsl 55\n\t" - "lsr x5, x5, 9\n\t" - "ldr x6, [x9, 40]\n\t" - "str x4, [%[a], 24]\n\t" - "orr x5, x5, x6, lsl 55\n\t" - "lsr x6, x6, 9\n\t" - "ldr x4, [x9, 48]\n\t" - "str x5, [%[a], 32]\n\t" - "orr x6, x6, x4, lsl 55\n\t" - "lsr x4, x4, 9\n\t" - "ldr x5, [x9, 56]\n\t" - "str x6, [%[a], 40]\n\t" - "orr x4, x4, x5, lsl 55\n\t" - "lsr x5, x5, 9\n\t" - "ldr x6, [x9, 64]\n\t" - "str x4, [%[a], 48]\n\t" - "orr x5, x5, x6, lsl 55\n\t" - "lsr x6, x6, 9\n\t" - "str x5, [%[a], 56]\n\t" - "str x6, [%[a], 64]\n\t" - "lsr x3, x6, 9\n\t" "neg x3, x3\n\t" "# Subtract masked modulus\n\t" - "ldp x12, x13, [%[a], 0]\n\t" - "and x14, x14, x3\n\t" - "ldp x10, x11, [%[a], 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x10, x10, x16\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "subs x12, x12, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x13, x13, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x14, x14, x6\n\t" "stp x12, x13, [%[a], 0]\n\t" - "sbcs x11, x11, x17\n\t" - "stp x10, x11, [%[a], 16]\n\t" - "ldp x12, x13, [%[a], 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x10, x11, [%[a], 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x10, x10, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x11, x11, x22\n\t" - "stp x10, x11, [%[a], 48]\n\t" - "ldr x12, [%[a], 64]\n\t" - "and x23, x23, x3\n\t" - "sbcs x12, x12, x23\n\t" - "str x12, [%[a], 64]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23" + "sbcs x15, x15, x7\n\t" + "stp x14, x15, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x16, x16, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x17, x17, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x19, x19, x6\n\t" + "stp x16, x17, [%[a], 32]\n\t" + "sbcs x20, x20, x7\n\t" + "stp x19, x20, [%[a], 48]\n\t" + "ldr x4, [%[m], 64]\n\t" + "and x4, x4, x3\n\t" + "sbcs x21, x21, x4\n\t" + "str x21, [%[a], 64]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); } @@ -69277,6 +71337,7 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, ); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -69288,7 +71349,6 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, static sp_digit sp_521_cond_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -69306,91 +71366,98 @@ static sp_digit sp_521_cond_add_9(sp_digit* r, const sp_digit* a, const sp_digit "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_521_cond_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ __asm__ __volatile__ ( - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldr x5, [%[b], 64]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldr x8, [%[b], 64]\n\t" "ldr x4, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "adcs x4, x4, x5\n\t" + "and x8, x8, %[m]\n\t" + "adcs x4, x4, x8\n\t" "str x4, [%[r], 64]\n\t" "cset %[r], cs\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ static void sp_521_rshift1_9(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( "ldp x2, x3, [%[a]]\n\t" - "lsr x2, x2, 1\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "ldr x3, [%[a], 8]\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 16]\n\t" "str x2, [%[r], 0]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 24]\n\t" "str x3, [%[r], 8]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, 
x2, x4, #1\n\t" "ldr x3, [%[a], 32]\n\t" "str x4, [%[r], 16]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 40]\n\t" "str x2, [%[r], 24]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 48]\n\t" "str x3, [%[r], 32]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" "ldr x3, [%[a], 56]\n\t" "str x4, [%[r], 40]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 64]\n\t" "str x2, [%[r], 48]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" + "lsr x4, x4, #1\n\t" "stp x3, x4, [%[r], 56]\n\t" : : [r] "r" (r), [a] "r" (a) @@ -108088,41 +110155,74 @@ static void sp_521_add_one_9(sp_digit* a) */ static void sp_521_from_bin(sp_digit* r, int size, const byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Generates a scalar that is in the range 1..order-1. 
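The rewritten sp_521_cmp_9 (and sp_1024_cmp_16 further down) replaces the csel chain with two running masks: one that latches a "less than" result at the first differing limb, and one that stays all-ones only while every limb inspected so far has been equal. The branch-free logic corresponds to the C model below (a compiler may reintroduce branches when building this sketch; the helper name is illustrative only):

#include <stdint.h>

/* Returns -1 when a < b, 0 when a == b, 1 when a > b. */
static int64_t cmp_words_ref(const uint64_t* a, const uint64_t* b, int words)
{
    uint64_t live = (uint64_t)-1;   /* all-ones while limbs so far are equal */
    uint64_t lt   = 0;              /* all-ones once a < b has been decided  */
    int i;

    for (i = words - 1; i >= 0; i--) {
        uint64_t borrow = (uint64_t)0 - (uint64_t)(a[i] < b[i]);
        uint64_t equal  = (uint64_t)0 - (uint64_t)(a[i] == b[i]);
        lt   |= live & borrow;      /* only the first difference counts */
        live &= equal;
    }
    /* live still set => equal; lt set => less than; otherwise greater. */
    return (int64_t)(lt | (uint64_t)(live == 0));
}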
@@ -108254,15 +110354,15 @@ static void sp_521_to_bin_9(sp_digit* r, byte* a) a[j++] = r[8] >> 8; a[j++] = r[8] >> 0; - for (i = 7; i >= 0; i--) { - a[j++] = r[i] >> 56; - a[j++] = r[i] >> 48; - a[j++] = r[i] >> 40; - a[j++] = r[i] >> 32; - a[j++] = r[i] >> 24; - a[j++] = r[i] >> 16; - a[j++] = r[i] >> 8; - a[j++] = r[i] >> 0; + for (i = 7; i >= 0; i--, j += 8) { + __asm__ __volatile__ ( + "ldr x4, [%[r]]\n\t" + "rev x4, x4\n\t" + "str x4, [%[a]]\n\t" + : + : [r] "r" (r + i), [a] "r" (a + j) + : "memory", "x4" + ); } } @@ -108894,67 +110994,58 @@ static void sp_521_div2_mod_9(sp_digit* r, const sp_digit* a, const sp_digit* m) { __asm__ __volatile__ ( - "ldr x3, [%[a], 0]\n\t" - "ldr x4, [%[a], 8]\n\t" - "ldr x5, [%[a], 16]\n\t" - "ldr x6, [%[a], 24]\n\t" - "ldr x7, [%[a], 32]\n\t" - "ldr x8, [%[a], 40]\n\t" - "ldr x9, [%[a], 48]\n\t" - "ldr x10, [%[a], 56]\n\t" - "ldr x11, [%[a], 64]\n\t" - "ldr x12, [%[m], 0]\n\t" - "ldr x13, [%[m], 8]\n\t" - "ldr x14, [%[m], 16]\n\t" - "ldr x15, [%[m], 24]\n\t" - "ldr x16, [%[m], 32]\n\t" - "ldr x17, [%[m], 40]\n\t" - "ldr x18, [%[m], 48]\n\t" - "ldr x19, [%[m], 56]\n\t" - "ldr x20, [%[m], 64]\n\t" - "ands x21, x3, 1\n\t" - "b.eq 1f\n\t" - "adds x3, x3, x12\n\t" - "adcs x4, x4, x13\n\t" - "adcs x5, x5, x14\n\t" - "adcs x6, x6, x15\n\t" - "adcs x7, x7, x16\n\t" - "adcs x8, x8, x17\n\t" - "adcs x9, x9, x18\n\t" - "adcs x10, x10, x19\n\t" - "adcs x11, x11, x20\n\t" - "cset x21, cs\n\t" + "ldr x3, [%[a], 0]\n\t" + "ldr x4, [%[a], 8]\n\t" + "ldr x5, [%[a], 16]\n\t" + "ldr x6, [%[a], 24]\n\t" + "ldr x7, [%[a], 32]\n\t" + "ldr x8, [%[a], 40]\n\t" + "ldr x9, [%[a], 48]\n\t" + "ldr x10, [%[a], 56]\n\t" + "ldr x11, [%[a], 64]\n\t" + "ldr x12, [%[m], 0]\n\t" + "ldr x13, [%[m], 8]\n\t" + "ldr x14, [%[m], 16]\n\t" + "ldr x15, [%[m], 24]\n\t" + "ldr x16, [%[m], 32]\n\t" + "ldr x17, [%[m], 40]\n\t" + "ldr x19, [%[m], 48]\n\t" + "ldr x20, [%[m], 56]\n\t" + "ldr x21, [%[m], 64]\n\t" + "ands x22, x3, 1\n\t" + "b.eq 1f\n\t" + "adds x3, x3, x12\n\t" + "adcs x4, x4, x13\n\t" + "adcs x5, x5, x14\n\t" + "adcs x6, x6, x15\n\t" + "adcs x7, x7, x16\n\t" + "adcs x8, x8, x17\n\t" + "adcs x9, x9, x19\n\t" + "adcs x10, x10, x20\n\t" + "adcs x11, x11, x21\n\t" + "cset x22, cs\n\t" "\n1:\n\t" - "lsr x3, x3, 1\n\t" - "lsr x13, x4, 1\n\t" - "lsr x14, x5, 1\n\t" - "lsr x15, x6, 1\n\t" - "lsr x16, x7, 1\n\t" - "lsr x17, x8, 1\n\t" - "lsr x18, x9, 1\n\t" - "lsr x19, x10, 1\n\t" - "lsr x20, x11, 1\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "orr x4, x13, x5, lsl 63\n\t" - "orr x5, x14, x6, lsl 63\n\t" - "orr x6, x15, x7, lsl 63\n\t" - "orr x7, x16, x8, lsl 63\n\t" - "orr x8, x17, x9, lsl 63\n\t" - "orr x9, x18, x10, lsl 63\n\t" - "orr x10, x19, x11, lsl 63\n\t" - "orr x11, x20, x21, lsl 63\n\t" - "str x3, [%[r], 0]\n\t" - "str x4, [%[r], 8]\n\t" - "str x5, [%[r], 16]\n\t" - "str x6, [%[r], 24]\n\t" - "str x7, [%[r], 32]\n\t" - "str x8, [%[r], 40]\n\t" - "str x9, [%[r], 48]\n\t" - "str x10, [%[r], 56]\n\t" - "str x11, [%[r], 64]\n\t" + "extr x3, x4, x3, 1\n\t" + "extr x4, x5, x4, 1\n\t" + "extr x5, x6, x5, 1\n\t" + "extr x6, x7, x6, 1\n\t" + "extr x7, x8, x7, 1\n\t" + "extr x8, x9, x8, 1\n\t" + "extr x9, x10, x9, 1\n\t" + "extr x10, x11, x10, 1\n\t" + "extr x11, x22, x11, 1\n\t" + "str x3, [%[r], 0]\n\t" + "str x4, [%[r], 8]\n\t" + "str x5, [%[r], 16]\n\t" + "str x6, [%[r], 24]\n\t" + "str x7, [%[r], 32]\n\t" + "str x8, [%[r], 40]\n\t" + "str x9, [%[r], 48]\n\t" + "str x10, [%[r], 56]\n\t" + "str x11, [%[r], 64]\n\t" : : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "x3", "x4", "x5", "x6", 
"x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); } @@ -108971,7 +111062,7 @@ static int sp_521_num_bits_64_9(sp_digit n) : "x1" ); - return r + 1; + return (int)(r + 1); } static int sp_521_num_bits_9(const sp_digit* a) @@ -110693,6 +112784,38 @@ static sp_digit sp_1024_add_8(sp_digit* r, const sp_digit* a, return (sp_digit)r; } +/* Add digit to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_1024_add_word_8(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "adds x3, x3, %[b]\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "adcs x6, x6, xzr\n\t" + "stp x5, x6, [%[r], 48]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6" + ); +} + /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -110809,63 +112932,57 @@ static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a, return (sp_digit)r; } -/* AND m into each word of a and store in r. +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. */ -static void sp_1024_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Add digit to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -static void sp_1024_add_zero_8(sp_digit* r, const sp_digit* a, - const sp_digit d) +static sp_digit sp_1024_cond_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) { __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "adds x3, x3, %[d]\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "adcs x3, x3, xzr\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, xzr\n\t" - "stp x5, x6, [%[r], 48]\n\t" - : - : [r] "r" (r), [a] "r" (a), [d] "r" (d) - : "memory", "x3", "x4", "x5", "x6" + + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" + "and x11, x11, %[m]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "cset %[r], cs\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); + + return (sp_digit)r; } +#endif /* !WOLFSSL_SP_SMALL */ /* Multiply a and b into r. (r = a * b) * @@ -110880,95 +112997,67 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a, sp_digit z1[16]; sp_digit a1[8]; sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u, ca, cb; + sp_digit* z2 = r + 16; + sp_digit u; + sp_digit ca; + sp_digit cb; ca = sp_1024_add_8(a1, a, &a[8]); cb = sp_1024_add_8(b1, b, &b[8]); u = ca & cb; - sp_1024_mul_8(z1, a1, b1); + sp_1024_mul_8(z2, &a[8], &b[8]); sp_1024_mul_8(z0, a, b); - sp_1024_mask_8(r + 16, a1, 0 - cb); - sp_1024_mask_8(b1, b1, 0 - ca); - u += sp_1024_add_8(r + 16, r + 16, b1); - u += sp_1024_sub_in_place_16(z1, z2); + sp_1024_mul_8(z1, a1, b1); + u += sp_1024_sub_in_place_16(z1, z0); + u += sp_1024_sub_in_place_16(z1, z2); + u += sp_1024_cond_add_8(z1 + 8, z1 + 8, a1, 0 - cb); + u += sp_1024_cond_add_8(z1 + 8, z1 + 8, b1, 0 - ca); + u += sp_1024_add_16(r + 8, r + 8, z1); - u += sp_1024_add_8(r + 16, r + 16, z2); - sp_1024_add_zero_8(r + 24, z2 + 8, u); + (void)sp_1024_add_word_8(r + 24, r + 24, u); } -#ifdef WOLFSSL_SP_SMALL -/* Double a into r. (r = a + a) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. + * b A single precision integer. */ -static sp_digit sp_1024_dbl_8(sp_digit* r, const sp_digit* a) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 64\n\t" - "\n1:\n\t" - "adds %[c], %[c], #-1\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "adcs x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "adcs x6, x6, x6\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x3", "x4", "x5", "x6", "x11" - ); - - return c; -} - -#else -/* Double a into r. 
(r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -static sp_digit sp_1024_dbl_8(sp_digit* r, const sp_digit* a) +static sp_digit sp_1024_sub_8(sp_digit* r, const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "adds x3, x3, x3\n\t" - "ldr x5, [%[a], 16]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 24]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 0]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 16]\n\t" "ldp x3, x4, [%[a], 32]\n\t" - "adcs x3, x3, x3\n\t" - "ldr x5, [%[a], 48]\n\t" - "adcs x4, x4, x4\n\t" - "ldr x6, [%[a], 56]\n\t" - "adcs x5, x5, x5\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" "stp x3, x4, [%[r], 32]\n\t" - "adcs x6, x6, x6\n\t" + "sbcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 48]\n\t" - "cset %[r], cs\n\t" + "csetm %[r], cc\n\t" : [r] "+r" (r) - : [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6" + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); return (sp_digit)r; } -#endif /* WOLFSSL_SP_SMALL */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -110977,22 +113066,31 @@ static sp_digit sp_1024_dbl_8(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[16]; + sp_digit* z2 = r + 16; sp_digit z1[16]; - sp_digit a1[8]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 8; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 8); + + mask = sp_1024_sub_8(a1, a, &a[8]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_1024_sub_8(a1, p1, p2); - u = sp_1024_add_8(a1, a, &a[8]); - sp_1024_sqr_8(z1, a1); sp_1024_sqr_8(z2, &a[8]); sp_1024_sqr_8(z0, a); - sp_1024_mask_8(r + 16, a1, 0 - u); - u += sp_1024_dbl_8(r + 16, r + 16); - u += sp_1024_sub_in_place_16(z1, z2); - u += sp_1024_sub_in_place_16(z1, z0); - u += sp_1024_add_16(r + 8, r + 8, z1); - u += sp_1024_add_8(r + 16, r + 16, z2); - sp_1024_add_zero_8(r + 24, z2 + 8, u); + sp_1024_sqr_8(z1, a1); + + u = 0; + u -= sp_1024_sub_in_place_16(z1, z2); + u -= sp_1024_sub_in_place_16(z1, z0); + u += sp_1024_sub_in_place_16(r + 8, z1); + sp_1024_add_word_8(r + 24, r + 24, u); } #else @@ -111007,10 +113105,10 @@ static void sp_1024_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b) sp_digit tmp[32]; __asm__ __volatile__ ( - "mov x5, 0\n\t" - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" "\n1:\n\t" "subs x3, x5, 120\n\t" "csel x3, xzr, x3, cc\n\t" @@ -111056,10 +113154,10 @@ static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a) sp_digit tmp[32]; __asm__ __volatile__ ( - "mov x6, 0\n\t" - "mov x7, 0\n\t" - "mov x8, 0\n\t" - "mov x5, 0\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, xzr\n\t" + "mov x5, xzr\n\t" "\n1:\n\t" "subs x3, x5, 120\n\t" "csel x3, xzr, x3, cc\n\t" @@ -111379,9 +113477,9 @@ static void sp_1024_mul_d_16(sp_digit* r, const sp_digit* a, "ldr x8, [%[a]]\n\t" "mul x5, %[b], x8\n\t" "umulh x3, %[b], x8\n\t" - "mov x4, 
0\n\t" + "mov x4, xzr\n\t" "str x5, [%[r]]\n\t" - "mov x5, 0\n\t" + "mov x5, xzr\n\t" "mov x9, #8\n\t" "1:\n\t" "ldr x8, [%[a], x9]\n\t" @@ -111405,175 +113503,178 @@ static void sp_1024_mul_d_16(sp_digit* r, const sp_digit* a, #else __asm__ __volatile__ ( "# A[0] * B\n\t" - "ldp x8, x9, [%[a]]\n\t" - "mul x3, %[b], x8\n\t" - "umulh x4, %[b], x8\n\t" - "mov x5, 0\n\t" + "ldp x9, x10, [%[a]]\n\t" + "mul x3, %[b], x9\n\t" + "umulh x4, %[b], x9\n\t" + "mov x5, xzr\n\t" "# A[1] * B\n\t" "str x3, [%[r]]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adds x4, x4, x6\n\t" "# A[2] * B\n\t" - "ldp x8, x9, [%[a], 16]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" "str x4, [%[r], 8]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[3] * B\n\t" "str x5, [%[r], 16]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[4] * B\n\t" - "ldp x8, x9, [%[a], 32]\n\t" + "ldp x9, x10, [%[a], 32]\n\t" "str x3, [%[r], 24]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[5] * B\n\t" "str x4, [%[r], 32]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[6] * B\n\t" - "ldp x8, x9, [%[a], 48]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" "str x5, [%[r], 40]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[7] * B\n\t" "str x3, [%[r], 48]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[8] * B\n\t" - "ldp x8, x9, [%[a], 64]\n\t" + "ldp x9, x10, [%[a], 64]\n\t" "str x4, [%[r], 56]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[9] * B\n\t" "str x5, [%[r], 64]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[10] * B\n\t" - "ldp x8, x9, [%[a], 80]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" "str x3, [%[r], 72]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[11] * B\n\t" "str x4, [%[r], 80]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[12] * B\n\t" - "ldp x8, x9, [%[a], 
96]\n\t" + "ldp x9, x10, [%[a], 96]\n\t" "str x5, [%[r], 88]\n\t" - "mov x5, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x5, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "# A[13] * B\n\t" "str x3, [%[r], 96]\n\t" - "mov x3, 0\n\t" - "mul x6, %[b], x9\n\t" "adcs x4, x4, x7\n\t" - "umulh x7, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" + "mov x3, xzr\n\t" + "umulh x7, %[b], x10\n\t" "adc x5, xzr, xzr\n\t" "adds x4, x4, x6\n\t" "# A[14] * B\n\t" - "ldp x8, x9, [%[a], 112]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" "str x4, [%[r], 104]\n\t" - "mov x4, 0\n\t" - "mul x6, %[b], x8\n\t" "adcs x5, x5, x7\n\t" - "umulh x7, %[b], x8\n\t" + "mul x6, %[b], x9\n\t" + "mov x4, xzr\n\t" + "umulh x7, %[b], x9\n\t" "adc x3, xzr, xzr\n\t" "adds x5, x5, x6\n\t" "# A[15] * B\n\t" "str x5, [%[r], 112]\n\t" - "mul x6, %[b], x9\n\t" + "mul x6, %[b], x10\n\t" "adcs x3, x3, x7\n\t" - "umulh x7, %[b], x9\n\t" + "umulh x7, %[b], x10\n\t" "adc x4, xzr, xzr\n\t" "adds x3, x3, x6\n\t" "adc x4, x4, x7\n\t" "stp x3, x4, [%[r], 120]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #endif } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * Assumes divisor has higest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. */ static sp_digit div_1024_word_16(sp_digit d1, sp_digit d0, sp_digit div) { - sp_digit r; - __asm__ __volatile__ ( - "lsr x5, %[div], 32\n\t" - "add x5, x5, 1\n\t" + "lsr x8, %[div], 32\n\t" + "add x5, x8, 1\n\t" "udiv x3, %[d1], x5\n\t" + "lsl x7, %[div], 32\n\t" + "movz x9, #1, lsl 32\n\t" "lsl x6, x3, 32\n\t" "mul x4, %[div], x6\n\t" "umulh x3, %[div], x6\n\t" "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "udiv x3, %[d1], x5\n\t" - "lsl x3, x3, 32\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "umulh x3, %[div], x3\n\t" - "subs %[d0], %[d0], x4\n\t" - "sbc %[d1], %[d1], x3\n\t" + "cmp %[d1], x5\n\t" + "cset x9, ge\n\t" + "csetm x10, ge\n\t" + "lsl x9, x9, #32\n\t" + "and x7, x7, x10\n\t" + "and x8, x8, x10\n\t" + "subs %[d0], %[d0], x7\n\t" + "add x6, x6, x9\n\t" + "sbc %[d1], %[d1], x8\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" "udiv x3, x3, x5\n\t" "add x6, x6, x3\n\t" @@ -111582,23 +113683,22 @@ static sp_digit div_1024_word_16(sp_digit d1, sp_digit d0, sp_digit div) "subs %[d0], %[d0], x4\n\t" "sbc %[d1], %[d1], x3\n\t" - "lsr x3, %[d0], 32\n\t" - "orr x3, x3, %[d1], lsl 32\n\t" + "extr x3, %[d1], %[d0], 32\n\t" - "udiv x3, x3, x5\n\t" - "add x6, x6, x3\n\t" - "mul x4, %[div], x3\n\t" - "sub %[d0], %[d0], x4\n\t" + "udiv x3, x3, x5\n\t" + "add x6, x6, x3\n\t" + "mul x4, %[div], x3\n\t" + "sub %[d0], %[d0], x4\n\t" "udiv x3, %[d0], %[div]\n\t" - "add %[r], x6, x3\n\t" + "add %[d1], x6, x3\n\t" - : [r] "=r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "x3", "x4", "x5", "x6" + : [d1] "+r" (d1), [d0] "+r" (d0) + : [div] "r" (div) + : "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); - return r; + return d1; } /* AND m into each word of a and store in r. 
@@ -111642,147 +113742,139 @@ static sp_int64 sp_1024_cmp_16(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "mov x5, 120\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "mov x10, #16\n\t" + "add %[a], %[a], #112\n\t" + "add %[b], %[b], #112\n\t" "1:\n\t" - "ldr x6, [%[a], x5]\n\t" - "ldr x7, [%[b], x5]\n\t" - "and x6, x6, x4\n\t" - "and x7, x7, x4\n\t" - "subs x6, x6, x7\n\t" - "csel x2, x3, x2, hi\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "subs x5, x5, #8\n\t" - "b.cs 1b\n\t" - "eor %[a], x2, x4\n\t" - : [a] "+r" (a) - : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + "ldp x6, x7, [%[a]], -16\n\t" + "ldp x8, x9, [%[b]], -16\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x6, x6, x8\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "subs x10, x10, #2\n\t" + "b.ne 1b\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); #else __asm__ __volatile__ ( - "mov x2, -1\n\t" - "mov x3, 1\n\t" - "mov x4, -1\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "ldp x7, x8, [%[b], 112]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "mov x3, #0\n\t" + "mov x2, #-1\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "ldp x8, x9, [%[b], 112]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 96]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "ldp x7, x8, [%[b], 80]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "ldp x8, x9, [%[b], 80]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 64]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + 
"and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "ldp x7, x8, [%[b], 48]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "ldp x8, x9, [%[b], 48]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 32]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "ldp x7, x8, [%[b], 16]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 16]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "ldp x5, x6, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "and x6, x6, x4\n\t" - "and x8, x8, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "ldp x6, x7, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "subs x7, x7, x9\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" "subs x6, x6, x8\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "and x5, x5, x4\n\t" - "and x7, x7, x4\n\t" - "subs x5, x5, x7\n\t" - "csel x2, x4, x2, lo\n\t" - "csel x4, x4, xzr, eq\n\t" - "csel x2, x3, x2, hi\n\t" - "eor %[a], x2, x4\n\t" + "csel x4, x2, xzr, lo\n\t" + "csetm x5, eq\n\t" + "orr x3, x3, x4\n\t" + "and x2, x2, x5\n\t" + "cmp x2, #0\n\t" + "cset %[a], eq\n\t" + "orr %[a], %[a], x3\n\t" : [a] "+r" (a) : [b] "r" (b) - : "x2", "x3", "x4", "x5", "x6", "x7", "x8" + : "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" ); #endif @@ -111809,7 +113901,7 @@ static WC_INLINE int sp_1024_div_16(const sp_digit* a, const sp_digit* d, sp_dig div = d[15]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 16); - for (i=15; i>=0; i--) { + for (i = 15; i >= 0; i--) { sp_digit hi = t1[16 + i] - (t1[16 + i] == div); r1 = div_1024_word_16(hi, t1[16 + i - 1], div); @@ -112204,230 +114296,205 @@ static void 
sp_1024_cond_copy_16(sp_digit* r, const sp_digit* a, sp_digit m) SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m, sp_digit mp) { - __asm__ __volatile__ ( - "ldp x14, x15, [%[m], 0]\n\t" - "ldp x16, x17, [%[m], 16]\n\t" - "ldp x19, x20, [%[m], 32]\n\t" - "ldp x21, x22, [%[m], 48]\n\t" - "ldp x23, x24, [%[m], 64]\n\t" - "ldp x25, x26, [%[m], 80]\n\t" - "ldp x27, x28, [%[m], 96]\n\t" - "mov x3, xzr\n\t" - "# i = 16\n\t" - "mov x4, 16\n\t" "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "ldp x16, x17, [%[a], 32]\n\t" + "ldp x19, x20, [%[a], 48]\n\t" + "ldp x21, x22, [%[a], 64]\n\t" + "ldp x23, x24, [%[a], 80]\n\t" + "ldp x25, x26, [%[a], 96]\n\t" + "ldp x27, x28, [%[a], 112]\n\t" + "mov x3, xzr\n\t" + "# i = 0..15\n\t" + "mov x4, 16\n\t" "\n1:\n\t" "# mu = a[i] * mp\n\t" "mul x9, %[mp], x12\n\t" "# a[i+0] += m[0] * mu\n\t" - "mul x7, x14, x9\n\t" - "umulh x8, x14, x9\n\t" + "ldp x10, x11, [%[m], 0]\n\t" + "mul x7, x10, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x12, x12, x7\n\t" "# a[i+1] += m[1] * mu\n\t" - "mul x7, x15, x9\n\t" "adc x6, x8, xzr\n\t" - "umulh x8, x15, x9\n\t" + "mul x7, x11, x9\n\t" + "umulh x8, x11, x9\n\t" "adds x12, x13, x7\n\t" "# a[i+2] += m[2] * mu\n\t" - "ldr x13, [%[a], 16]\n\t" + "ldp x11, x10, [%[m], 16]\n\t" "adc x5, x8, xzr\n\t" - "mul x7, x16, x9\n\t" "adds x12, x12, x6\n\t" - "umulh x8, x16, x9\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "adds x13, x13, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x13, x14, x7\n\t" "# a[i+3] += m[3] * mu\n\t" - "ldr x10, [%[a], 24]\n\t" "adc x6, x8, xzr\n\t" - "mul x7, x17, x9\n\t" "adds x13, x13, x5\n\t" - "umulh x8, x17, x9\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x14, x15, x7\n\t" "# a[i+4] += m[4] * mu\n\t" - "ldr x11, [%[a], 32]\n\t" + "ldp x11, x10, [%[m], 32]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x19, x9\n\t" + "adds x14, x14, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x19, x9\n\t" - "str x10, [%[a], 24]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x15, x16, x7\n\t" "# a[i+5] += m[5] * mu\n\t" - "ldr x10, [%[a], 40]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x20, x9\n\t" + "adds x15, x15, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x20, x9\n\t" - "str x11, [%[a], 32]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x16, x17, x7\n\t" "# a[i+6] += m[6] * mu\n\t" - "ldr x11, [%[a], 48]\n\t" + "ldp x11, x10, [%[m], 48]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x21, x9\n\t" + "adds x16, x16, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x21, x9\n\t" - "str x10, [%[a], 40]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x17, x19, x7\n\t" "# a[i+7] += m[7] * mu\n\t" - "ldr x10, [%[a], 56]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x22, x9\n\t" + "adds x17, x17, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x22, x9\n\t" - "str x11, [%[a], 48]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x19, x20, x7\n\t" "# a[i+8] += m[8] * mu\n\t" - "ldr x11, [%[a], 64]\n\t" + "ldp x11, x10, [%[m], 64]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x23, x9\n\t" + "adds x19, x19, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x23, x9\n\t" - "str x10, [%[a], 56]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x20, x21, x7\n\t" "# 
a[i+9] += m[9] * mu\n\t" - "ldr x10, [%[a], 72]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x24, x9\n\t" + "adds x20, x20, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x24, x9\n\t" - "str x11, [%[a], 64]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x21, x22, x7\n\t" "# a[i+10] += m[10] * mu\n\t" - "ldr x11, [%[a], 80]\n\t" + "ldp x11, x10, [%[m], 80]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x25, x9\n\t" + "adds x21, x21, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x25, x9\n\t" - "str x10, [%[a], 72]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x22, x23, x7\n\t" "# a[i+11] += m[11] * mu\n\t" - "ldr x10, [%[a], 88]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x26, x9\n\t" + "adds x22, x22, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x26, x9\n\t" - "str x11, [%[a], 80]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x23, x24, x7\n\t" "# a[i+12] += m[12] * mu\n\t" - "ldr x11, [%[a], 96]\n\t" + "ldp x11, x10, [%[m], 96]\n\t" "adc x5, x8, xzr\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x27, x9\n\t" + "adds x23, x23, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x27, x9\n\t" - "str x10, [%[a], 88]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x24, x25, x7\n\t" "# a[i+13] += m[13] * mu\n\t" - "ldr x10, [%[a], 104]\n\t" "adc x6, x8, xzr\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x28, x9\n\t" + "adds x24, x24, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x28, x9\n\t" - "str x11, [%[a], 96]\n\t" - "adds x10, x10, x7\n\t" + "umulh x8, x10, x9\n\t" + "adds x25, x26, x7\n\t" "# a[i+14] += m[14] * mu\n\t" - "ldr x11, [%[a], 112]\n\t" + "ldp x11, x10, [%[m], 112]\n\t" "adc x5, x8, xzr\n\t" - "ldr x8, [%[m], 112]\n\t" - "adds x10, x10, x6\n\t" - "mul x7, x8, x9\n\t" + "adds x25, x25, x6\n\t" + "mul x7, x11, x9\n\t" "adc x5, x5, xzr\n\t" - "umulh x8, x8, x9\n\t" - "str x10, [%[a], 104]\n\t" - "adds x11, x11, x7\n\t" + "umulh x8, x11, x9\n\t" + "adds x26, x27, x7\n\t" "# a[i+15] += m[15] * mu\n\t" - "ldr x10, [%[a], 120]\n\t" + "ldr x10, [%[m], 120]\n\t" "adc x6, x8, xzr\n\t" - "ldr x8, [%[m], 120]\n\t" - "adds x11, x11, x5\n\t" - "mul x7, x8, x9\n\t" + "adds x26, x26, x5\n\t" + "mul x7, x10, x9\n\t" "adc x6, x6, xzr\n\t" - "umulh x8, x8, x9\n\t" + "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "str x11, [%[a], 112]\n\t" - "cset x3, cs\n\t" - "adds x10, x10, x6\n\t" - "ldr x11, [%[a], 128]\n\t" - "str x10, [%[a], 120]\n\t" - "adcs x11, x11, x8\n\t" - "str x11, [%[a], 128]\n\t" + "cset x3, cs\n\t" + "adds x27, x28, x6\n\t" + "ldr x28, [%[a], 128]\n\t" + "adcs x28, x28, x8\n\t" "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" "bne 1b\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" "# Create mask\n\t" - "ldr x9, [%[m], 120]\n\t" - "subs x11, x9, x11\n\t" + "subs x11, x10, x28\n\t" "neg x3, x3\n\t" "sbc x11, x11, x11\n\t" "orr x3, x3, x11\n\t" "mov x9, %[a]\n\t" "sub %[a], %[a], 128\n\t" "# Subtract masked modulus\n\t" - "# x12 and x13 hold a[0] and a[1]\n\t" - "and x14, x14, x3\n\t" - "ldp x11, x10, [x9, 16]\n\t" - "and x15, x15, x3\n\t" - "subs x12, x12, x14\n\t" - "and x16, x16, x3\n\t" - "sbcs x13, x13, x15\n\t" - "and x17, x17, x3\n\t" - "sbcs x11, x11, x16\n\t" - "stp x12, x13, [%[a], 0]\n\t" - "sbcs x10, x10, x17\n\t" - "stp x11, x10, [%[a], 16]\n\t" - "ldp x12, x13, [x9, 32]\n\t" - "and x19, x19, x3\n\t" - "ldp x11, x10, 
[x9, 48]\n\t" - "and x20, x20, x3\n\t" - "sbcs x12, x12, x19\n\t" - "and x21, x21, x3\n\t" - "sbcs x13, x13, x20\n\t" - "and x22, x22, x3\n\t" - "sbcs x11, x11, x21\n\t" - "stp x12, x13, [%[a], 32]\n\t" - "sbcs x10, x10, x22\n\t" - "stp x11, x10, [%[a], 48]\n\t" - "ldp x12, x13, [x9, 64]\n\t" - "and x23, x23, x3\n\t" - "ldp x11, x10, [x9, 80]\n\t" - "and x24, x24, x3\n\t" - "sbcs x12, x12, x23\n\t" - "and x25, x25, x3\n\t" - "sbcs x13, x13, x24\n\t" - "and x26, x26, x3\n\t" - "sbcs x11, x11, x25\n\t" - "stp x12, x13, [%[a], 64]\n\t" - "sbcs x10, x10, x26\n\t" - "stp x11, x10, [%[a], 80]\n\t" - "ldp x7, x8, [%[m], 112]\n\t" - "ldp x12, x13, [x9, 96]\n\t" - "and x27, x27, x3\n\t" - "ldp x11, x10, [x9, 112]\n\t" - "and x28, x28, x3\n\t" - "sbcs x12, x12, x27\n\t" + "ldp x4, x5, [%[m], 0]\n\t" + "ldp x6, x7, [%[m], 16]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "subs x12, x12, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x13, x13, x5\n\t" "and x7, x7, x3\n\t" - "sbcs x13, x13, x28\n\t" - "and x8, x8, x3\n\t" - "sbcs x11, x11, x7\n\t" - "stp x12, x13, [%[a], 96]\n\t" - "sbcs x10, x10, x8\n\t" - "stp x11, x10, [%[a], 112]\n\t" - : [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "sbcs x14, x14, x6\n\t" + "stp x12, x13, [%[a], 0]\n\t" + "sbcs x15, x15, x7\n\t" + "stp x14, x15, [%[a], 16]\n\t" + "ldp x4, x5, [%[m], 32]\n\t" + "ldp x6, x7, [%[m], 48]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x16, x16, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x17, x17, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x19, x19, x6\n\t" + "stp x16, x17, [%[a], 32]\n\t" + "sbcs x20, x20, x7\n\t" + "stp x19, x20, [%[a], 48]\n\t" + "ldp x4, x5, [%[m], 64]\n\t" + "ldp x6, x7, [%[m], 80]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x21, x21, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x22, x22, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x23, x23, x6\n\t" + "stp x21, x22, [%[a], 64]\n\t" + "sbcs x24, x24, x7\n\t" + "stp x23, x24, [%[a], 80]\n\t" + "ldp x4, x5, [%[m], 96]\n\t" + "ldp x6, x7, [%[m], 112]\n\t" + "and x4, x4, x3\n\t" + "and x5, x5, x3\n\t" + "sbcs x25, x25, x4\n\t" + "and x6, x6, x3\n\t" + "sbcs x26, x26, x5\n\t" + "and x7, x7, x3\n\t" + "sbcs x27, x27, x6\n\t" + "stp x25, x26, [%[a], 96]\n\t" + "sbcs x28, x28, x7\n\t" + "stp x27, x28, [%[a], 112]\n\t" + : [a] "+r" (a), [mp] "+r" (mp) + : [m] "r" (m) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } @@ -112441,7 +114508,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_16(r, a, b); @@ -112455,7 +114522,7 @@ static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_1024_mont_sqr_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_16(r, a); @@ -113022,6 +115089,7 @@ static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* ); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -113033,7 +115101,6 @@ static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* static sp_digit sp_1024_cond_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m) { -#ifdef WOLFSSL_SP_SMALL sp_digit c = 0; __asm__ __volatile__ ( @@ -113051,142 +115118,142 @@ static sp_digit sp_1024_cond_add_16(sp_digit* r, const sp_digit* a, const sp_dig "b.lt 1b\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x8", "x9", "x10", "x11", "x12" ); return c; -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static sp_digit sp_1024_cond_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ __asm__ __volatile__ ( - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "adds x4, x4, x5\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "and x9, x9, %[m]\n\t" + "adds x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "ldp x8, x9, [%[b], 32]\n\t" + "ldp x10, x11, [%[b], 48]\n\t" + "ldp x4, x5, [%[a], 32]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 48]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 32]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 48]\n\t" + "ldp x8, x9, [%[b], 64]\n\t" + "ldp x10, x11, [%[b], 80]\n\t" + "ldp x4, x5, [%[a], 64]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 80]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - 
"stp x4, x6, [%[r], 64]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "adcs x4, x4, x5\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 64]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 80]\n\t" + "ldp x8, x9, [%[b], 96]\n\t" + "ldp x10, x11, [%[b], 112]\n\t" + "ldp x4, x5, [%[a], 96]\n\t" + "and x8, x8, %[m]\n\t" + "ldp x6, x7, [%[a], 112]\n\t" + "and x9, x9, %[m]\n\t" + "adcs x4, x4, x8\n\t" + "and x10, x10, %[m]\n\t" + "adcs x5, x5, x9\n\t" "and x11, x11, %[m]\n\t" - "adcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "adcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "adcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" + "adcs x6, x6, x10\n\t" + "stp x4, x5, [%[r], 96]\n\t" + "adcs x7, x7, x11\n\t" + "stp x6, x7, [%[r], 112]\n\t" "cset %[r], cs\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" ); return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ static void sp_1024_rshift1_16(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( "ldp x2, x3, [%[a]]\n\t" - "lsr x2, x2, 1\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "ldr x3, [%[a], 8]\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 16]\n\t" "str x2, [%[r], 0]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 24]\n\t" "str x3, [%[r], 8]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" "ldr x3, [%[a], 32]\n\t" "str x4, [%[r], 16]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 40]\n\t" "str x2, [%[r], 24]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 48]\n\t" "str x3, [%[r], 32]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" "ldr x3, [%[a], 56]\n\t" "str x4, [%[r], 40]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 64]\n\t" "str x2, [%[r], 48]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 72]\n\t" "str x3, [%[r], 56]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" "ldr x3, [%[a], 80]\n\t" "str x4, [%[r], 64]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 88]\n\t" "str x2, [%[r], 72]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 96]\n\t" "str x3, [%[r], 80]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" "ldr x3, [%[a], 104]\n\t" "str x4, [%[r], 88]\n\t" - "orr x2, x2, x3, lsl 63\n\t" - "lsr x3, x3, 1\n\t" + "extr x2, x3, x2, #1\n\t" "ldr x4, [%[a], 112]\n\t" "str x2, [%[r], 96]\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "lsr x4, x4, 1\n\t" + "extr x3, x4, x3, #1\n\t" "ldr x2, [%[a], 120]\n\t" "str x3, [%[r], 104]\n\t" - "orr x4, x4, x2, lsl 63\n\t" - "lsr x2, x2, 1\n\t" + "extr x4, x2, x4, #1\n\t" + "lsr x2, x2, #1\n\t" "stp x4, x2, [%[r], 112]\n\t" : : [r] "r" (r), [a] "r" (a) @@ -121609,41 +123676,74 @@ static int sp_1024_iszero_16(const sp_digit* a) */ static void sp_1024_from_bin(sp_digit* r, int size, const 
byte* a, int n) { - int i; - int j; - byte* d; + sp_int64 nl = n; + sp_int64 size8 = size * 8; - for (i = n - 1,j = 0; i >= 7; i -= 8) { - r[j] = ((sp_digit)a[i - 0] << 0) | - ((sp_digit)a[i - 1] << 8) | - ((sp_digit)a[i - 2] << 16) | - ((sp_digit)a[i - 3] << 24) | - ((sp_digit)a[i - 4] << 32) | - ((sp_digit)a[i - 5] << 40) | - ((sp_digit)a[i - 6] << 48) | - ((sp_digit)a[i - 7] << 56); - j++; - } - - if (i >= 0) { - r[j] = 0; - - d = (byte*)r; - switch (i) { - case 6: d[n - 1 - 6] = a[6]; //fallthrough - case 5: d[n - 1 - 5] = a[5]; //fallthrough - case 4: d[n - 1 - 4] = a[4]; //fallthrough - case 3: d[n - 1 - 3] = a[3]; //fallthrough - case 2: d[n - 1 - 2] = a[2]; //fallthrough - case 1: d[n - 1 - 1] = a[1]; //fallthrough - case 0: d[n - 1 - 0] = a[0]; //fallthrough - } - j++; - } - - for (; j < size; j++) { - r[j] = 0; - } + __asm__ __volatile__ ( + "add x4, %[a], %[n]\n\t" + "mov x5, %[r]\n\t" + "sub x4, x4, 8\n\t" + "subs x6, %[n], 8\n\t" + "mov x7, xzr\n\t" + "blt 2f\n\t" + /* Put in mulitples of 8 bytes. */ + "1:\n\t" + "ldr x8, [x4], -8\n\t" + "subs x6, x6, 8\n\t" + "rev x8, x8\n\t" + "str x8, [x5], 8\n\t" + "add x7, x7, 8\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "cmp x6, -7\n\t" + "b.lt 20f\n\t" + /* Put in less than 8 bytes. */ + "str xzr, [x5]\n\t" + "add x7, x7, 8\n\t" + "add x4, x4, 7\n\t" + "b.eq 17f\n\t" + "cmp x6, -5\n\t" + "b.lt 16f\n\t" + "b.eq 15f\n\t" + "cmp x6, -3\n\t" + "b.lt 14f\n\t" + "b.eq 13f\n\t" + "cmp x6, -2\n\t" + "b.eq 12f\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "12:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "13:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "14:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "15:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "16:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "17:\n\t" + "ldrb w8, [x4], -1\n\t" + "strb w8, [x5], 1\n\t" + "20:\n\t" + "add x5, %[r], x7\n\t" + "subs x7, %[size], x7\n\t" + "b.eq 30f\n\t" + /* Zero out remaining words. */ + "21:\n\t" + "subs x7, x7, 8\n\t" + "str xzr, [x5], 8\n\t" + "b.gt 21b\n\t" + "30:\n\t" + : + : [r] "r" (r), [size] "r" (size8), [a] "r" (a), [n] "r" (nl) + : "memory", "x4", "x5", "x6", "x7", "x8" + ); } /* Check that the x and y oridinates are a valid point on the curve. diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index b482379a7..cfa7dd2a8 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -48,19 +48,19 @@ #include #ifdef WOLFSSL_SP_ARM_THUMB_ASM -#define SP_PRINT_NUM(var, name, total, words, bits) \ - do { \ - int ii; \ - fprintf(stderr, name "=0x"); \ - for (ii = words - 1; ii >= 0; ii--) \ - fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ - fprintf(stderr, "\n"); \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + fprintf(stderr, name "=0x"); \ + for (ii = ((bits + 31) / 32) - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ } while (0) -#define SP_PRINT_VAL(var, name) \ +#define SP_PRINT_VAL(var, name) \ fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) -#define SP_PRINT_INT(var, name) \ +#define SP_PRINT_INT(var, name) \ fprintf(stderr, name "=%d\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) @@ -9422,6 +9422,2906 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, } #endif /* !WOLFSSL_SP_LARGE_CODE */ +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. 
+ * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_add_word_8(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + 
"sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" 
+#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<8; i++) { + r[i] = a[i] & m; + } +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[16]; + sp_digit a1[8]; + sp_digit b1[8]; + sp_digit* z2 = r + 16; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_2048_add_8(a1, a, &a[8]); + cb = sp_2048_add_8(b1, b, &b[8]); + u = ca & cb; + + sp_2048_mul_8(z2, &a[8], &b[8]); + sp_2048_mul_8(z0, a, b); + sp_2048_mul_8(z1, a1, b1); + + u += sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(z1, z2); + sp_2048_mask_8(a1, a1, 0 - cb); + u += sp_2048_add_8(z1 + 8, z1 + 8, a1); + sp_2048_mask_8(b1, b1, 0 - ca); + u += sp_2048_add_8(z1 + 8, z1 + 8, b1); + + u += sp_2048_add_16(r + 8, r + 8, z1); + (void)sp_2048_add_word_8(r + 24, r + 24, u); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_add_word_16(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, 
r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, 
r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<16; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 16; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[32]; + sp_digit a1[16]; + sp_digit b1[16]; + sp_digit* z2 = r + 32; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_2048_add_16(a1, a, &a[16]); + cb = sp_2048_add_16(b1, b, &b[16]); + u = ca & cb; + + sp_2048_mul_16(z2, &a[16], &b[16]); + sp_2048_mul_16(z0, a, b); + sp_2048_mul_16(z1, a1, b1); + + u += sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(z1, z2); + sp_2048_mask_16(a1, a1, 0 - cb); + u += sp_2048_add_16(z1 + 16, z1 + 16, a1); + sp_2048_mask_16(b1, b1, 0 - ca); + u += sp_2048_add_16(z1 + 16, z1 + 16, b1); + + u += sp_2048_add_32(r + 16, r + 16, z1); + (void)sp_2048_add_word_16(r + 48, r + 48, u); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_add_word_32(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL 
+ "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, 
r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs 
r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + 
"sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef 
WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef 
WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else 
+ "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<32; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 32; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[64]; + sp_digit a1[32]; + sp_digit b1[32]; + sp_digit* z2 = r + 64; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_2048_add_32(a1, a, &a[32]); + cb = sp_2048_add_32(b1, b, &b[32]); + u = ca & cb; + + sp_2048_mul_32(z2, &a[32], &b[32]); + sp_2048_mul_32(z0, a, b); + sp_2048_mul_32(z1, a1, b1); + + u += sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(z1, z2); + sp_2048_mask_32(a1, a1, 0 - cb); + u += sp_2048_add_32(z1 + 32, z1 + 32, a1); + sp_2048_mask_32(b1, b1, 0 - ca); + u += sp_2048_add_32(z1 + 32, z1 + 32, b1); + + u += sp_2048_add_64(r + 32, r + 32, z1); + (void)sp_2048_add_word_32(r + 96, r + 96, u); +} + #ifndef WOLFSSL_SP_LARGE_CODE /* Square a and put result in r. (r = a * a) * @@ -16362,89 +19262,88 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) } #endif /* !WOLFSSL_SP_LARGE_CODE */ -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. 
* b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" #ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs %[r], %[r]\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -16453,559 +19352,6 @@ SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
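
/* Illustrative sketch, not part of the patch and not wolfSSL code: the new
 * sp_2048_mul_64 above writes a_hi*b_hi straight into the top half of r and
 * folds the carries of (a_lo+a_hi) and (b_lo+b_hi) back in with masked adds.
 * The program below is a minimal host-side version of that recombination on
 * 32-bit digits, checked against a schoolbook multiply.  All names here
 * (kara_mul, school_mul, add_n, sub_n, N) are hypothetical helpers written
 * for this sketch, and the branches on ca/cb stand in for the constant-time
 * masked adds used by the real code. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define N 8                     /* digits per half; operands are 2*N digits */

/* r = a + b over n digits; returns the carry out (0 or 1). */
static uint32_t add_n(uint32_t* r, const uint32_t* a, const uint32_t* b, int n)
{
    uint64_t c = 0;
    for (int i = 0; i < n; i++) {
        c += (uint64_t)a[i] + b[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    return (uint32_t)c;
}

/* a -= b over n digits; returns the borrow out (0 or 1). */
static uint32_t sub_n(uint32_t* a, const uint32_t* b, int n)
{
    uint32_t borrow = 0;
    for (int i = 0; i < n; i++) {
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        a[i]   = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    return borrow;
}

/* r = a * b, schoolbook, n x n digits -> 2n digits. */
static void school_mul(uint32_t* r, const uint32_t* a, const uint32_t* b, int n)
{
    memset(r, 0, sizeof(uint32_t) * 2 * (size_t)n);
    for (int i = 0; i < n; i++) {
        uint64_t c = 0;
        for (int j = 0; j < n; j++) {
            c += (uint64_t)a[i] * b[j] + r[i + j];
            r[i + j] = (uint32_t)c;
            c >>= 32;
        }
        r[i + n] = (uint32_t)c;
    }
}

/* One Karatsuba level, recombined the same way as sp_2048_mul_64. */
static void kara_mul(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
    uint32_t a1[N], b1[N], z1[2 * N];
    uint32_t* z0 = r;                        /* a_lo*b_lo in r[0..2N)     */
    uint32_t* z2 = r + 2 * N;                /* a_hi*b_hi in r[2N..4N)    */
    uint32_t ca = add_n(a1, a, a + N, N);    /* a1 = a_lo + a_hi mod B^N  */
    uint32_t cb = add_n(b1, b, b + N, N);    /* b1 = b_lo + b_hi mod B^N  */
    int64_t  u  = (int64_t)(ca & cb);        /* B^2N term of the mid part */

    school_mul(z2, a + N, b + N, N);
    school_mul(z0, a, b, N);
    school_mul(z1, a1, b1, N);

    u -= sub_n(z1, z0, 2 * N);               /* z1 -= z0                  */
    u -= sub_n(z1, z2, 2 * N);               /* z1 -= z2                  */
    if (cb)                                  /* fold dropped carries back */
        u += add_n(z1 + N, z1 + N, a1, N);
    if (ca)
        u += add_n(z1 + N, z1 + N, b1, N);
    u += add_n(r + N, r + N, z1, 2 * N);     /* r += z1 * B^N             */
    for (int i = 3 * N; u != 0 && i < 4 * N; i++) {
        u += r[i];                           /* propagate the last word   */
        r[i] = (uint32_t)u;
        u >>= 32;
    }
}

int main(void)
{
    uint32_t a[2 * N], b[2 * N], r1[4 * N], r2[4 * N];
    uint32_t s = 1;
    for (int i = 0; i < 2 * N; i++) {        /* deterministic test input  */
        s = s * 1664525u + 1013904223u; a[i] = s;
        s = s * 1664525u + 1013904223u; b[i] = s;
    }
    kara_mul(r1, a, b);
    school_mul(r2, a, b, 2 * N);
    printf("karatsuba %s schoolbook\n", memcmp(r1, r2, sizeof(r1)) ? "!=" : "==");
    return 0;
}
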
- */ -SP_NOINLINE static sp_digit sp_2048_add_to_word_8(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) 
- "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" -#else - "add r3, r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" 
-#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5", "r6" - ); - return (uint32_t)(size_t)r; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_2048_add_8(a1, a, &a[8]); - cb = sp_2048_add_8(b1, b, &b[8]); - u = ca & cb; - sp_2048_mul_8(z1, a1, b1); - sp_2048_mul_8(z2, &a[8], &b[8]); - sp_2048_mul_8(z0, a, b); - sp_2048_mask_8(r + 16, a1, 0 - cb); - sp_2048_mask_8(b1, b1, 0 - ca); - u += sp_2048_add_8(r + 16, r + 16, b1); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); - u += sp_2048_add_16(r + 8, r + 8, z1); - u += sp_2048_add_8(r + 16, r + 16, z2); - (void)sp_2048_add_to_word_8(r + 24, u, z2 + 8); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_2048_dbl_8(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -17014,772 +19360,183 @@ SP_NOINLINE static sp_digit sp_2048_dbl_8(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[16]; + sp_digit* z2 = r + 16; sp_digit z1[16]; - sp_digit a1[8]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 8; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 8); + + mask = sp_2048_sub_8(a1, a, &a[8]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_8(a1, p1, p2); - u = sp_2048_add_8(a1, a, &a[8]); - sp_2048_sqr_8(z1, a1); sp_2048_sqr_8(z2, &a[8]); sp_2048_sqr_8(z0, a); - sp_2048_mask_8(r + 16, a1, 0 - u); - u += sp_2048_dbl_8(r + 16, r + 16); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); - u += sp_2048_add_16(r + 8, r + 8, z1); - u += sp_2048_add_8(r + 16, r + 16, z2); - (void)sp_2048_add_to_word_8(r + 24, u, z2 + 8); + sp_2048_sqr_8(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_16(z1, z2); + u -= sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(r + 8, z1); + sp_2048_add_word_8(r + 24, r + 24, u); } -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
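
/* Illustrative sketch, not part of the patch and not wolfSSL code: the
 * rewritten sp_2048_sqr_16 above computes |a_lo - a_hi| without a branch by
 * turning the borrow of sp_2048_sub_8 into an all-ones mask and bit-selecting
 * between the difference and a block of zeros.  The small program below
 * reproduces that pointer-select idiom on 32-bit digits; sub_b_n and
 * abs_diff_n are hypothetical helpers written for this sketch, and the
 * integer/pointer casts assume a flat pointer representation, as the ARM
 * code does. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* r = x - y over n digits; returns the borrow out (0 or 1). */
static uint32_t sub_b_n(uint32_t* r, const uint32_t* x, const uint32_t* y, int n)
{
    uint32_t borrow = 0;
    for (int i = 0; i < n; i++) {
        uint64_t d = (uint64_t)x[i] - y[i] - borrow;
        r[i]   = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    return borrow;
}

/* d = |x - y| with no data-dependent branch; zero must point at n zero
 * digits.  When x >= y the select leaves the difference alone (d = d - 0);
 * otherwise it negates it (d = 0 - d), as sqr_16/sqr_32 do with p1/p2. */
static void abs_diff_n(uint32_t* d, const uint32_t* x, const uint32_t* y,
                       const uint32_t* zero, int n)
{
    uintptr_t m = (uintptr_t)0 - sub_b_n(d, x, y, n);  /* 0 or all ones */
    const uint32_t* p1 =
        (const uint32_t*)(((uintptr_t)zero &  m) | ((uintptr_t)d & ~m));
    const uint32_t* p2 =
        (const uint32_t*)(((uintptr_t)zero & ~m) | ((uintptr_t)d &  m));
    (void)sub_b_n(d, p1, p2, n);
}

int main(void)
{
    const uint32_t x[4]    = { 5, 0, 0, 1 };
    const uint32_t y[4]    = { 9, 0, 0, 2 };
    const uint32_t zero[4] = { 0, 0, 0, 0 };
    uint32_t d1[4], d2[4];
    abs_diff_n(d1, x, y, zero, 4);
    abs_diff_n(d2, y, x, zero, 4);
    printf("|x-y| == |y-x|: %s\n",
           memcmp(d1, d2, sizeof(d1)) == 0 ? "yes" : "no");
    return 0;
}
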
*/ -SP_NOINLINE static sp_digit sp_2048_add_to_word_16(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, 
r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - 
"adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -17788,211 +19545,6 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. 
- */ -static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<16; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 16; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[32]; - sp_digit a1[16]; - sp_digit b1[16]; - sp_digit z2[32]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_2048_add_16(a1, a, &a[16]); - cb = sp_2048_add_16(b1, b, &b[16]); - u = ca & cb; - sp_2048_mul_16(z1, a1, b1); - sp_2048_mul_16(z2, &a[16], &b[16]); - sp_2048_mul_16(z0, a, b); - sp_2048_mask_16(r + 32, a1, 0 - cb); - sp_2048_mask_16(b1, b1, 0 - ca); - u += sp_2048_add_16(r + 32, r + 32, b1); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); - u += sp_2048_add_32(r + 16, r + 16, z1); - u += sp_2048_add_16(r + 32, r + 32, z2); - (void)sp_2048_add_to_word_16(r + 48, u, z2 + 16); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef 
WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -18001,1460 +19553,319 @@ SP_NOINLINE static sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit z1[32]; - sp_digit a1[16]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 16; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_2048_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_16(a1, p1, p2); - u = sp_2048_add_16(a1, a, &a[16]); - sp_2048_sqr_16(z1, a1); sp_2048_sqr_16(z2, &a[16]); sp_2048_sqr_16(z0, a); - sp_2048_mask_16(r + 32, a1, 0 - u); - u += sp_2048_dbl_16(r + 32, r + 32); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); - u += sp_2048_add_32(r + 16, r + 16, z1); - u += sp_2048_add_16(r + 32, r + 32, z2); - (void)sp_2048_add_to_word_16(r + 48, u, z2 + 16); + sp_2048_sqr_16(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_32(z1, z2); + u -= sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(r + 16, z1); + sp_2048_add_word_16(r + 48, r + 48, u); } -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
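
/* Illustrative sketch, not part of the patch and not wolfSSL code: the
 * rewritten sqr_16/sqr_32 above drop the removed dbl_8/dbl_16 helpers because
 * they use the subtractive form of Karatsuba squaring: with d = |a_hi - a_lo|,
 * the middle term 2*a_lo*a_hi equals a_lo^2 + a_hi^2 - d^2.  The program
 * below only checks that identity on one pair of 32-bit halves with plain
 * uint64_t arithmetic; the patch applies it to 8- and 16-digit (256- and
 * 512-bit) halves. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t B  = (uint64_t)1 << 32;   /* base of one half            */
    const uint64_t lo = 0x89abcdefULL;
    const uint64_t hi = 0x01234567ULL;
    uint64_t z0 = lo * lo;                   /* low square                  */
    uint64_t z2 = hi * hi;                   /* high square                 */
    uint64_t d  = (hi > lo) ? hi - lo : lo - hi;         /* |hi - lo|       */
    uint64_t middle  = z0 + z2 - d * d;      /* should equal 2*lo*hi        */
    uint64_t direct  = (hi * B + lo) * (hi * B + lo);    /* wraps mod 2^64  */
    uint64_t rebuilt = z2 * B * B + middle * B + z0;     /* wraps the same  */

    printf("middle == 2*lo*hi     : %s\n", middle == 2 * lo * hi ? "yes" : "no");
    printf("square rebuilt (2^64) : %s\n", direct == rebuilt     ? "yes" : "no");
    return 0;
}
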
*/ -SP_NOINLINE static sp_digit sp_2048_add_to_word_32(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL 
- "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, 
r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs 
r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - 
"sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, 
r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, 
r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" 
-#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -19463,331 +19874,6 @@ SP_NOINLINE 
static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<32; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 32; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[64]; - sp_digit a1[32]; - sp_digit b1[32]; - sp_digit z2[64]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_2048_add_32(a1, a, &a[32]); - cb = sp_2048_add_32(b1, b, &b[32]); - u = ca & cb; - sp_2048_mul_32(z1, a1, b1); - sp_2048_mul_32(z2, &a[32], &b[32]); - sp_2048_mul_32(z0, a, b); - sp_2048_mask_32(r + 64, a1, 0 - cb); - sp_2048_mask_32(b1, b1, 0 - ca); - u += sp_2048_add_32(r + 64, r + 64, b1); - u += sp_2048_sub_in_place_64(z1, z2); - u += sp_2048_sub_in_place_64(z1, z0); - u += sp_2048_add_64(r + 32, r + 32, z1); - u += sp_2048_add_32(r + 64, r + 64, z2); - (void)sp_2048_add_to_word_32(r + 96, u, z2 + 32); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL 
- "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
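[Note on the squaring hunks, for readers of this patch: the sp_2048_sqr_32 hunk above and the sp_2048_sqr_64 hunk that follows replace the add-based Karatsuba squaring with one built on the absolute difference of the two halves, so the separate doubling pass (sp_2048_dbl_16/_32) is no longer needed. The cross term is recovered from three squares via 2*aH*aL = aL^2 + aH^2 - (aL - aH)^2, and |aL - aH| is selected branch-free from the borrow mask through the p1/p2 pointer masking. The standalone sketch below is illustrative only and is not part of the patch; the single-word 16-bit halves, the main() driver and its variable names are made up for the demo, assuming only the identity used by the rewritten functions.]

/* Illustrative sketch only -- not part of the patch. A minimal check of the
 * identity the rewritten sp_2048_sqr_32/sp_2048_sqr_64 rely on:
 *   (aH*B + aL)^2 = aH^2*B^2 + (aL^2 + aH^2 - (aL - aH)^2)*B + aL^2
 * with |aL - aH| chosen branch-free from a borrow mask, mirroring the
 * p1/p2 pointer masking in the new code. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t aL = 0x9abcULL;          /* arbitrary low half (demo value) */
    uint64_t aH = 0x1234ULL;          /* arbitrary high half (demo value) */
    uint64_t B  = 1ULL << 16;         /* base the halves are split at */

    /* mask is all ones when aL - aH would borrow, zero otherwise. */
    uint64_t mask  = (uint64_t)0 - (aL < aH);
    uint64_t d     = ((aL - aH) & ~mask) | ((aH - aL) & mask); /* |aL - aH| */
    uint64_t cross = aL * aL + aH * aH - d * d;                /* 2*aL*aH  */

    uint64_t x = aH * B + aL;
    printf("direct    = 0x%llx\n", (unsigned long long)(x * x));
    printf("karatsuba = 0x%llx\n",
        (unsigned long long)(aH * aH * B * B + cross * B + aL * aL));
    return 0;
}

[With the cross term formed this way, the new sqr code folds the running borrow/carry u through the sub_in_place/add_word calls visible in the hunks instead of masking and doubling the summed half, which is the shape of the change below for the 64-word case as well.]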
@@ -19796,22 +19882,31 @@ SP_NOINLINE static sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit z1[64]; - sp_digit a1[32]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 32; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 32); + + mask = sp_2048_sub_32(a1, a, &a[32]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_32(a1, p1, p2); - u = sp_2048_add_32(a1, a, &a[32]); - sp_2048_sqr_32(z1, a1); sp_2048_sqr_32(z2, &a[32]); sp_2048_sqr_32(z0, a); - sp_2048_mask_32(r + 64, a1, 0 - u); - u += sp_2048_dbl_32(r + 64, r + 64); - u += sp_2048_sub_in_place_64(z1, z2); - u += sp_2048_sub_in_place_64(z1, z0); - u += sp_2048_add_64(r + 32, r + 32, z1); - u += sp_2048_add_32(r + 64, r + 64, z2); - (void)sp_2048_add_to_word_32(r + 96, u, z2 + 32); + sp_2048_sqr_32(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_64(z1, z2); + u -= sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(r + 32, z1); + sp_2048_add_word_32(r + 96, r + 96, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -22890,7 +22985,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_32(r, a, b); @@ -22904,7 +22999,7 @@ static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_32(r, a); @@ -23097,11 +23192,11 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -23726,7 +23821,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_2048_word_32(hi, t1[32 + i - 1], div); @@ -25801,7 +25896,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_64(r, a, b); @@ -25815,7 +25910,7 @@ static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_64(r, a); @@ -26464,11 +26559,11 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -26984,9 +27079,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + for (i = 63; i >= 0; i--) { + if (t1[64 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_2048_word_64(t1[64 + i], t1[64 + i - 1], div); + } sp_2048_mul_d_64(t2, d, r1); t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); @@ -27184,7 +27283,7 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { + for (i = 63; i >= 0; i--) { sp_digit hi = t1[64 + i] - (t1[64 + i] == div); r1 = div_2048_word_64(hi, t1[64 + i - 1], div); @@ -27570,9 +27669,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 64; r = a + 64 * 2; m = r + 64 * 2; - ah = a + 64; sp_2048_from_bin(ah, 64, in, inLen); #if DIGIT_BIT >= 32 @@ -27590,7 +27689,38 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_2048_from_mp(m, 64, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 64); + err = sp_2048_mod_64_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_64(r, r, ah, m, mp); + + for (i = 63; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_2048_sub_in_place_64(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_2048_sqr_64(r, ah); err = sp_2048_mod_64_cond(r, r, m); @@ -27618,7 +27748,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 64); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_2048_mont_sqr_64(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_2048_mont_mul_64(r, r, a, m, mp); @@ -50454,6 +50584,4148 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, } #endif /* !WOLFSSL_SP_LARGE_CODE */ +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_add_word_12(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, 
r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, 
r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<12; i++) { + r[i] = a[i] & m; + } +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; + r[8] = a[8] & m; + r[9] = a[9] & m; + r[10] = a[10] & m; + r[11] = a[11] & m; +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit* z2 = r + 24; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mul_12(z1, a1, b1); + + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(z1, z2); + sp_3072_mask_12(a1, a1, 0 - cb); + u += sp_3072_add_12(z1 + 12, z1 + 12, a1); + sp_3072_mask_12(b1, b1, 0 - ca); + u += sp_3072_add_12(z1 + 12, z1 + 12, b1); + + u += sp_3072_add_24(r + 12, r + 12, z1); + (void)sp_3072_add_word_12(r + 36, r + 36, u); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_add_word_24(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL 
+ "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs 
r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + 
"sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, 
r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + 
"stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit b1[24]; + sp_digit* z2 = r + 48; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_24(a1, a, &a[24]); + cb = sp_3072_add_24(b1, b, &b[24]); + u = ca & cb; + + sp_3072_mul_24(z2, &a[24], &b[24]); + sp_3072_mul_24(z0, a, b); + sp_3072_mul_24(z1, a1, b1); + + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(z1, z2); + sp_3072_mask_24(a1, a1, 0 - cb); + u += sp_3072_add_24(z1 + 24, z1 + 24, a1); + sp_3072_mask_24(b1, b1, 0 - ca); + u += sp_3072_add_24(z1 + 24, z1 + 24, b1); + + u += sp_3072_add_48(r + 24, r + 24, z1); + (void)sp_3072_add_word_24(r + 72, r + 72, u); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_add_word_48(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + 
"adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, 
r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r2, r2, r4\n\t" +#else + "sub r2, r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, 
r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + 
"sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" 
+#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, 
r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + "sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" + "ldm %[b]!, {r4, r5}\n\t" + "ldr r2, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r2, r2, r4\n\t" +#elif defined(__clang__) + 
"sbcs r2, r4\n\t" +#else + "sbc r2, r4\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif + "stm %[a]!, {r2, r3}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[a], %[a], %[a]\n\t" +#elif defined(__clang__) + "sbcs %[a], %[a]\n\t" +#else + "sbc %[a], %[a]\n\t" +#endif + : [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, 
r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, 
{r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + 
"adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs 
r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r6\n\t" +#elif defined(__clang__) + "adcs r4, r6\n\t" +#else + "adc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<48; i++) { + r[i] = a[i] & m; + } +#else + int i; + + for (i = 0; i < 48; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[96]; + sp_digit a1[48]; + sp_digit b1[48]; + sp_digit* z2 = r + 96; + sp_digit u; + sp_digit ca; + sp_digit cb; + + ca = sp_3072_add_48(a1, a, &a[48]); + cb = sp_3072_add_48(b1, b, &b[48]); + u = ca & cb; + + sp_3072_mul_48(z2, &a[48], &b[48]); + sp_3072_mul_48(z0, a, b); + sp_3072_mul_48(z1, a1, b1); + + u += sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(z1, z2); + sp_3072_mask_48(a1, a1, 0 - cb); + u += sp_3072_add_48(z1 + 48, z1 + 48, a1); + sp_3072_mask_48(b1, b1, 0 - ca); + u += sp_3072_add_48(z1 + 48, z1 + 48, b1); + + u += sp_3072_add_96(r + 48, r + 48, z1); + (void)sp_3072_add_word_48(r + 144, r + 144, u); +} + #ifndef WOLFSSL_SP_LARGE_CODE /* Square a and put result in r. (r = a * a) * @@ -65650,123 +69922,122 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) } #endif /* !WOLFSSL_SP_LARGE_CODE */ -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, 
r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" #ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs %[r], %[r]\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -65775,765 +70046,6 @@ SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_add_to_word_12(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, 
r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" -#else - "add r3, r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, 
r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5", "r6" - ); - return (uint32_t)(size_t)r; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<12; i++) { - r[i] = a[i] & m; - } -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; - r[8] = a[8] & m; - r[9] = a[9] & m; - r[10] = a[10] & m; - r[11] = a[11] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[24]; - sp_digit a1[12]; - sp_digit b1[12]; - sp_digit z2[24]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_12(a1, a, &a[12]); - cb = sp_3072_add_12(b1, b, &b[12]); - u = ca & cb; - sp_3072_mul_12(z1, a1, b1); - sp_3072_mul_12(z2, &a[12], &b[12]); - sp_3072_mul_12(z0, a, b); - sp_3072_mask_12(r + 24, a1, 0 - cb); - sp_3072_mask_12(b1, b1, 0 - ca); - u += sp_3072_add_12(r + 24, r + 24, b1); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - u += sp_3072_add_12(r + 24, r + 24, z2); - (void)sp_3072_add_to_word_12(r + 36, u, z2 + 12); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -66542,1116 +70054,251 @@ SP_NOINLINE static sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[24]; + sp_digit* z2 = r + 24; sp_digit z1[24]; - sp_digit a1[12]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 12; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 12); + + mask = sp_3072_sub_12(a1, a, &a[12]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_12(a1, p1, p2); - u = sp_3072_add_12(a1, a, &a[12]); - sp_3072_sqr_12(z1, a1); sp_3072_sqr_12(z2, &a[12]); sp_3072_sqr_12(z0, a); - sp_3072_mask_12(r + 24, a1, 0 - u); - u += sp_3072_dbl_12(r + 24, r + 24); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - u += sp_3072_add_12(r + 24, r + 24, z2); - (void)sp_3072_add_to_word_12(r + 36, u, z2 + 12); + sp_3072_sqr_12(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_24(z1, z2); + u -= sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(r + 12, z1); + sp_3072_add_word_12(r + 36, r + 36, u); } -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_to_word_24(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, 
r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, 
r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs 
r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - 
"adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef 
WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -67660,271 +70307,6 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<24; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 24; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. 
- * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[48]; - sp_digit a1[24]; - sp_digit b1[24]; - sp_digit z2[48]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_24(a1, a, &a[24]); - cb = sp_3072_add_24(b1, b, &b[24]); - u = ca & cb; - sp_3072_mul_24(z1, a1, b1); - sp_3072_mul_24(z2, &a[24], &b[24]); - sp_3072_mul_24(z0, a, b); - sp_3072_mask_24(r + 48, a1, 0 - cb); - sp_3072_mask_24(b1, b1, 0 - ca); - u += sp_3072_add_24(r + 48, r + 48, b1); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - u += sp_3072_add_24(r + 48, r + 48, z2); - (void)sp_3072_add_to_word_24(r + 72, u, z2 + 24); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - 
"adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -67933,2148 +70315,455 @@ SP_NOINLINE static sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[48]; + sp_digit* z2 = r + 48; sp_digit z1[48]; - sp_digit a1[24]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 24; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 24); + + mask = sp_3072_sub_24(a1, a, &a[24]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_24(a1, p1, p2); - u = sp_3072_add_24(a1, a, &a[24]); - sp_3072_sqr_24(z1, a1); sp_3072_sqr_24(z2, &a[24]); sp_3072_sqr_24(z0, a); - sp_3072_mask_24(r + 48, a1, 0 - u); - u += sp_3072_dbl_24(r + 48, r + 48); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - u += sp_3072_add_24(r + 48, r + 48, z2); - (void)sp_3072_add_to_word_24(r + 72, u, z2 + 24); + sp_3072_sqr_24(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_48(z1, z2); + u -= sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(r + 24, z1); + sp_3072_add_word_24(r + 72, r + 72, u); } -/* Add b to a into r. (r = a + b) +/* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_add_to_word_48(sp_digit* r, sp_digit a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" -#else - "add r3, r3, %[a]\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL 
- "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, 
r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r5\n\t" -#elif defined(__clang__) - "adcs r4, r5\n\t" -#else - "adc r4, r5\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "subs r2, r2, r4\n\t" -#else - "sub r2, r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, 
r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - 
"sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" 
-#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, 
r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - "sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" - "ldm %[b]!, {r4, r5}\n\t" - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#ifdef WOLFSSL_KEIL - "sbcs r2, r2, r4\n\t" -#elif defined(__clang__) - 
"sbcs r2, r4\n\t" -#else - "sbc r2, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "sbcs r3, r3, r5\n\t" -#elif defined(__clang__) - "sbcs r3, r5\n\t" -#else - "sbc r3, r5\n\t" -#endif - "stm %[a]!, {r2, r3}\n\t" -#ifdef WOLFSSL_KEIL - "sbcs %[a], %[a], %[a]\n\t" -#elif defined(__clang__) - "sbcs %[a], %[a]\n\t" -#else - "sbc %[a], %[a]\n\t" -#endif - : [a] "+l" (a), [b] "+l" (b) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)a; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, r5\n\t" + "subs r3, r3, r5\n\t" #else - "add r3, r3, r5\n\t" + "sub r3, r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, 
r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, 
r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - 
"adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" "ldm %[b]!, {r5, r6}\n\t" "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs r3, r3, r5\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs r3, r5\n\t" #else - "adc r3, r5\n\t" + "sbc r3, r5\n\t" #endif #ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" + "sbcs r4, r4, r6\n\t" #elif defined(__clang__) - "adcs r4, r6\n\t" + "sbcs r4, r6\n\t" #else - "adc r4, r6\n\t" + "sbc r4, r6\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" + "sbcs %[r], %[r], %[r]\n\t" #elif defined(__clang__) - "adcs r3, r5\n\t" + "sbcs %[r], %[r]\n\t" #else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, 
{r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" 
-#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "ldm %[a]!, {r3, r4}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r5\n\t" -#elif defined(__clang__) - "adcs r3, r5\n\t" -#else - "adc r3, r5\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r6\n\t" -#elif defined(__clang__) - "adcs r4, r6\n\t" -#else - "adc r4, r6\n\t" -#endif - "stm %[r]!, {r3, r4}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" + "sbc %[r], %[r]\n\t" #endif : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) : @@ -70083,451 +70772,6 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<48; i++) { - r[i] = a[i] & m; - } -#else - int i; - - for (i = 0; i < 48; i += 8) { - r[i+0] = a[i+0] & m; - r[i+1] = a[i+1] & m; - r[i+2] = a[i+2] & m; - r[i+3] = a[i+3] & m; - r[i+4] = a[i+4] & m; - r[i+5] = a[i+5] & m; - r[i+6] = a[i+6] & m; - r[i+7] = a[i+7] & m; - } -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[96]; - sp_digit a1[48]; - sp_digit b1[48]; - sp_digit z2[96]; - sp_digit u; - sp_digit ca; - sp_digit cb; - - ca = sp_3072_add_48(a1, a, &a[48]); - cb = sp_3072_add_48(b1, b, &b[48]); - u = ca & cb; - sp_3072_mul_48(z1, a1, b1); - sp_3072_mul_48(z2, &a[48], &b[48]); - sp_3072_mul_48(z0, a, b); - sp_3072_mask_48(r + 96, a1, 0 - cb); - sp_3072_mask_48(b1, b1, 0 - ca); - u += sp_3072_add_48(r + 96, r + 96, b1); - u += sp_3072_sub_in_place_96(z1, z2); - u += sp_3072_sub_in_place_96(z1, z0); - u += sp_3072_add_96(r + 48, r + 48, z1); - u += sp_3072_add_48(r + 96, r + 96, z2); - (void)sp_3072_add_to_word_48(r + 144, u, z2 + 48); -} - -/* Double a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_dbl_48(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r2, r2, r2\n\t" -#else - "add r2, r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef 
WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif 
-#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "ldm %[a]!, {r2, r3, r4, r5}\n\t" -#ifdef WOLFSSL_KEIL - "adcs r2, r2, r2\n\t" -#elif defined(__clang__) - "adcs r2, r2\n\t" -#else - "adc r2, r2\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r3, r3, r3\n\t" -#elif defined(__clang__) - "adcs r3, r3\n\t" -#else - "adc r3, r3\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r4, r4, r4\n\t" -#elif defined(__clang__) - "adcs r4, r4\n\t" -#else - "adc r4, r4\n\t" -#endif -#ifdef WOLFSSL_KEIL - "adcs r5, r5, r5\n\t" -#elif defined(__clang__) - "adcs r5, r5\n\t" -#else - "adc r5, r5\n\t" -#endif - "stm %[r]!, {r2, r3, r4, r5}\n\t" - "movs %[r], #0\n\t" -#ifdef WOLFSSL_KEIL - "adcs %[r], %[r], %[r]\n\t" -#elif defined(__clang__) - "adcs %[r], %[r]\n\t" -#else - "adc %[r], %[r]\n\t" -#endif - : [r] "+l" (r), [a] "+l" (a) - : - : "memory", "r2", "r3", "r4", "r5" - ); - return (uint32_t)(size_t)r; -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -70536,22 +70780,31 @@ SP_NOINLINE static sp_digit sp_3072_dbl_48(sp_digit* r, const sp_digit* a) SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[96]; + sp_digit* z2 = r + 96; sp_digit z1[96]; - sp_digit a1[48]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 48; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 48); + + mask = sp_3072_sub_48(a1, a, &a[48]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_48(a1, p1, p2); - u = sp_3072_add_48(a1, a, &a[48]); - sp_3072_sqr_48(z1, a1); sp_3072_sqr_48(z2, &a[48]); sp_3072_sqr_48(z0, a); - sp_3072_mask_48(r + 96, a1, 0 - u); - u += sp_3072_dbl_48(r + 96, r + 96); - u += sp_3072_sub_in_place_96(z1, z2); - u += sp_3072_sub_in_place_96(z1, z0); - u += sp_3072_add_96(r + 48, r + 48, z1); - u += sp_3072_add_48(r + 96, r + 96, z2); - (void)sp_3072_add_to_word_48(r + 144, u, z2 + 48); + sp_3072_sqr_48(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_96(z1, z2); + u -= sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(r + 48, z1); + sp_3072_add_word_48(r + 144, r + 144, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -73942,7 +74195,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_48(r, a, b); @@ -73956,7 +74209,7 @@ static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_48(r, a); @@ -74149,11 +74402,11 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. 
* * Note that this is an approximate div. It may give an answer 1 larger. @@ -74778,7 +75031,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { + for (i = 47; i >= 0; i--) { sp_digit hi = t1[48 + i] - (t1[48 + i] == div); r1 = div_3072_word_48(hi, t1[48 + i - 1], div); @@ -77407,7 +77660,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_96(r, a, b); @@ -77421,7 +77674,7 @@ static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_96(r, a); @@ -78342,11 +78595,11 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -78862,9 +79115,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + for (i = 95; i >= 0; i--) { + if (t1[96 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_3072_word_96(t1[96 + i], t1[96 + i - 1], div); + } sp_3072_mul_d_96(t2, d, r1); t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); @@ -79067,7 +79324,7 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { + for (i = 95; i >= 0; i--) { sp_digit hi = t1[96 + i] - (t1[96 + i] == div); r1 = div_3072_word_96(hi, t1[96 + i - 1], div); @@ -79453,9 +79710,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 96; r = a + 96 * 2; m = r + 96 * 2; - ah = a + 96; sp_3072_from_bin(ah, 96, in, inLen); #if DIGIT_BIT >= 32 @@ -79473,7 +79730,38 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_3072_from_mp(m, 96, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + + /* Convert to Montgomery form. 
*/ + XMEMSET(a, 0, sizeof(sp_digit) * 96); + err = sp_3072_mod_96_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_96(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_96(r, r, ah, m, mp); + + for (i = 95; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_3072_sub_in_place_96(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_3072_sqr_96(r, ah); err = sp_3072_mod_96_cond(r, r, m); @@ -79501,7 +79789,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 96); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_3072_mont_sqr_96(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_3072_mont_mul_96(r, r, a, m, mp); @@ -82715,16 +83003,16 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, - const sp_digit* b) +SP_NOINLINE static sp_digit sp_4096_add_word_64(sp_digit* r, const sp_digit* a, + sp_digit b) { __asm__ __volatile__ ( "movs r5, #0\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #if defined(__clang__) || defined(WOLFSSL_KEIL) - "adds r3, r3, %[a]\n\t" + "adds r3, r3, %[b]\n\t" #else - "add r3, r3, %[a]\n\t" + "add r3, r3, %[b]\n\t" #endif #ifdef WOLFSSL_KEIL "adcs r4, r4, r5\n\t" @@ -82734,7 +83022,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82750,7 +83038,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82766,7 +83054,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82782,7 +83070,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82798,7 +83086,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82814,7 +83102,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82830,7 +83118,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82846,7 +83134,7 @@ SP_NOINLINE static sp_digit 
sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82862,7 +83150,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82878,7 +83166,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82894,7 +83182,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82910,7 +83198,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82926,7 +83214,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82942,7 +83230,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82958,7 +83246,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82974,7 +83262,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -82990,7 +83278,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83006,7 +83294,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83022,7 +83310,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83038,7 +83326,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83054,7 +83342,7 @@ SP_NOINLINE static sp_digit 
sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83070,7 +83358,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83086,7 +83374,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83102,7 +83390,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83118,7 +83406,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83134,7 +83422,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83150,7 +83438,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83166,7 +83454,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83182,7 +83470,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83198,7 +83486,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -83214,7 +83502,7 @@ SP_NOINLINE static sp_digit sp_4096_add_to_word_64(sp_digit* r, sp_digit a, "adc r4, r5\n\t" #endif "stm %[r]!, {r3, r4}\n\t" - "ldm %[b]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" #ifdef WOLFSSL_KEIL "adcs r3, r3, r5\n\t" #elif defined(__clang__) @@ -85542,7 +85830,7 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, sp_digit z1[128]; sp_digit a1[64]; sp_digit b1[64]; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit u; sp_digit ca; sp_digit cb; @@ -85550,17 +85838,20 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, ca = sp_2048_add_64(a1, a, &a[64]); cb = sp_2048_add_64(b1, b, &b[64]); u = ca & cb; - sp_2048_mul_64(z1, a1, b1); + sp_2048_mul_64(z2, &a[64], &b[64]); sp_2048_mul_64(z0, a, b); - sp_2048_mask_64(r + 128, a1, 0 - cb); - sp_2048_mask_64(b1, b1, 0 - ca); - u += 
sp_2048_add_64(r + 128, r + 128, b1); - u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mul_64(z1, a1, b1); + u += sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mask_64(a1, a1, 0 - cb); + u += sp_2048_add_64(z1 + 64, z1 + 64, a1); + sp_2048_mask_64(b1, b1, 0 - ca); + u += sp_2048_add_64(z1 + 64, z1 + 64, b1); + u += sp_4096_add_128(r + 64, r + 64, z1); - u += sp_4096_add_64(r + 128, r + 128, z2); - (void)sp_4096_add_to_word_64(r + 192, u, z2 + 64); + (void)sp_4096_add_word_64(r + 192, r + 192, u); } /* Square a and put result in r. (r = a * a) @@ -85571,22 +85862,31 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit z1[128]; - sp_digit a1[64]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 64; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 64); + + mask = sp_2048_sub_64(a1, a, &a[64]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_64(a1, p1, p2); - u = sp_2048_add_64(a1, a, &a[64]); - sp_2048_sqr_64(z1, a1); sp_2048_sqr_64(z2, &a[64]); sp_2048_sqr_64(z0, a); - sp_2048_mask_64(r + 128, a1, 0 - u); - u += sp_2048_dbl_64(r + 128, r + 128); - u += sp_4096_sub_in_place_128(z1, z2); - u += sp_4096_sub_in_place_128(z1, z0); - u += sp_4096_add_128(r + 64, r + 64, z1); - u += sp_4096_add_64(r + 128, r + 128, z2); - (void)sp_4096_add_to_word_64(r + 192, u, z2 + 64); + sp_2048_sqr_64(z1, a1); + + u = 0; + u -= sp_4096_sub_in_place_128(z1, z2); + u -= sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(r + 64, z1); + sp_4096_add_word_64(r + 192, r + 192, u); } #endif /* !WOLFSSL_SP_SMALL */ @@ -89501,7 +89801,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_128(r, a, b); @@ -89515,7 +89815,7 @@ static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_128(r, a); @@ -90708,11 +91008,11 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
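The rewritten Karatsuba squarings above (sp_3072_sqr_96, sp_4096_sqr_128 and, further down, sp_1024_sqr_32) no longer square a masked copy of a_lo + a_hi; they form the absolute difference |a_lo - a_hi| with a constant-time select, square that, and recover the middle term as z0 + z2 - z1. Because the difference always fits in the half width, the old doubling pass and add_to_word fix-up disappear; only a small signed carry has to be folded into the top block. A minimal standalone sketch of that recombination follows, with hypothetical helper names, 32-bit digits and a tiny half size N (an illustration of the technique, not the generated wolfSSL routines; it also assumes arithmetic right shift of negative int64_t, as on GCC/Clang):

#include <stdint.h>
#include <string.h>

#define N 4                              /* half length in 32-bit digits (example) */

/* r = a - b over N digits; returns 0 or an all-ones borrow mask. */
static uint32_t sub_n(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
    uint64_t t;
    uint64_t borrow = 0;
    int i;
    for (i = 0; i < N; i++) {
        t = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;
    }
    return (uint32_t)0 - (uint32_t)borrow;   /* 0xffffffff when a < b */
}

/* Schoolbook square: r[0..2N) = a[0..N)^2. */
static void sqr_n(uint32_t* r, const uint32_t* a)
{
    uint64_t t;
    uint64_t c;
    int i;
    int j;
    memset(r, 0, 2 * N * sizeof(uint32_t));
    for (i = 0; i < N; i++) {
        c = 0;
        for (j = 0; j < N; j++) {
            t = (uint64_t)a[i] * a[j] + r[i + j] + c;
            r[i + j] = (uint32_t)t;
            c = t >> 32;
        }
        r[i + N] = (uint32_t)c;          /* r[i + N] has not been written yet */
    }
}

/* r[0..4N) = a[0..2N)^2 using |a_lo - a_hi| for the middle term. */
static void sqr_karatsuba(uint32_t* r, const uint32_t* a)
{
    uint32_t d[N];                       /* |a_lo - a_hi| */
    uint32_t z1[2 * N];                  /* (a_lo - a_hi)^2, then z1 - z0 - z2 */
    uint32_t mask;
    uint64_t t;
    uint64_t c;
    int64_t acc;
    int64_t bor;
    int i;

    /* d = |a_lo - a_hi|: subtract, then conditionally negate on the mask. */
    mask = sub_n(d, a, a + N);
    c = mask & 1;
    for (i = 0; i < N; i++) {
        t = (uint64_t)(d[i] ^ mask) + c;
        d[i] = (uint32_t)t;
        c = t >> 32;
    }

    sqr_n(r, a);                         /* z0 into r[0..2N)  */
    sqr_n(r + 2 * N, a + N);             /* z2 into r[2N..4N) */
    sqr_n(z1, d);                        /* z1 = (a_lo - a_hi)^2 */

    /* z1 := z1 - z0 - z2 = -(2 * a_lo * a_hi); bor tracks the net borrow. */
    acc = 0;
    for (i = 0; i < 2 * N; i++) {
        acc += (int64_t)z1[i] - r[i] - r[2 * N + i];
        z1[i] = (uint32_t)acc;
        acc >>= 32;
    }
    bor = acc;

    /* r[N..3N) -= z1 adds the positive middle term 2 * a_lo * a_hi. */
    acc = 0;
    for (i = 0; i < 2 * N; i++) {
        acc += (int64_t)r[N + i] - z1[i];
        r[N + i] = (uint32_t)acc;
        acc >>= 32;
    }
    /* Fold the net carry into the top block, like sp_xxxx_add_word_NN does. */
    acc -= bor;
    for (i = 3 * N; (acc != 0) && (i < 4 * N); i++) {
        acc += r[i];
        r[i] = (uint32_t)acc;
        acc >>= 32;
    }
}

The constant-time negate here plays the same role as the p1/p2 pointer selection in the patch: no data-dependent branch is taken on the sign of a_lo - a_hi.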
@@ -91228,9 +91528,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + for (i = 127; i >= 0; i--) { + if (t1[128 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_4096_word_128(t1[128 + i], t1[128 + i - 1], div); + } sp_4096_mul_d_128(t2, d, r1); t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); @@ -91434,7 +91738,7 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { + for (i = 127; i >= 0; i--) { sp_digit hi = t1[128 + i] - (t1[128 + i] == div); r1 = div_4096_word_128(hi, t1[128 + i - 1], div); @@ -91820,9 +92124,9 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 128; r = a + 128 * 2; m = r + 128 * 2; - ah = a + 128; sp_4096_from_bin(ah, 128, in, inLen); #if DIGIT_BIT >= 32 @@ -91840,7 +92144,38 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_4096_from_mp(m, 128, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 128); + err = sp_4096_mod_128_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_128(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_128(r, r, ah, m, mp); + + for (i = 127; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_4096_sub_in_place_128(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_4096_sqr_128(r, ah); err = sp_4096_mod_128_cond(r, r, m); @@ -91868,7 +92203,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 128); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_4096_mont_sqr_128(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_4096_mont_mul_128(r, r, a, m, mp); @@ -98051,7 +98386,7 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_256_mul_8(r, a, b); @@ -98065,7 +98400,7 @@ static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_256_sqr_8(r, a); @@ -103623,11 +103958,11 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
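The new e[0] == 0x10001 branch relies on 0x10001 being 2^16 + 1: the base is converted to Montgomery form once, squared sixteen times, and finished with a single Montgomery multiplication by the untransformed input ah, which also strips the remaining R factor (the mont_red comments above spell this out). A toy sketch of the exponent structure only, using plain 64-bit arithmetic (GCC/Clang __int128 assumed; this is not the wolfSSL code path, which works on full-width Montgomery operands):

#include <stdint.h>

/* (a * b) mod n using a 128-bit intermediate. */
static uint64_t mulmod64(uint64_t a, uint64_t b, uint64_t n)
{
    return (uint64_t)(((unsigned __int128)a * b) % n);
}

/* x^0x10001 mod n: sixteen squarings, then one multiply by x. */
static uint64_t pow_f4(uint64_t x, uint64_t n)
{
    uint64_t r = x % n;
    int i;
    for (i = 0; i < 16; i++) {           /* after the loop: r = x^(2^16) mod n */
        r = mulmod64(r, r, n);
    }
    return mulmod64(r, x % n, n);        /* x^(2^16 + 1) mod n */
}

Compared with the generic bit-scanning loop kept for other exponents, the squaring count is fixed for the common public exponent, and the result only needs the compare-and-subtract against m shown after the multiplication in the patch.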
@@ -104169,7 +104504,7 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit div = d[7]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 8); - for (i=7; i>=0; i--) { + for (i = 7; i >= 0; i--) { sp_digit hi = t1[8 + i] - (t1[8 + i] == div); r1 = div_256_word_8(hi, t1[8 + i - 1], div); @@ -109178,7 +109513,7 @@ SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_12(r, a, b); @@ -109192,7 +109527,7 @@ static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_12(r, a); @@ -114127,11 +114462,11 @@ SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -114677,7 +115012,7 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi div = d[11]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 12); - for (i=11; i>=0; i--) { + for (i = 11; i >= 0; i--) { sp_digit hi = t1[12 + i] - (t1[12 + i] == div); r1 = div_384_word_12(hi, t1[12 + i - 1], div); @@ -121218,7 +121553,7 @@ SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_521_mul_17(r, a, b); @@ -121232,7 +121567,7 @@ static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_521_sqr_17(r, a); @@ -130014,11 +130349,11 @@ SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -130574,7 +130909,7 @@ static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digi sp_521_lshift_17(sd, d, 23); sp_521_lshift_34(t1, t1, 23); - for (i=16; i>=0; i--) { + for (i = 16; i >= 0; i--) { sp_digit hi = t1[17 + i] - (t1[17 + i] == div); r1 = div_521_word_17(hi, t1[17 + i - 1], div); @@ -197571,6 +197906,158 @@ SP_NOINLINE static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a, return (uint32_t)(size_t)r; } +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. 
+ * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_1024_add_word_16(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + __asm__ __volatile__ ( + "movs r5, #0\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, %[b]\n\t" +#else + "add r3, r3, %[b]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "adcs r3, r3, r5\n\t" +#elif defined(__clang__) + "adcs r3, r5\n\t" +#else + "adc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r4, r4, r5\n\t" +#elif defined(__clang__) + "adcs r4, r5\n\t" +#else + "adc r4, r5\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "movs %[r], #0\n\t" +#ifdef WOLFSSL_KEIL + "adcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "adcs %[r], %[r]\n\t" +#else + "adc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)r; +} + /* Sub b from a into a. (a -= b) * * a A single precision integer. 
@@ -198218,7 +198705,7 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit u; sp_digit ca; sp_digit cb; @@ -198226,17 +198713,178 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, ca = sp_1024_add_16(a1, a, &a[16]); cb = sp_1024_add_16(b1, b, &b[16]); u = ca & cb; - sp_1024_mul_16(z1, a1, b1); + sp_1024_mul_16(z2, &a[16], &b[16]); sp_1024_mul_16(z0, a, b); - sp_1024_mask_16(r + 32, a1, 0 - cb); - sp_1024_mask_16(b1, b1, 0 - ca); - u += sp_1024_add_16(r + 32, r + 32, b1); - u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mul_16(z1, a1, b1); + u += sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mask_16(a1, a1, 0 - cb); + u += sp_1024_add_16(z1 + 16, z1 + 16, a1); + sp_1024_mask_16(b1, b1, 0 - ca); + u += sp_1024_add_16(z1 + 16, z1 + 16, b1); + u += sp_1024_add_32(r + 16, r + 16, z1); - u += sp_1024_add_16(r + 32, r + 32, z2); - (void)sp_1024_add_to_word_16(r + 48, u, z2 + 16); + (void)sp_1024_add_word_16(r + 48, r + 48, u); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "subs r3, r3, r5\n\t" +#else + "sub r3, r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, 
r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" + "ldm %[b]!, {r5, r6}\n\t" + "ldm %[a]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs r3, r3, r5\n\t" +#elif defined(__clang__) + "sbcs r3, r5\n\t" +#else + "sbc r3, r5\n\t" +#endif +#ifdef WOLFSSL_KEIL + "sbcs r4, r4, r6\n\t" +#elif defined(__clang__) + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif + "stm %[r]!, {r3, r4}\n\t" +#ifdef WOLFSSL_KEIL + "sbcs %[r], %[r], %[r]\n\t" +#elif defined(__clang__) + "sbcs %[r], %[r]\n\t" +#else + "sbc %[r], %[r]\n\t" +#endif + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -198247,22 +198895,31 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit z1[32]; - sp_digit a1[16]; + sp_digit* a1 = z1; + sp_digit* zero = z1 + 16; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_1024_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_1024_sub_16(a1, p1, p2); - u = sp_1024_add_16(a1, a, &a[16]); - sp_1024_sqr_16(z1, a1); sp_1024_sqr_16(z2, &a[16]); sp_1024_sqr_16(z0, a); - sp_1024_mask_16(r + 32, a1, 0 - u); - u += sp_1024_dbl_16(r + 32, r + 32); - u += sp_1024_sub_in_place_32(z1, z2); - u += sp_1024_sub_in_place_32(z1, z0); - u += sp_1024_add_32(r + 16, r + 16, z1); - u += sp_1024_add_16(r + 32, r + 32, z2); - (void)sp_1024_add_to_word_16(r + 48, u, z2 + 16); + sp_1024_sqr_16(z1, a1); + + u = 0; + u -= sp_1024_sub_in_place_32(z1, z2); + u -= sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(r + 16, z1); + sp_1024_add_word_16(r + 48, r + 48, u); } #else @@ -199426,11 +200083,11 @@ SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -200085,7 +200742,7 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_1024_word_32(hi, t1[32 + i - 1], div); @@ -201478,7 +202135,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_32(r, a, b); @@ -201492,7 +202149,7 @@ static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_32(r, a); diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 0f4ed9320..736424df2 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -499,118 +499,6 @@ SP_NOINLINE static void sp_2048_mul_12(sp_digit* r, const sp_digit* a, r[22] = t22 & 0x1fffffff; } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_12(sp_digit* r, const sp_digit* a) -{ - sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; - sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; - sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 - + ((sp_uint64)a[ 1]) * a[ 1]; - sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] - + ((sp_uint64)a[ 1]) * a[ 2]) * 2; - sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] - + ((sp_uint64)a[ 1]) * a[ 3]) * 2 - + ((sp_uint64)a[ 2]) * a[ 2]; - sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] - + ((sp_uint64)a[ 1]) * a[ 4] - + ((sp_uint64)a[ 2]) * a[ 3]) * 2; - sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] - + ((sp_uint64)a[ 1]) * a[ 5] - + ((sp_uint64)a[ 2]) * a[ 4]) * 2 - + ((sp_uint64)a[ 3]) * a[ 3]; - sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] - + ((sp_uint64)a[ 1]) * a[ 6] - + ((sp_uint64)a[ 2]) * a[ 5] - + ((sp_uint64)a[ 3]) * a[ 4]) * 2; - sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] - + ((sp_uint64)a[ 1]) * a[ 7] - + ((sp_uint64)a[ 2]) * a[ 6] - + ((sp_uint64)a[ 3]) * a[ 5]) * 2 - + ((sp_uint64)a[ 4]) * a[ 4]; - sp_uint64 t9 = (((sp_uint64)a[ 0]) * a[ 9] - + ((sp_uint64)a[ 1]) * a[ 8] - + ((sp_uint64)a[ 2]) * a[ 7] - + ((sp_uint64)a[ 3]) * a[ 6] - + ((sp_uint64)a[ 4]) * a[ 5]) * 2; - sp_uint64 t10 = (((sp_uint64)a[ 0]) * a[10] - + ((sp_uint64)a[ 1]) * a[ 9] - + ((sp_uint64)a[ 2]) * a[ 8] - + ((sp_uint64)a[ 3]) * a[ 7] - + ((sp_uint64)a[ 4]) * a[ 6]) * 2 - + ((sp_uint64)a[ 5]) * a[ 5]; - sp_uint64 t11 = (((sp_uint64)a[ 0]) * a[11] - + ((sp_uint64)a[ 1]) * a[10] - + ((sp_uint64)a[ 2]) * a[ 9] - + ((sp_uint64)a[ 3]) * a[ 8] - + ((sp_uint64)a[ 4]) * a[ 7] - + ((sp_uint64)a[ 5]) * a[ 6]) * 2; - sp_uint64 t12 = (((sp_uint64)a[ 1]) * a[11] - + ((sp_uint64)a[ 2]) * a[10] - + ((sp_uint64)a[ 3]) * a[ 9] - + ((sp_uint64)a[ 4]) * a[ 8] - + ((sp_uint64)a[ 5]) * a[ 7]) * 2 - + ((sp_uint64)a[ 6]) * a[ 6]; - sp_uint64 t13 = (((sp_uint64)a[ 2]) * a[11] - + ((sp_uint64)a[ 3]) * a[10] - + ((sp_uint64)a[ 4]) * a[ 9] - + ((sp_uint64)a[ 5]) * a[ 8] - + ((sp_uint64)a[ 6]) * a[ 7]) * 2; - sp_uint64 t14 = (((sp_uint64)a[ 3]) * a[11] - + ((sp_uint64)a[ 4]) * a[10] - + ((sp_uint64)a[ 5]) * a[ 9] - + ((sp_uint64)a[ 6]) * a[ 8]) * 2 - + ((sp_uint64)a[ 7]) * a[ 7]; - sp_uint64 t15 = (((sp_uint64)a[ 4]) * a[11] - + ((sp_uint64)a[ 5]) * a[10] - + ((sp_uint64)a[ 6]) * a[ 9] - + ((sp_uint64)a[ 7]) * a[ 8]) * 2; - sp_uint64 t16 = (((sp_uint64)a[ 5]) * a[11] - + ((sp_uint64)a[ 6]) * a[10] - + ((sp_uint64)a[ 7]) * a[ 9]) * 2 - + ((sp_uint64)a[ 8]) * a[ 8]; - sp_uint64 t17 = (((sp_uint64)a[ 6]) * a[11] - + ((sp_uint64)a[ 7]) * a[10] - + ((sp_uint64)a[ 8]) * a[ 9]) * 2; - sp_uint64 t18 = (((sp_uint64)a[ 7]) * a[11] - + ((sp_uint64)a[ 8]) * a[10]) * 2 - + ((sp_uint64)a[ 9]) * a[ 9]; - sp_uint64 t19 = (((sp_uint64)a[ 8]) * a[11] - + ((sp_uint64)a[ 9]) * a[10]) * 2; - sp_uint64 t20 = (((sp_uint64)a[ 9]) * a[11]) * 2 - + ((sp_uint64)a[10]) * a[10]; - sp_uint64 t21 = (((sp_uint64)a[10]) * a[11]) * 2; - sp_uint64 t22 = ((sp_uint64)a[11]) * a[11]; - 
- t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; - t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; - t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; - t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; - t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; - t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; - t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; - t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; - t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; - t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; - t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; - t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; - t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; - t14 += t13 >> 29; r[13] = t13 & 0x1fffffff; - t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; - t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; - t17 += t16 >> 29; r[16] = t16 & 0x1fffffff; - t18 += t17 >> 29; r[17] = t17 & 0x1fffffff; - t19 += t18 >> 29; r[18] = t18 & 0x1fffffff; - t20 += t19 >> 29; r[19] = t19 & 0x1fffffff; - t21 += t20 >> 29; r[20] = t20 & 0x1fffffff; - t22 += t21 >> 29; r[21] = t21 & 0x1fffffff; - r[23] = (sp_digit)(t22 >> 29); - r[22] = t22 & 0x1fffffff; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -807,55 +695,6 @@ SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, sp_2048_norm_72(r); } -/* Square a into r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) -{ - sp_digit p0[24]; - sp_digit p1[24]; - sp_digit p2[24]; - sp_digit p3[24]; - sp_digit p4[24]; - sp_digit p5[24]; - sp_digit t0[24]; - sp_digit t1[24]; - sp_digit t2[24]; - sp_digit a0[12]; - sp_digit a1[12]; - sp_digit a2[12]; - (void)sp_2048_add_12(a0, a, &a[12]); - sp_2048_norm_12(a0); - (void)sp_2048_add_12(a1, &a[12], &a[24]); - sp_2048_norm_12(a1); - (void)sp_2048_add_12(a2, a0, &a[24]); - sp_2048_norm_12(a2); - sp_2048_sqr_12(p0, a); - sp_2048_sqr_12(p2, &a[12]); - sp_2048_sqr_12(p4, &a[24]); - sp_2048_sqr_12(p1, a0); - sp_2048_sqr_12(p3, a1); - sp_2048_sqr_12(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*36U); - (void)sp_2048_sub_24(t0, p3, p2); - (void)sp_2048_sub_24(t1, p1, p2); - (void)sp_2048_sub_24(t2, p5, t0); - (void)sp_2048_sub_24(t2, t2, t1); - sp_2048_norm_24(t2); - (void)sp_2048_sub_24(t0, t0, p4); - sp_2048_norm_24(t0); - (void)sp_2048_sub_24(t1, t1, p0); - sp_2048_norm_24(t1); - (void)sp_2048_add_24(r, r, p0); - (void)sp_2048_add_24(&r[12], &r[12], t1); - (void)sp_2048_add_24(&r[24], &r[24], t2); - (void)sp_2048_add_24(&r[36], &r[36], t0); - (void)sp_2048_add_24(&r[48], &r[48], p4); - sp_2048_norm_72(r); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -996,6 +835,167 @@ SP_NOINLINE static void sp_2048_mul_72(sp_digit* r, const sp_digit* a, sp_2048_norm_144(r); } +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_2048_sqr_12(sp_digit* r, const sp_digit* a) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; + sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; + sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 + + ((sp_uint64)a[ 1]) * a[ 1]; + sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] + + ((sp_uint64)a[ 1]) * a[ 2]) * 2; + sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] + + ((sp_uint64)a[ 1]) * a[ 3]) * 2 + + ((sp_uint64)a[ 2]) * a[ 2]; + sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] + + ((sp_uint64)a[ 1]) * a[ 4] + + ((sp_uint64)a[ 2]) * a[ 3]) * 2; + sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] + + ((sp_uint64)a[ 1]) * a[ 5] + + ((sp_uint64)a[ 2]) * a[ 4]) * 2 + + ((sp_uint64)a[ 3]) * a[ 3]; + sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] + + ((sp_uint64)a[ 1]) * a[ 6] + + ((sp_uint64)a[ 2]) * a[ 5] + + ((sp_uint64)a[ 3]) * a[ 4]) * 2; + sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] + + ((sp_uint64)a[ 1]) * a[ 7] + + ((sp_uint64)a[ 2]) * a[ 6] + + ((sp_uint64)a[ 3]) * a[ 5]) * 2 + + ((sp_uint64)a[ 4]) * a[ 4]; + sp_uint64 t9 = (((sp_uint64)a[ 0]) * a[ 9] + + ((sp_uint64)a[ 1]) * a[ 8] + + ((sp_uint64)a[ 2]) * a[ 7] + + ((sp_uint64)a[ 3]) * a[ 6] + + ((sp_uint64)a[ 4]) * a[ 5]) * 2; + sp_uint64 t10 = (((sp_uint64)a[ 0]) * a[10] + + ((sp_uint64)a[ 1]) * a[ 9] + + ((sp_uint64)a[ 2]) * a[ 8] + + ((sp_uint64)a[ 3]) * a[ 7] + + ((sp_uint64)a[ 4]) * a[ 6]) * 2 + + ((sp_uint64)a[ 5]) * a[ 5]; + sp_uint64 t11 = (((sp_uint64)a[ 0]) * a[11] + + ((sp_uint64)a[ 1]) * a[10] + + ((sp_uint64)a[ 2]) * a[ 9] + + ((sp_uint64)a[ 3]) * a[ 8] + + ((sp_uint64)a[ 4]) * a[ 7] + + ((sp_uint64)a[ 5]) * a[ 6]) * 2; + sp_uint64 t12 = (((sp_uint64)a[ 1]) * a[11] + + ((sp_uint64)a[ 2]) * a[10] + + ((sp_uint64)a[ 3]) * a[ 9] + + ((sp_uint64)a[ 4]) * a[ 8] + + ((sp_uint64)a[ 5]) * a[ 7]) * 2 + + ((sp_uint64)a[ 6]) * a[ 6]; + sp_uint64 t13 = (((sp_uint64)a[ 2]) * a[11] + + ((sp_uint64)a[ 3]) * a[10] + + ((sp_uint64)a[ 4]) * a[ 9] + + ((sp_uint64)a[ 5]) * a[ 8] + + ((sp_uint64)a[ 6]) * a[ 7]) * 2; + sp_uint64 t14 = (((sp_uint64)a[ 3]) * a[11] + + ((sp_uint64)a[ 4]) * a[10] + + ((sp_uint64)a[ 5]) * a[ 9] + + ((sp_uint64)a[ 6]) * a[ 8]) * 2 + + ((sp_uint64)a[ 7]) * a[ 7]; + sp_uint64 t15 = (((sp_uint64)a[ 4]) * a[11] + + ((sp_uint64)a[ 5]) * a[10] + + ((sp_uint64)a[ 6]) * a[ 9] + + ((sp_uint64)a[ 7]) * a[ 8]) * 2; + sp_uint64 t16 = (((sp_uint64)a[ 5]) * a[11] + + ((sp_uint64)a[ 6]) * a[10] + + ((sp_uint64)a[ 7]) * a[ 9]) * 2 + + ((sp_uint64)a[ 8]) * a[ 8]; + sp_uint64 t17 = (((sp_uint64)a[ 6]) * a[11] + + ((sp_uint64)a[ 7]) * a[10] + + ((sp_uint64)a[ 8]) * a[ 9]) * 2; + sp_uint64 t18 = (((sp_uint64)a[ 7]) * a[11] + + ((sp_uint64)a[ 8]) * a[10]) * 2 + + ((sp_uint64)a[ 9]) * a[ 9]; + sp_uint64 t19 = (((sp_uint64)a[ 8]) * a[11] + + ((sp_uint64)a[ 9]) * a[10]) * 2; + sp_uint64 t20 = (((sp_uint64)a[ 9]) * a[11]) * 2 + + ((sp_uint64)a[10]) * a[10]; + sp_uint64 t21 = (((sp_uint64)a[10]) * a[11]) * 2; + sp_uint64 t22 = ((sp_uint64)a[11]) * a[11]; + + t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; + t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; + t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; + t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; + t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; + t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; + t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; + t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; + t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; + t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; + t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; + t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; + t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; + t14 += t13 >> 29; 
r[13] = t13 & 0x1fffffff; + t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; + t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; + t17 += t16 >> 29; r[16] = t16 & 0x1fffffff; + t18 += t17 >> 29; r[17] = t17 & 0x1fffffff; + t19 += t18 >> 29; r[18] = t18 & 0x1fffffff; + t20 += t19 >> 29; r[19] = t19 & 0x1fffffff; + t21 += t20 >> 29; r[20] = t20 & 0x1fffffff; + t22 += t21 >> 29; r[21] = t21 & 0x1fffffff; + r[23] = (sp_digit)(t22 >> 29); + r[22] = t22 & 0x1fffffff; +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[24]; + sp_digit p1[24]; + sp_digit p2[24]; + sp_digit p3[24]; + sp_digit p4[24]; + sp_digit p5[24]; + sp_digit t0[24]; + sp_digit t1[24]; + sp_digit t2[24]; + sp_digit a0[12]; + sp_digit a1[12]; + sp_digit a2[12]; + (void)sp_2048_add_12(a0, a, &a[12]); + sp_2048_norm_12(a0); + (void)sp_2048_add_12(a1, &a[12], &a[24]); + sp_2048_norm_12(a1); + (void)sp_2048_add_12(a2, a0, &a[24]); + sp_2048_norm_12(a2); + sp_2048_sqr_12(p0, a); + sp_2048_sqr_12(p2, &a[12]); + sp_2048_sqr_12(p4, &a[24]); + sp_2048_sqr_12(p1, a0); + sp_2048_sqr_12(p3, a1); + sp_2048_sqr_12(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*36U); + (void)sp_2048_sub_24(t0, p3, p2); + (void)sp_2048_sub_24(t1, p1, p2); + (void)sp_2048_sub_24(t2, p5, t0); + (void)sp_2048_sub_24(t2, t2, t1); + sp_2048_norm_24(t2); + (void)sp_2048_sub_24(t0, t0, p4); + sp_2048_norm_24(t0); + (void)sp_2048_sub_24(t1, t1, p0); + sp_2048_norm_24(t1); + (void)sp_2048_add_24(r, r, p0); + (void)sp_2048_add_24(&r[12], &r[12], t1); + (void)sp_2048_add_24(&r[24], &r[24], t2); + (void)sp_2048_add_24(&r[36], &r[36], t0); + (void)sp_2048_add_24(&r[48], &r[48], p4); + sp_2048_norm_72(r); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -1750,7 +1750,7 @@ static void sp_2048_mont_reduce_36(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_36(r, a, b); @@ -1764,7 +1764,7 @@ static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_36(r, a); @@ -1824,6 +1824,7 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -1835,13 +1836,26 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 36; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ int i; for (i = 0; i < 32; i += 8) { @@ -1858,8 +1872,8 @@ static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, r[33] = a[33] + (b[33] & m); r[34] = a[34] + (b[34] & m); r[35] = a[35] + (b[35] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_2048_rshift_36(sp_digit* r, const sp_digit* a, byte n) @@ -2831,7 +2845,7 @@ static void sp_2048_mont_reduce_72(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_72(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_72(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_72(r, a, b); @@ -2845,7 +2859,7 @@ static void sp_2048_mont_mul_72(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_72(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_72(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_72(r, a); @@ -2938,6 +2952,7 @@ SP_NOINLINE static void sp_2048_mul_d_144(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -2949,16 +2964,29 @@ SP_NOINLINE static void sp_2048_mul_d_144(sp_digit* r, const sp_digit* a, static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 71; i++) { + for (i = 0; i < 72; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ int i; - for (i = 0; i < 64; i += 8) { + for (i = 0; i < 72; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -2968,15 +2996,8 @@ static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[64] = a[64] + (b[64] & m); - r[65] = a[65] + (b[65] & m); - r[66] = a[66] + (b[66] & m); - r[67] = a[67] + (b[67] & m); - r[68] = a[68] + (b[68] & m); - r[69] = a[69] + (b[69] & m); - r[70] = a[70] + (b[70] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_2048_rshift_72(sp_digit* r, const sp_digit* a, byte n) @@ -5532,7 +5553,7 @@ SP_NOINLINE static void sp_3072_mul_53(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_53(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_53(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_53(r, a, b); @@ -5610,7 +5631,7 @@ SP_NOINLINE static void sp_3072_sqr_53(sp_digit* r, const sp_digit* a) * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_3072_mont_sqr_53(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_53(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_53(r, a); @@ -5638,6 +5659,7 @@ SP_NOINLINE static void sp_3072_mul_d_53(sp_digit* r, const sp_digit* a, r[53] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -5655,6 +5677,7 @@ static void sp_3072_cond_add_53(sp_digit* r, const sp_digit* a, r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -6499,7 +6522,7 @@ static void sp_3072_mont_reduce_106(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_106(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_106(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_106(r, a, b); @@ -6513,7 +6536,7 @@ static void sp_3072_mont_mul_106(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_106(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_106(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_106(r, a); @@ -6541,6 +6564,7 @@ SP_NOINLINE static void sp_3072_mul_d_212(sp_digit* r, const sp_digit* a, r[212] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -6554,10 +6578,11 @@ static void sp_3072_cond_add_106(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 53; i++) { + for (i = 0; i < 106; i++) { r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -8867,6 +8892,341 @@ SP_NOINLINE static void sp_3072_mul_14(sp_digit* r, const sp_digit* a, r[26] = t26 & 0xfffffff; } +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_14(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + r[ 0] = a[ 0] + b[ 0]; + r[ 1] = a[ 1] + b[ 1]; + r[ 2] = a[ 2] + b[ 2]; + r[ 3] = a[ 3] + b[ 3]; + r[ 4] = a[ 4] + b[ 4]; + r[ 5] = a[ 5] + b[ 5]; + r[ 6] = a[ 6] + b[ 6]; + r[ 7] = a[ 7] + b[ 7]; + r[ 8] = a[ 8] + b[ 8]; + r[ 9] = a[ 9] + b[ 9]; + r[10] = a[10] + b[10]; + r[11] = a[11] + b[11]; + r[12] = a[12] + b[12]; + r[13] = a[13] + b[13]; + + return 0; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[24] = a[24] + b[24]; + r[25] = a[25] + b[25]; + r[26] = a[26] + b[26]; + r[27] = a[27] + b[27]; + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static int sp_3072_sub_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[24] = a[24] - b[24]; + r[25] = a[25] - b[25]; + r[26] = a[26] - b[26]; + r[27] = a[27] - b[27]; + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_14(sp_digit* a) +{ + a[1] += a[0] >> 28; a[0] &= 0xfffffff; + a[2] += a[1] >> 28; a[1] &= 0xfffffff; + a[3] += a[2] >> 28; a[2] &= 0xfffffff; + a[4] += a[3] >> 28; a[3] &= 0xfffffff; + a[5] += a[4] >> 28; a[4] &= 0xfffffff; + a[6] += a[5] >> 28; a[5] &= 0xfffffff; + a[7] += a[6] >> 28; a[6] &= 0xfffffff; + a[8] += a[7] >> 28; a[7] &= 0xfffffff; + a[9] += a[8] >> 28; a[8] &= 0xfffffff; + a[10] += a[9] >> 28; a[9] &= 0xfffffff; + a[11] += a[10] >> 28; a[10] &= 0xfffffff; + a[12] += a[11] >> 28; a[11] &= 0xfffffff; + a[13] += a[12] >> 28; a[12] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[28]; + sp_digit* a1 = z1; + sp_digit b1[14]; + sp_digit* z2 = r + 28; + (void)sp_3072_add_14(a1, a, &a[14]); + sp_3072_norm_14(a1); + (void)sp_3072_add_14(b1, b, &b[14]); + sp_3072_norm_14(b1); + sp_3072_mul_14(z2, &a[14], &b[14]); + sp_3072_mul_14(z0, a, b); + sp_3072_mul_14(z1, a1, b1); + (void)sp_3072_sub_28(z1, z1, z2); + (void)sp_3072_sub_28(z1, z1, z0); + (void)sp_3072_add_28(r + 14, r + 14, z1); + sp_3072_norm_56(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 56; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 56; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. 
+ */ +static void sp_3072_norm_28(sp_digit* a) +{ + int i; + for (i = 0; i < 24; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[25] += a[24] >> 28; a[24] &= 0xfffffff; + a[26] += a[25] >> 28; a[25] &= 0xfffffff; + a[27] += a[26] >> 28; a[26] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[56]; + sp_digit* a1 = z1; + sp_digit b1[28]; + sp_digit* z2 = r + 56; + (void)sp_3072_add_28(a1, a, &a[28]); + sp_3072_norm_28(a1); + (void)sp_3072_add_28(b1, b, &b[28]); + sp_3072_norm_28(b1); + sp_3072_mul_28(z2, &a[28], &b[28]); + sp_3072_mul_28(z0, a, b); + sp_3072_mul_28(z1, a1, b1); + (void)sp_3072_sub_56(z1, z1, z2); + (void)sp_3072_sub_56(z1, z1, z0); + (void)sp_3072_add_56(r + 28, r + 28, z1); + sp_3072_norm_112(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 112; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 112; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_224(sp_digit* a) +{ + int i; + for (i = 0; i < 216; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[217] += a[216] >> 28; a[216] &= 0xfffffff; + a[218] += a[217] >> 28; a[217] &= 0xfffffff; + a[219] += a[218] >> 28; a[218] &= 0xfffffff; + a[220] += a[219] >> 28; a[219] &= 0xfffffff; + a[221] += a[220] >> 28; a[220] &= 0xfffffff; + a[222] += a[221] >> 28; a[221] &= 0xfffffff; + a[223] += a[222] >> 28; a[222] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_mul_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[112]; + sp_digit* a1 = z1; + sp_digit b1[56]; + sp_digit* z2 = r + 112; + (void)sp_3072_add_56(a1, a, &a[56]); + sp_3072_norm_56(a1); + (void)sp_3072_add_56(b1, b, &b[56]); + sp_3072_norm_56(b1); + sp_3072_mul_56(z2, &a[56], &b[56]); + sp_3072_mul_56(z0, a, b); + sp_3072_mul_56(z1, a1, b1); + (void)sp_3072_sub_112(z1, z1, z2); + (void)sp_3072_sub_112(z1, z1, z0); + (void)sp_3072_add_112(r + 56, r + 56, z1); + sp_3072_norm_224(r); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -9010,139 +9370,6 @@ SP_NOINLINE static void sp_3072_sqr_14(sp_digit* r, const sp_digit* a) r[26] = t26 & 0xfffffff; } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_14(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - r[ 0] = a[ 0] + b[ 0]; - r[ 1] = a[ 1] + b[ 1]; - r[ 2] = a[ 2] + b[ 2]; - r[ 3] = a[ 3] + b[ 3]; - r[ 4] = a[ 4] + b[ 4]; - r[ 5] = a[ 5] + b[ 5]; - r[ 6] = a[ 6] + b[ 6]; - r[ 7] = a[ 7] + b[ 7]; - r[ 8] = a[ 8] + b[ 8]; - r[ 9] = a[ 9] + b[ 9]; - r[10] = a[10] + b[10]; - r[11] = a[11] + b[11]; - r[12] = a[12] + b[12]; - r[13] = a[13] + b[13]; - - return 0; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_28(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 24; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[24] = a[24] + b[24]; - r[25] = a[25] + b[25]; - r[26] = a[26] + b[26]; - r[27] = a[27] + b[27]; - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_28(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 24; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[24] = a[24] - b[24]; - r[25] = a[25] - b[25]; - r[26] = a[26] - b[26]; - r[27] = a[27] - b[27]; - - return 0; -} - -/* Normalize the values in each word to 28 bits. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_14(sp_digit* a) -{ - a[1] += a[0] >> 28; a[0] &= 0xfffffff; - a[2] += a[1] >> 28; a[1] &= 0xfffffff; - a[3] += a[2] >> 28; a[2] &= 0xfffffff; - a[4] += a[3] >> 28; a[3] &= 0xfffffff; - a[5] += a[4] >> 28; a[4] &= 0xfffffff; - a[6] += a[5] >> 28; a[5] &= 0xfffffff; - a[7] += a[6] >> 28; a[6] &= 0xfffffff; - a[8] += a[7] >> 28; a[7] &= 0xfffffff; - a[9] += a[8] >> 28; a[8] &= 0xfffffff; - a[10] += a[9] >> 28; a[9] &= 0xfffffff; - a[11] += a[10] >> 28; a[10] &= 0xfffffff; - a[12] += a[11] >> 28; a[11] &= 0xfffffff; - a[13] += a[12] >> 28; a[12] &= 0xfffffff; -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. 
- * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_28(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[28]; - sp_digit* a1 = z1; - sp_digit b1[14]; - sp_digit* z2 = r + 28; - (void)sp_3072_add_14(a1, a, &a[14]); - sp_3072_norm_14(a1); - (void)sp_3072_add_14(b1, b, &b[14]); - sp_3072_norm_14(b1); - sp_3072_mul_14(z2, &a[14], &b[14]); - sp_3072_mul_14(z0, a, b); - sp_3072_mul_14(z1, a1, b1); - (void)sp_3072_sub_28(z1, z1, z2); - (void)sp_3072_sub_28(z1, z1, z0); - (void)sp_3072_add_28(r + 14, r + 14, z1); - sp_3072_norm_56(r); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -9165,105 +9392,6 @@ SP_NOINLINE static void sp_3072_sqr_28(sp_digit* r, const sp_digit* a) sp_3072_norm_56(r); } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_56(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 56; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_56(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 56; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - - return 0; -} - -/* Normalize the values in each word to 28 bits. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_28(sp_digit* a) -{ - int i; - for (i = 0; i < 24; i += 8) { - a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; - a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; - a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; - a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; - a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; - a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; - a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; - a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; - } - a[25] += a[24] >> 28; a[24] &= 0xfffffff; - a[26] += a[25] >> 28; a[25] &= 0xfffffff; - a[27] += a[26] >> 28; a[26] &= 0xfffffff; -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_56(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[56]; - sp_digit* a1 = z1; - sp_digit b1[28]; - sp_digit* z2 = r + 56; - (void)sp_3072_add_28(a1, a, &a[28]); - sp_3072_norm_28(a1); - (void)sp_3072_add_28(b1, b, &b[28]); - sp_3072_norm_28(b1); - sp_3072_mul_28(z2, &a[28], &b[28]); - sp_3072_mul_28(z0, a, b); - sp_3072_mul_28(z1, a1, b1); - (void)sp_3072_sub_56(z1, z1, z2); - (void)sp_3072_sub_56(z1, z1, z0); - (void)sp_3072_add_56(r + 28, r + 28, z1); - sp_3072_norm_112(r); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -9286,109 +9414,6 @@ SP_NOINLINE static void sp_3072_sqr_56(sp_digit* r, const sp_digit* a) sp_3072_norm_112(r); } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_112(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 112; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_112(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 112; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - - return 0; -} - -/* Normalize the values in each word to 28 bits. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_224(sp_digit* a) -{ - int i; - for (i = 0; i < 216; i += 8) { - a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; - a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; - a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; - a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; - a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; - a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; - a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; - a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; - } - a[217] += a[216] >> 28; a[216] &= 0xfffffff; - a[218] += a[217] >> 28; a[217] &= 0xfffffff; - a[219] += a[218] >> 28; a[218] &= 0xfffffff; - a[220] += a[219] >> 28; a[219] &= 0xfffffff; - a[221] += a[220] >> 28; a[220] &= 0xfffffff; - a[222] += a[221] >> 28; a[221] &= 0xfffffff; - a[223] += a[222] >> 28; a[222] &= 0xfffffff; -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_112(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[112]; - sp_digit* a1 = z1; - sp_digit b1[56]; - sp_digit* z2 = r + 112; - (void)sp_3072_add_56(a1, a, &a[56]); - sp_3072_norm_56(a1); - (void)sp_3072_add_56(b1, b, &b[56]); - sp_3072_norm_56(b1); - sp_3072_mul_56(z2, &a[56], &b[56]); - sp_3072_mul_56(z0, a, b); - sp_3072_mul_56(z1, a1, b1); - (void)sp_3072_sub_112(z1, z1, z2); - (void)sp_3072_sub_112(z1, z1, z0); - (void)sp_3072_add_112(r + 56, r + 56, z1); - sp_3072_norm_224(r); -} - /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -9717,7 +9742,7 @@ static void sp_3072_mont_reduce_56(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_56(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_56(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_56(r, a, b); @@ -9731,7 +9756,7 @@ static void sp_3072_mont_mul_56(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
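The wrappers being marked SP_NOINLINE in these hunks (sp_3072_mont_mul_56, sp_3072_mont_sqr_56 and the larger variants later on) are all "multiply or square, then one Montgomery reduction by m using the precomputed mp". What the reduction computes is easiest to see at single-word scale; the sketch below uses a full 32-bit word rather than the 28-bit limbs of this file, and assumes m is odd and below 2^31 so the intermediate sum fits in 64 bits:

#include <stdint.h>

/* Word-level Montgomery reduction (REDC): returns t * 2^-32 mod m for t < m * 2^32,
 * where mp = -m^-1 mod 2^32 (m odd).  The multi-limb sp_*_mont_reduce_* routines
 * apply the same idea one limb at a time. */
static uint32_t mont_redc32(uint64_t t, uint32_t m, uint32_t mp)
{
    uint32_t u = (uint32_t)t * mp;              /* u = -t * m^-1 mod 2^32        */
    uint64_t s = (t + (uint64_t)u * m) >> 32;   /* t + u*m is divisible by 2^32  */
    if (s >= m)                                 /* the generated code uses a     */
        s -= m;                                 /* masked subtract, not a branch */
    return (uint32_t)s;
}

In the generated code, mont_mul_* is exactly mul_* followed by mont_reduce_*, and mont_sqr_* is sqr_* followed by the same reduction; the hunks here only add SP_NOINLINE to those wrappers.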
*/ -static void sp_3072_mont_sqr_56(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_56(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_56(r, a); @@ -9778,6 +9803,7 @@ SP_NOINLINE static void sp_3072_mul_d_56(sp_digit* r, const sp_digit* a, r[56] = (sp_digit)(t & 0xfffffff); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -9791,7 +9817,7 @@ static void sp_3072_cond_add_56(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 48; i += 8) { + for (i = 0; i < 56; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -9801,14 +9827,8 @@ static void sp_3072_cond_add_56(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[48] = a[48] + (b[48] & m); - r[49] = a[49] + (b[49] & m); - r[50] = a[50] + (b[50] & m); - r[51] = a[51] + (b[51] & m); - r[52] = a[52] + (b[52] & m); - r[53] = a[53] + (b[53] & m); - r[54] = a[54] + (b[54] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_3072_rshift_56(sp_digit* r, const sp_digit* a, byte n) @@ -10666,7 +10686,7 @@ static void sp_3072_mont_reduce_112(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_112(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_112(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_112(r, a, b); @@ -10680,7 +10700,7 @@ static void sp_3072_mont_mul_112(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_112(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_112(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_112(r, a); @@ -10727,6 +10747,7 @@ SP_NOINLINE static void sp_3072_mul_d_224(sp_digit* r, const sp_digit* a, r[224] = (sp_digit)(t & 0xfffffff); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -10740,7 +10761,7 @@ static void sp_3072_cond_add_112(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 104; i += 8) { + for (i = 0; i < 112; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -10750,13 +10771,8 @@ static void sp_3072_cond_add_112(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[104] = a[104] + (b[104] & m); - r[105] = a[105] + (b[105] & m); - r[106] = a[106] + (b[106] & m); - r[107] = a[107] + (b[107] & m); - r[108] = a[108] + (b[108] & m); - r[109] = a[109] + (b[109] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_3072_rshift_112(sp_digit* r, const sp_digit* a, byte n) @@ -13348,7 +13364,7 @@ SP_NOINLINE static void sp_4096_mul_71(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_71(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_71(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_71(r, a, b); @@ -13426,7 +13442,7 @@ SP_NOINLINE static void sp_4096_sqr_71(sp_digit* r, const sp_digit* a) * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_4096_mont_sqr_71(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_71(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_71(r, a); @@ -13454,6 +13470,7 @@ SP_NOINLINE static void sp_4096_mul_d_71(sp_digit* r, const sp_digit* a, r[71] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -13471,6 +13488,7 @@ static void sp_4096_cond_add_71(sp_digit* r, const sp_digit* a, r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -14316,7 +14334,7 @@ static void sp_4096_mont_reduce_142(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_142(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_142(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_142(r, a, b); @@ -14330,7 +14348,7 @@ static void sp_4096_mont_mul_142(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_142(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_142(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_142(r, a); @@ -14358,6 +14376,7 @@ SP_NOINLINE static void sp_4096_mul_d_284(sp_digit* r, const sp_digit* a, r[284] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -14371,10 +14390,11 @@ static void sp_4096_cond_add_142(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 71; i++) { + for (i = 0; i < 142; i++) { r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -16406,79 +16426,6 @@ SP_NOINLINE static void sp_4096_mul_9(sp_digit* r, const sp_digit* a, r[16] = t16 & 0x3ffffff; } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_sqr_9(sp_digit* r, const sp_digit* a) -{ - sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; - sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; - sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 - + ((sp_uint64)a[ 1]) * a[ 1]; - sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] - + ((sp_uint64)a[ 1]) * a[ 2]) * 2; - sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] - + ((sp_uint64)a[ 1]) * a[ 3]) * 2 - + ((sp_uint64)a[ 2]) * a[ 2]; - sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] - + ((sp_uint64)a[ 1]) * a[ 4] - + ((sp_uint64)a[ 2]) * a[ 3]) * 2; - sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] - + ((sp_uint64)a[ 1]) * a[ 5] - + ((sp_uint64)a[ 2]) * a[ 4]) * 2 - + ((sp_uint64)a[ 3]) * a[ 3]; - sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] - + ((sp_uint64)a[ 1]) * a[ 6] - + ((sp_uint64)a[ 2]) * a[ 5] - + ((sp_uint64)a[ 3]) * a[ 4]) * 2; - sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] - + ((sp_uint64)a[ 1]) * a[ 7] - + ((sp_uint64)a[ 2]) * a[ 6] - + ((sp_uint64)a[ 3]) * a[ 5]) * 2 - + ((sp_uint64)a[ 4]) * a[ 4]; - sp_uint64 t9 = (((sp_uint64)a[ 1]) * a[ 8] - + ((sp_uint64)a[ 2]) * a[ 7] - + ((sp_uint64)a[ 3]) * a[ 6] - + ((sp_uint64)a[ 4]) * a[ 5]) * 2; - sp_uint64 t10 = (((sp_uint64)a[ 2]) * a[ 8] - + ((sp_uint64)a[ 3]) * a[ 7] - + ((sp_uint64)a[ 4]) * a[ 6]) * 2 - + ((sp_uint64)a[ 5]) * a[ 5]; - sp_uint64 t11 = (((sp_uint64)a[ 3]) * a[ 8] - + ((sp_uint64)a[ 4]) * a[ 7] - + ((sp_uint64)a[ 5]) * a[ 6]) * 2; - sp_uint64 t12 = (((sp_uint64)a[ 4]) * a[ 8] - + ((sp_uint64)a[ 5]) * a[ 7]) * 2 - + ((sp_uint64)a[ 6]) * a[ 6]; - sp_uint64 t13 = (((sp_uint64)a[ 5]) * a[ 8] - + ((sp_uint64)a[ 6]) * a[ 7]) * 2; - sp_uint64 t14 = (((sp_uint64)a[ 6]) * a[ 8]) * 2 - + ((sp_uint64)a[ 7]) * a[ 7]; - sp_uint64 t15 = (((sp_uint64)a[ 7]) * a[ 8]) * 2; - sp_uint64 t16 = ((sp_uint64)a[ 8]) * a[ 8]; - - t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; - t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; - t3 += t2 >> 26; r[ 2] = t2 & 0x3ffffff; - t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; - t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; - t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; - t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; - t8 += t7 >> 26; r[ 7] = t7 & 0x3ffffff; - t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; - t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; - t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; - t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; - t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; - t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; - t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; - t16 += t15 >> 26; r[15] = t15 & 0x3ffffff; - r[17] = (sp_digit)(t16 >> 26); - r[16] = t16 & 0x3ffffff; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -16675,55 +16622,6 @@ SP_NOINLINE static void sp_4096_mul_27(sp_digit* r, const sp_digit* a, sp_4096_norm_54(r); } -/* Square a into r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_sqr_27(sp_digit* r, const sp_digit* a) -{ - sp_digit p0[18]; - sp_digit p1[18]; - sp_digit p2[18]; - sp_digit p3[18]; - sp_digit p4[18]; - sp_digit p5[18]; - sp_digit t0[18]; - sp_digit t1[18]; - sp_digit t2[18]; - sp_digit a0[9]; - sp_digit a1[9]; - sp_digit a2[9]; - (void)sp_4096_add_9(a0, a, &a[9]); - sp_4096_norm_9(a0); - (void)sp_4096_add_9(a1, &a[9], &a[18]); - sp_4096_norm_9(a1); - (void)sp_4096_add_9(a2, a0, &a[18]); - sp_4096_norm_9(a2); - sp_4096_sqr_9(p0, a); - sp_4096_sqr_9(p2, &a[9]); - sp_4096_sqr_9(p4, &a[18]); - sp_4096_sqr_9(p1, a0); - sp_4096_sqr_9(p3, a1); - sp_4096_sqr_9(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*27U); - (void)sp_4096_sub_18(t0, p3, p2); - (void)sp_4096_sub_18(t1, p1, p2); - (void)sp_4096_sub_18(t2, p5, t0); - (void)sp_4096_sub_18(t2, t2, t1); - sp_4096_norm_18(t2); - (void)sp_4096_sub_18(t0, t0, p4); - sp_4096_norm_18(t0); - (void)sp_4096_sub_18(t1, t1, p0); - sp_4096_norm_18(t1); - (void)sp_4096_add_18(r, r, p0); - (void)sp_4096_add_18(&r[9], &r[9], t1); - (void)sp_4096_add_18(&r[18], &r[18], t2); - (void)sp_4096_add_18(&r[27], &r[27], t0); - (void)sp_4096_add_18(&r[36], &r[36], p4); - sp_4096_norm_54(r); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -16895,55 +16793,6 @@ SP_NOINLINE static void sp_4096_mul_81(sp_digit* r, const sp_digit* a, sp_4096_norm_162(r); } -/* Square a into r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_81(sp_digit* r, const sp_digit* a) -{ - sp_digit p0[54]; - sp_digit p1[54]; - sp_digit p2[54]; - sp_digit p3[54]; - sp_digit p4[54]; - sp_digit p5[54]; - sp_digit t0[54]; - sp_digit t1[54]; - sp_digit t2[54]; - sp_digit a0[27]; - sp_digit a1[27]; - sp_digit a2[27]; - (void)sp_4096_add_27(a0, a, &a[27]); - sp_4096_norm_27(a0); - (void)sp_4096_add_27(a1, &a[27], &a[54]); - sp_4096_norm_27(a1); - (void)sp_4096_add_27(a2, a0, &a[54]); - sp_4096_norm_27(a2); - sp_4096_sqr_27(p0, a); - sp_4096_sqr_27(p2, &a[27]); - sp_4096_sqr_27(p4, &a[54]); - sp_4096_sqr_27(p1, a0); - sp_4096_sqr_27(p3, a1); - sp_4096_sqr_27(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*81U); - (void)sp_4096_sub_54(t0, p3, p2); - (void)sp_4096_sub_54(t1, p1, p2); - (void)sp_4096_sub_54(t2, p5, t0); - (void)sp_4096_sub_54(t2, t2, t1); - sp_4096_norm_54(t2); - (void)sp_4096_sub_54(t0, t0, p4); - sp_4096_norm_54(t0); - (void)sp_4096_sub_54(t1, t1, p0); - sp_4096_norm_54(t1); - (void)sp_4096_add_54(r, r, p0); - (void)sp_4096_add_54(&r[27], &r[27], t1); - (void)sp_4096_add_54(&r[54], &r[54], t2); - (void)sp_4096_add_54(&r[81], &r[81], t0); - (void)sp_4096_add_54(&r[108], &r[108], p4); - sp_4096_norm_162(r); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -17073,6 +16922,177 @@ SP_NOINLINE static void sp_4096_mul_162(sp_digit* r, const sp_digit* a, sp_4096_norm_324(r); } +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
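The sp_4096_sqr_9 re-added below writes out, column by column, a squaring over 26-bit limbs: column k collects every cross product a[i]*a[j] with i + j == k, doubled, plus the square term a[k/2]*a[k/2] when k is even. A loop form of the same pattern, with carries folded in per column rather than in the single pass the generated code uses, and with illustrative names:

#include <stdint.h>

/* r[0..2n-1] = a[0..n-1]^2 for 26-bit limbs, column by column. */
static void sqr_cols(uint32_t* r, const uint32_t* a, int n)
{
    int i, k;
    uint64_t t, carry = 0;

    for (k = 0; k <= 2 * (n - 1); k++) {
        t = carry;
        for (i = (k < n) ? 0 : k - n + 1; i <= k / 2; i++) {
            uint64_t p = (uint64_t)a[i] * a[k - i];
            t += (i == k - i) ? p : 2 * p;      /* square term once, cross terms doubled */
        }
        r[k]  = (uint32_t)(t & 0x3ffffff);      /* keep 26 bits per output limb */
        carry = t >> 26;
    }
    r[2 * n - 1] = (uint32_t)carry;
}

sp_4096_sqr_9 is this with n = 9 and every column written out as a separate t0..t16 term.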
+ */ +SP_NOINLINE static void sp_4096_sqr_9(sp_digit* r, const sp_digit* a) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; + sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; + sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 + + ((sp_uint64)a[ 1]) * a[ 1]; + sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] + + ((sp_uint64)a[ 1]) * a[ 2]) * 2; + sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] + + ((sp_uint64)a[ 1]) * a[ 3]) * 2 + + ((sp_uint64)a[ 2]) * a[ 2]; + sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] + + ((sp_uint64)a[ 1]) * a[ 4] + + ((sp_uint64)a[ 2]) * a[ 3]) * 2; + sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] + + ((sp_uint64)a[ 1]) * a[ 5] + + ((sp_uint64)a[ 2]) * a[ 4]) * 2 + + ((sp_uint64)a[ 3]) * a[ 3]; + sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] + + ((sp_uint64)a[ 1]) * a[ 6] + + ((sp_uint64)a[ 2]) * a[ 5] + + ((sp_uint64)a[ 3]) * a[ 4]) * 2; + sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] + + ((sp_uint64)a[ 1]) * a[ 7] + + ((sp_uint64)a[ 2]) * a[ 6] + + ((sp_uint64)a[ 3]) * a[ 5]) * 2 + + ((sp_uint64)a[ 4]) * a[ 4]; + sp_uint64 t9 = (((sp_uint64)a[ 1]) * a[ 8] + + ((sp_uint64)a[ 2]) * a[ 7] + + ((sp_uint64)a[ 3]) * a[ 6] + + ((sp_uint64)a[ 4]) * a[ 5]) * 2; + sp_uint64 t10 = (((sp_uint64)a[ 2]) * a[ 8] + + ((sp_uint64)a[ 3]) * a[ 7] + + ((sp_uint64)a[ 4]) * a[ 6]) * 2 + + ((sp_uint64)a[ 5]) * a[ 5]; + sp_uint64 t11 = (((sp_uint64)a[ 3]) * a[ 8] + + ((sp_uint64)a[ 4]) * a[ 7] + + ((sp_uint64)a[ 5]) * a[ 6]) * 2; + sp_uint64 t12 = (((sp_uint64)a[ 4]) * a[ 8] + + ((sp_uint64)a[ 5]) * a[ 7]) * 2 + + ((sp_uint64)a[ 6]) * a[ 6]; + sp_uint64 t13 = (((sp_uint64)a[ 5]) * a[ 8] + + ((sp_uint64)a[ 6]) * a[ 7]) * 2; + sp_uint64 t14 = (((sp_uint64)a[ 6]) * a[ 8]) * 2 + + ((sp_uint64)a[ 7]) * a[ 7]; + sp_uint64 t15 = (((sp_uint64)a[ 7]) * a[ 8]) * 2; + sp_uint64 t16 = ((sp_uint64)a[ 8]) * a[ 8]; + + t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; + t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; + t3 += t2 >> 26; r[ 2] = t2 & 0x3ffffff; + t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; + t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; + t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; + t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; + t8 += t7 >> 26; r[ 7] = t7 & 0x3ffffff; + t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; + t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; + t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; + t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; + t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; + t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; + t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; + t16 += t15 >> 26; r[15] = t15 & 0x3ffffff; + r[17] = (sp_digit)(t16 >> 26); + r[16] = t16 & 0x3ffffff; +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
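sp_4096_sqr_27 below (and sp_4096_sqr_81 built on it) square via a three-way split of a = a0 + a1*B + a2*B^2 into six half-size squarings. The recombination identity can be checked with ordinary integers; every name in the sketch is local to it:

#include <assert.h>
#include <stdint.h>

/* Verify the recombination used by sp_4096_sqr_27/_81 on plain integers:
 *   p0=a0^2  p2=a1^2  p4=a2^2  p1=(a0+a1)^2  p3=(a1+a2)^2  p5=(a0+a1+a2)^2
 *   t0 = p3 - p2 - p4 = 2*a1*a2
 *   t1 = p1 - p2 - p0 = 2*a0*a1
 *   t2 = p5 - (p3 - p2) - (p1 - p2) = a1^2 + 2*a0*a2
 *   a^2 = p0 + t1*B + t2*B^2 + t0*B^3 + p4*B^4 */
static void check_three_way_sqr(int64_t a0, int64_t a1, int64_t a2)
{
    int64_t p0 = a0 * a0, p2 = a1 * a1, p4 = a2 * a2;
    int64_t p1 = (a0 + a1) * (a0 + a1);
    int64_t p3 = (a1 + a2) * (a1 + a2);
    int64_t p5 = (a0 + a1 + a2) * (a0 + a1 + a2);
    int64_t t0 = p3 - p2;                  /* 2*a1*a2 + a2^2 */
    int64_t t1 = p1 - p2;                  /* a0^2 + 2*a0*a1 */
    int64_t t2 = p5 - t0 - t1;             /* a1^2 + 2*a0*a2 */

    t0 -= p4;                              /* 2*a1*a2 */
    t1 -= p0;                              /* 2*a0*a1 */
    assert(t0 == 2 * a1 * a2);
    assert(t1 == 2 * a0 * a1);
    assert(t2 == a1 * a1 + 2 * a0 * a2);
}

The function that follows performs the same sequence of sp_4096_sub_18/add_18 calls on the 18-limb partial squares, with sp_4096_norm_18 keeping intermediate limbs in range.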
+ */ +SP_NOINLINE static void sp_4096_sqr_27(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + (void)sp_4096_add_9(a0, a, &a[9]); + sp_4096_norm_9(a0); + (void)sp_4096_add_9(a1, &a[9], &a[18]); + sp_4096_norm_9(a1); + (void)sp_4096_add_9(a2, a0, &a[18]); + sp_4096_norm_9(a2); + sp_4096_sqr_9(p0, a); + sp_4096_sqr_9(p2, &a[9]); + sp_4096_sqr_9(p4, &a[18]); + sp_4096_sqr_9(p1, a0); + sp_4096_sqr_9(p3, a1); + sp_4096_sqr_9(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_4096_sub_18(t0, p3, p2); + (void)sp_4096_sub_18(t1, p1, p2); + (void)sp_4096_sub_18(t2, p5, t0); + (void)sp_4096_sub_18(t2, t2, t1); + sp_4096_norm_18(t2); + (void)sp_4096_sub_18(t0, t0, p4); + sp_4096_norm_18(t0); + (void)sp_4096_sub_18(t1, t1, p0); + sp_4096_norm_18(t1); + (void)sp_4096_add_18(r, r, p0); + (void)sp_4096_add_18(&r[9], &r[9], t1); + (void)sp_4096_add_18(&r[18], &r[18], t2); + (void)sp_4096_add_18(&r[27], &r[27], t0); + (void)sp_4096_add_18(&r[36], &r[36], p4); + sp_4096_norm_54(r); +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_81(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[54]; + sp_digit p1[54]; + sp_digit p2[54]; + sp_digit p3[54]; + sp_digit p4[54]; + sp_digit p5[54]; + sp_digit t0[54]; + sp_digit t1[54]; + sp_digit t2[54]; + sp_digit a0[27]; + sp_digit a1[27]; + sp_digit a2[27]; + (void)sp_4096_add_27(a0, a, &a[27]); + sp_4096_norm_27(a0); + (void)sp_4096_add_27(a1, &a[27], &a[54]); + sp_4096_norm_27(a1); + (void)sp_4096_add_27(a2, a0, &a[54]); + sp_4096_norm_27(a2); + sp_4096_sqr_27(p0, a); + sp_4096_sqr_27(p2, &a[27]); + sp_4096_sqr_27(p4, &a[54]); + sp_4096_sqr_27(p1, a0); + sp_4096_sqr_27(p3, a1); + sp_4096_sqr_27(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*81U); + (void)sp_4096_sub_54(t0, p3, p2); + (void)sp_4096_sub_54(t1, p1, p2); + (void)sp_4096_sub_54(t2, p5, t0); + (void)sp_4096_sub_54(t2, t2, t1); + sp_4096_norm_54(t2); + (void)sp_4096_sub_54(t0, t0, p4); + sp_4096_norm_54(t0); + (void)sp_4096_sub_54(t1, t1, p0); + sp_4096_norm_54(t1); + (void)sp_4096_add_54(r, r, p0); + (void)sp_4096_add_54(&r[27], &r[27], t1); + (void)sp_4096_add_54(&r[54], &r[54], t2); + (void)sp_4096_add_54(&r[81], &r[81], t0); + (void)sp_4096_add_54(&r[108], &r[108], p4); + sp_4096_norm_162(r); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -17416,7 +17436,7 @@ static void sp_4096_mont_reduce_81(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_81(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_81(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_81(r, a, b); @@ -17430,7 +17450,7 @@ static void sp_4096_mont_mul_81(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_81(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_81(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_81(r, a); @@ -17480,6 +17500,7 @@ SP_NOINLINE static void sp_4096_mul_d_81(sp_digit* r, const sp_digit* a, r[81] = (sp_digit)(t & 0x3ffffff); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. 
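The cond_add_* hunks near here, and the WOLFSSL_SP_SMALL reshuffling of the sp_256/sp_384/sp_521/sp_1024 variants further down, all keep the same branch-free idiom: the mask m is either 0 or all ones, so ANDing it with b selects between adding b and adding nothing without a data-dependent branch. A minimal sketch with an illustrative limb type:

#include <stdint.h>

typedef int32_t limb;                      /* stands in for sp_digit in this sketch */

/* r = a + (m ? b : 0) without branching; m is expected to be 0 or -1 (all ones). */
static void cond_add(limb* r, const limb* a, const limb* b, limb m, int limbs)
{
    int i;
    for (i = 0; i < limbs; i++) {
        r[i] = a[i] + (b[i] & m);          /* b[i] & -1 == b[i], b[i] & 0 == 0 */
    }
}

The unrolled non-SMALL versions handle eight limbs per iteration; the loop-bound changes in this patch (48 to 56, 72 to 80, and so on) extend that eight-at-a-time loop so only the last stray limbs, if any, are handled separately.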
* m is -1 to add and 0 when not. * @@ -17493,7 +17514,7 @@ static void sp_4096_cond_add_81(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 72; i += 8) { + for (i = 0; i < 80; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -17503,14 +17524,9 @@ static void sp_4096_cond_add_81(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[72] = a[72] + (b[72] & m); - r[73] = a[73] + (b[73] & m); - r[74] = a[74] + (b[74] & m); - r[75] = a[75] + (b[75] & m); - r[76] = a[76] + (b[76] & m); - r[77] = a[77] + (b[77] & m); - r[78] = a[78] + (b[78] & m); + r[80] = a[80] + (b[80] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_4096_rshift_81(sp_digit* r, const sp_digit* a, byte n) @@ -18319,7 +18335,7 @@ static void sp_4096_mont_reduce_162(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_162(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_162(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_162(r, a, b); @@ -18333,7 +18349,7 @@ static void sp_4096_mont_mul_162(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_162(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_162(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_162(r, a); @@ -18380,6 +18396,7 @@ SP_NOINLINE static void sp_4096_mul_d_324(sp_digit* r, const sp_digit* a, r[324] = (sp_digit)(t & 0x3ffffff); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -18393,7 +18410,7 @@ static void sp_4096_cond_add_162(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 152; i += 8) { + for (i = 0; i < 160; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -18403,13 +18420,10 @@ static void sp_4096_cond_add_162(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[152] = a[152] + (b[152] & m); - r[153] = a[153] + (b[153] & m); - r[154] = a[154] + (b[154] & m); - r[155] = a[155] + (b[155] & m); - r[156] = a[156] + (b[156] & m); - r[157] = a[157] + (b[157] & m); + r[160] = a[160] + (b[160] & m); + r[161] = a[161] + (b[161] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_4096_rshift_162(sp_digit* r, const sp_digit* a, byte n) @@ -21270,7 +21284,7 @@ static void sp_256_mont_reduce_9(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_mul_9(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_256_mul_9(r, a, b); @@ -21284,7 +21298,7 @@ static void sp_256_mont_mul_9(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_sqr_9(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_sqr_9(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_256_sqr_9(r, a); @@ -21482,6 +21496,7 @@ static void sp_256_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) sp_256_norm_9(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. 
* m is -1 to add and 0 when not. * @@ -21493,13 +21508,26 @@ static void sp_256_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 9; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ r[ 0] = a[ 0] + (b[ 0] & m); r[ 1] = a[ 1] + (b[ 1] & m); r[ 2] = a[ 2] + (b[ 2] & m); @@ -21509,8 +21537,8 @@ static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a, r[ 6] = a[ 6] + (b[ 6] & m); r[ 7] = a[ 7] + (b[ 7] & m); r[ 8] = a[ 8] + (b[ 8] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -28489,7 +28517,7 @@ static void sp_384_mont_reduce_15(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_mul_15(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_15(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_15(r, a, b); @@ -28503,7 +28531,7 @@ static void sp_384_mont_mul_15(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_15(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_15(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_15(r, a); @@ -28717,6 +28745,7 @@ static void sp_384_mont_tpl_15(sp_digit* r, const sp_digit* a, const sp_digit* m sp_384_norm_15(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -28728,13 +28757,26 @@ static void sp_384_mont_tpl_15(sp_digit* r, const sp_digit* a, const sp_digit* m static void sp_384_cond_add_15(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 15; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_384_cond_add_15(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ r[ 0] = a[ 0] + (b[ 0] & m); r[ 1] = a[ 1] + (b[ 1] & m); r[ 2] = a[ 2] + (b[ 2] & m); @@ -28750,8 +28792,8 @@ static void sp_384_cond_add_15(sp_digit* r, const sp_digit* a, r[12] = a[12] + (b[12] & m); r[13] = a[13] + (b[13] & m); r[14] = a[14] + (b[14] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -35985,7 +36027,7 @@ static void sp_521_mont_reduce_order_21(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_521_mont_mul_21(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_mul_21(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_521_mul_21(r, a, b); @@ -35999,7 +36041,7 @@ static void sp_521_mont_mul_21(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_sqr_21(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_sqr_21(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_521_sqr_21(r, a); @@ -36210,6 +36252,7 @@ static void sp_521_mont_tpl_21(sp_digit* r, const sp_digit* a, const sp_digit* m sp_521_norm_21(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -36221,13 +36264,26 @@ static void sp_521_mont_tpl_21(sp_digit* r, const sp_digit* a, const sp_digit* m static void sp_521_cond_add_21(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 21; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_521_cond_add_21(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ int i; for (i = 0; i < 16; i += 8) { @@ -36245,8 +36301,8 @@ static void sp_521_cond_add_21(sp_digit* r, const sp_digit* a, r[18] = a[18] + (b[18] & m); r[19] = a[19] + (b[19] & m); r[20] = a[20] + (b[20] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -43830,6 +43886,7 @@ SP_NOINLINE static void sp_1024_mul_d_84(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -43841,13 +43898,26 @@ SP_NOINLINE static void sp_1024_mul_d_84(sp_digit* r, const sp_digit* a, static void sp_1024_cond_add_42(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 41; i++) { + for (i = 0; i < 42; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_1024_cond_add_42(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ int i; for (i = 0; i < 40; i += 8) { @@ -43862,8 +43932,8 @@ static void sp_1024_cond_add_42(sp_digit* r, const sp_digit* a, } r[40] = a[40] + (b[40] & m); r[41] = a[41] + (b[41] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Sub b from a into r. (r = a - b) @@ -44618,7 +44688,7 @@ static void sp_1024_mont_reduce_42(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_1024_mont_mul_42(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_42(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_42(r, a, b); @@ -44632,7 +44702,7 @@ static void sp_1024_mont_mul_42(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_sqr_42(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_42(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_42(r, a); diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index dcf495917..4512640fe 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -667,7 +667,7 @@ SP_NOINLINE static void sp_2048_mul_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_17(r, a, b); @@ -745,7 +745,7 @@ SP_NOINLINE static void sp_2048_sqr_17(sp_digit* r, const sp_digit* a) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_17(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_17(r, a); @@ -773,6 +773,7 @@ SP_NOINLINE static void sp_2048_mul_d_17(sp_digit* r, const sp_digit* a, r[17] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -790,6 +791,7 @@ static void sp_2048_cond_add_17(sp_digit* r, const sp_digit* a, r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -1748,7 +1750,7 @@ static void sp_2048_mont_reduce_34(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_34(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_34(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_34(r, a, b); @@ -1762,7 +1764,7 @@ static void sp_2048_mont_mul_34(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_34(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_34(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_34(r, a); @@ -1790,6 +1792,7 @@ SP_NOINLINE static void sp_2048_mul_d_68(sp_digit* r, const sp_digit* a, r[68] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -1803,10 +1806,11 @@ static void sp_2048_cond_add_34(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 17; i++) { + for (i = 0; i < 34; i++) { r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -4061,79 +4065,6 @@ SP_NOINLINE static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, r[16] = t16 & 0x1ffffffffffffffL; } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a) -{ - sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; - sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; - sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 - + ((sp_uint128)a[ 1]) * a[ 1]; - sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] - + ((sp_uint128)a[ 1]) * a[ 2]) * 2; - sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] - + ((sp_uint128)a[ 1]) * a[ 3]) * 2 - + ((sp_uint128)a[ 2]) * a[ 2]; - sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] - + ((sp_uint128)a[ 1]) * a[ 4] - + ((sp_uint128)a[ 2]) * a[ 3]) * 2; - sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] - + ((sp_uint128)a[ 1]) * a[ 5] - + ((sp_uint128)a[ 2]) * a[ 4]) * 2 - + ((sp_uint128)a[ 3]) * a[ 3]; - sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] - + ((sp_uint128)a[ 1]) * a[ 6] - + ((sp_uint128)a[ 2]) * a[ 5] - + ((sp_uint128)a[ 3]) * a[ 4]) * 2; - sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] - + ((sp_uint128)a[ 1]) * a[ 7] - + ((sp_uint128)a[ 2]) * a[ 6] - + ((sp_uint128)a[ 3]) * a[ 5]) * 2 - + ((sp_uint128)a[ 4]) * a[ 4]; - sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] - + ((sp_uint128)a[ 2]) * a[ 7] - + ((sp_uint128)a[ 3]) * a[ 6] - + ((sp_uint128)a[ 4]) * a[ 5]) * 2; - sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] - + ((sp_uint128)a[ 3]) * a[ 7] - + ((sp_uint128)a[ 4]) * a[ 6]) * 2 - + ((sp_uint128)a[ 5]) * a[ 5]; - sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] - + ((sp_uint128)a[ 4]) * a[ 7] - + ((sp_uint128)a[ 5]) * a[ 6]) * 2; - sp_uint128 t12 = (((sp_uint128)a[ 4]) * a[ 8] - + ((sp_uint128)a[ 5]) * a[ 7]) * 2 - + ((sp_uint128)a[ 6]) * a[ 6]; - sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] - + ((sp_uint128)a[ 6]) * a[ 7]) * 2; - sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 - + ((sp_uint128)a[ 7]) * a[ 7]; - sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; - sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; - - t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; - t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; - t3 += t2 >> 57; r[ 2] = t2 & 0x1ffffffffffffffL; - t4 += t3 >> 57; r[ 3] = t3 & 0x1ffffffffffffffL; - t5 += t4 >> 57; r[ 4] = t4 & 0x1ffffffffffffffL; - t6 += t5 >> 57; r[ 5] = t5 & 0x1ffffffffffffffL; - t7 += t6 >> 57; r[ 6] = t6 & 0x1ffffffffffffffL; - t8 += t7 >> 57; r[ 7] = t7 & 0x1ffffffffffffffL; - t9 += t8 >> 57; r[ 8] = t8 & 0x1ffffffffffffffL; - t10 += t9 >> 57; r[ 9] = t9 & 0x1ffffffffffffffL; - t11 += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffL; - t12 += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffL; - t13 += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffL; - t14 += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffL; - t15 += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffL; - t16 += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffL; - r[17] = (sp_digit)(t16 >> 57); - r[16] = t16 & 0x1ffffffffffffffL; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -4234,26 +4165,6 @@ SP_NOINLINE static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, (void)sp_2048_add_18(r + 9, r + 9, z1); } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z1[18]; - sp_digit* a1 = z1; - sp_digit* z2 = r + 18; - (void)sp_2048_add_9(a1, a, &a[9]); - sp_2048_sqr_9(z2, &a[9]); - sp_2048_sqr_9(z0, a); - sp_2048_sqr_9(z1, a1); - (void)sp_2048_sub_18(z1, z1, z2); - (void)sp_2048_sub_18(z1, z1, z0); - (void)sp_2048_add_18(r + 9, r + 9, z1); -} - /* Add b to a into r. 
(r = a + b) * * r A single precision integer. @@ -4336,6 +4247,99 @@ SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, (void)sp_2048_add_36(r + 18, r + 18, z1); } +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a) +{ + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = (((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] + + ((sp_uint128)a[ 6]) * a[ 7]) * 2; + sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; + + t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; + t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; + t3 += t2 >> 57; r[ 2] = t2 & 0x1ffffffffffffffL; + t4 += t3 >> 57; r[ 3] = t3 & 0x1ffffffffffffffL; + t5 += t4 >> 57; r[ 4] = t4 & 0x1ffffffffffffffL; + t6 += t5 >> 57; r[ 5] = t5 & 0x1ffffffffffffffL; + t7 += t6 >> 57; r[ 6] = t6 & 0x1ffffffffffffffL; + t8 += t7 >> 57; r[ 7] = t7 & 0x1ffffffffffffffL; + t9 += t8 >> 57; r[ 8] = t8 & 0x1ffffffffffffffL; + t10 += t9 >> 57; r[ 9] = t9 & 0x1ffffffffffffffL; + t11 += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffL; + t12 += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffL; + t13 += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffL; + t14 += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffL; + t15 += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffL; + t16 += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffL; + r[17] = (sp_digit)(t16 >> 57); + r[16] = t16 & 0x1ffffffffffffffL; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z1[18]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 18; + (void)sp_2048_add_9(a1, a, &a[9]); + sp_2048_sqr_9(z2, &a[9]); + sp_2048_sqr_9(z0, a); + sp_2048_sqr_9(z1, a1); + (void)sp_2048_sub_18(z1, z1, z2); + (void)sp_2048_sub_18(z1, z1, z0); + (void)sp_2048_add_18(r + 9, r + 9, z1); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -4607,7 +4611,7 @@ static void sp_2048_mont_reduce_18(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_18(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_18(r, a, b); @@ -4621,7 +4625,7 @@ static void sp_2048_mont_mul_18(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_18(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_18(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_18(r, a); @@ -4674,6 +4678,7 @@ SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, r[18] = (sp_digit)(t & 0x1ffffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -4700,6 +4705,7 @@ static void sp_2048_cond_add_18(sp_digit* r, const sp_digit* a, r[16] = a[16] + (b[16] & m); r[17] = a[17] + (b[17] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_2048_rshift_18(sp_digit* r, const sp_digit* a, byte n) @@ -5519,7 +5525,7 @@ static void sp_2048_mont_reduce_36(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_36(r, a, b); @@ -5533,7 +5539,7 @@ static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_36(r, a); @@ -5580,6 +5586,7 @@ SP_NOINLINE static void sp_2048_mul_d_72(sp_digit* r, const sp_digit* a, r[72] = (sp_digit)(t & 0x1ffffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -5608,6 +5615,7 @@ static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, r[34] = a[34] + (b[34] & m); r[35] = a[35] + (b[35] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_2048_rshift_36(sp_digit* r, const sp_digit* a, byte n) @@ -7969,7 +7977,7 @@ SP_NOINLINE static void sp_3072_mul_26(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_26(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_26(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_26(r, a, b); @@ -8024,7 +8032,7 @@ SP_NOINLINE static void sp_3072_sqr_26(sp_digit* r, const sp_digit* a) * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_3072_mont_sqr_26(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_26(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_26(r, a); @@ -8052,6 +8060,7 @@ SP_NOINLINE static void sp_3072_mul_d_26(sp_digit* r, const sp_digit* a, r[26] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -8069,6 +8078,7 @@ static void sp_3072_cond_add_26(sp_digit* r, const sp_digit* a, r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -8945,7 +8955,7 @@ static void sp_3072_mont_reduce_52(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_52(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_52(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_52(r, a, b); @@ -8959,7 +8969,7 @@ static void sp_3072_mont_mul_52(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_52(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_52(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_52(r, a); @@ -8987,6 +8997,7 @@ SP_NOINLINE static void sp_3072_mul_d_104(sp_digit* r, const sp_digit* a, r[104] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -9000,10 +9011,11 @@ static void sp_3072_cond_add_52(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 26; i++) { + for (i = 0; i < 52; i++) { r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -11173,79 +11185,6 @@ SP_NOINLINE static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, r[16] = t16 & 0x1ffffffffffffffL; } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a) -{ - sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; - sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; - sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 - + ((sp_uint128)a[ 1]) * a[ 1]; - sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] - + ((sp_uint128)a[ 1]) * a[ 2]) * 2; - sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] - + ((sp_uint128)a[ 1]) * a[ 3]) * 2 - + ((sp_uint128)a[ 2]) * a[ 2]; - sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] - + ((sp_uint128)a[ 1]) * a[ 4] - + ((sp_uint128)a[ 2]) * a[ 3]) * 2; - sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] - + ((sp_uint128)a[ 1]) * a[ 5] - + ((sp_uint128)a[ 2]) * a[ 4]) * 2 - + ((sp_uint128)a[ 3]) * a[ 3]; - sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] - + ((sp_uint128)a[ 1]) * a[ 6] - + ((sp_uint128)a[ 2]) * a[ 5] - + ((sp_uint128)a[ 3]) * a[ 4]) * 2; - sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] - + ((sp_uint128)a[ 1]) * a[ 7] - + ((sp_uint128)a[ 2]) * a[ 6] - + ((sp_uint128)a[ 3]) * a[ 5]) * 2 - + ((sp_uint128)a[ 4]) * a[ 4]; - sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] - + ((sp_uint128)a[ 2]) * a[ 7] - + ((sp_uint128)a[ 3]) * a[ 6] - + ((sp_uint128)a[ 4]) * a[ 5]) * 2; - sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] - + ((sp_uint128)a[ 3]) * a[ 7] - + ((sp_uint128)a[ 4]) * a[ 6]) * 2 - + ((sp_uint128)a[ 5]) * a[ 5]; - sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] - + ((sp_uint128)a[ 4]) * a[ 7] - + ((sp_uint128)a[ 5]) * a[ 6]) * 2; - sp_uint128 t12 = (((sp_uint128)a[ 4]) * a[ 8] - + ((sp_uint128)a[ 5]) * a[ 7]) * 2 - + ((sp_uint128)a[ 6]) * a[ 6]; - sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] - + ((sp_uint128)a[ 6]) * a[ 7]) * 2; - sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 - + ((sp_uint128)a[ 7]) * a[ 7]; - sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; - sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; - - t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; - t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; - t3 += t2 >> 57; r[ 2] = t2 & 0x1ffffffffffffffL; - t4 += t3 >> 57; r[ 3] = t3 & 0x1ffffffffffffffL; - t5 += t4 >> 57; r[ 4] = t4 & 0x1ffffffffffffffL; - t6 += t5 >> 57; r[ 5] = t5 & 0x1ffffffffffffffL; - t7 += t6 >> 57; r[ 6] = t6 & 0x1ffffffffffffffL; - t8 += t7 >> 57; r[ 7] = t7 & 0x1ffffffffffffffL; - t9 += t8 >> 57; r[ 8] = t8 & 0x1ffffffffffffffL; - t10 += t9 >> 57; r[ 9] = t9 & 0x1ffffffffffffffL; - t11 += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffL; - t12 += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffL; - t13 += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffL; - t14 += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffL; - t15 += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffL; - t16 += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffL; - r[17] = (sp_digit)(t16 >> 57); - r[16] = t16 & 0x1ffffffffffffffL; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -11372,48 +11311,6 @@ SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, (void)sp_3072_add_18(&r[36], &r[36], p4); } -/* Square a into r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a) -{ - sp_digit p0[18]; - sp_digit p1[18]; - sp_digit p2[18]; - sp_digit p3[18]; - sp_digit p4[18]; - sp_digit p5[18]; - sp_digit t0[18]; - sp_digit t1[18]; - sp_digit t2[18]; - sp_digit a0[9]; - sp_digit a1[9]; - sp_digit a2[9]; - (void)sp_3072_add_9(a0, a, &a[9]); - (void)sp_3072_add_9(a1, &a[9], &a[18]); - (void)sp_3072_add_9(a2, a0, &a[18]); - sp_3072_sqr_9(p0, a); - sp_3072_sqr_9(p2, &a[9]); - sp_3072_sqr_9(p4, &a[18]); - sp_3072_sqr_9(p1, a0); - sp_3072_sqr_9(p3, a1); - sp_3072_sqr_9(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*27U); - (void)sp_3072_sub_18(t0, p3, p2); - (void)sp_3072_sub_18(t1, p1, p2); - (void)sp_3072_sub_18(t2, p5, t0); - (void)sp_3072_sub_18(t2, t2, t1); - (void)sp_3072_sub_18(t0, t0, p4); - (void)sp_3072_sub_18(t1, t1, p0); - (void)sp_3072_add_18(r, r, p0); - (void)sp_3072_add_18(&r[9], &r[9], t1); - (void)sp_3072_add_18(&r[18], &r[18], t2); - (void)sp_3072_add_18(&r[27], &r[27], t0); - (void)sp_3072_add_18(&r[36], &r[36], p4); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -11528,6 +11425,121 @@ SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, (void)sp_3072_add_54(r + 27, r + 27, z1); } +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a) +{ + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = (((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] + + ((sp_uint128)a[ 6]) * a[ 7]) * 2; + sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; + + t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; + t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; + t3 += t2 >> 57; r[ 2] = t2 & 0x1ffffffffffffffL; + t4 += t3 >> 57; r[ 3] = t3 & 0x1ffffffffffffffL; + t5 += t4 >> 57; r[ 4] = t4 & 0x1ffffffffffffffL; + t6 += t5 >> 57; r[ 5] = t5 & 0x1ffffffffffffffL; + t7 
+= t6 >> 57; r[ 6] = t6 & 0x1ffffffffffffffL; + t8 += t7 >> 57; r[ 7] = t7 & 0x1ffffffffffffffL; + t9 += t8 >> 57; r[ 8] = t8 & 0x1ffffffffffffffL; + t10 += t9 >> 57; r[ 9] = t9 & 0x1ffffffffffffffL; + t11 += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffL; + t12 += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffL; + t13 += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffL; + t14 += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffL; + t15 += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffL; + t16 += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffL; + r[17] = (sp_digit)(t16 >> 57); + r[16] = t16 & 0x1ffffffffffffffL; +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + (void)sp_3072_add_9(a0, a, &a[9]); + (void)sp_3072_add_9(a1, &a[9], &a[18]); + (void)sp_3072_add_9(a2, a0, &a[18]); + sp_3072_sqr_9(p0, a); + sp_3072_sqr_9(p2, &a[9]); + sp_3072_sqr_9(p4, &a[18]); + sp_3072_sqr_9(p1, a0); + sp_3072_sqr_9(p3, a1); + sp_3072_sqr_9(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_3072_sub_18(t0, p3, p2); + (void)sp_3072_sub_18(t1, p1, p2); + (void)sp_3072_sub_18(t2, p5, t0); + (void)sp_3072_sub_18(t2, t2, t1); + (void)sp_3072_sub_18(t0, t0, p4); + (void)sp_3072_sub_18(t1, t1, p0); + (void)sp_3072_add_18(r, r, p0); + (void)sp_3072_add_18(&r[9], &r[9], t1); + (void)sp_3072_add_18(&r[18], &r[18], t2); + (void)sp_3072_add_18(&r[27], &r[27], t0); + (void)sp_3072_add_18(&r[36], &r[36], p4); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -11849,7 +11861,7 @@ static void sp_3072_mont_reduce_27(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_27(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_27(r, a, b); @@ -11863,7 +11875,7 @@ static void sp_3072_mont_mul_27(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_27(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_27(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_27(r, a); @@ -11919,6 +11931,7 @@ SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, r[27] = (sp_digit)(t & 0x1ffffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -11946,6 +11959,7 @@ static void sp_3072_cond_add_27(sp_digit* r, const sp_digit* a, r[25] = a[25] + (b[25] & m); r[26] = a[26] + (b[26] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_3072_rshift_27(sp_digit* r, const sp_digit* a, byte n) @@ -12773,7 +12787,7 @@ static void sp_3072_mont_reduce_54(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_54(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_54(r, a, b); @@ -12787,7 +12801,7 @@ static void sp_3072_mont_mul_54(sp_digit* r, const sp_digit* a, * m Modulus (prime). 
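/* Illustrative sketch only (not part of the patch): the column-wise squaring
 * pattern used by sp_3072_sqr_9 above (57-bit limbs; sp_4096_sqr_13 does the
 * same with 53-bit limbs), shown for three limbs.  Each column t[k] sums the
 * products a[i]*a[j] with i + j == k, cross products doubled, and the carry
 * is pushed into the next column before the limb is masked out.  Assumes a
 * compiler providing unsigned __int128; names are illustrative, not wolfSSL
 * identifiers. */
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;
#define MASK_57 ((1ULL << 57) - 1)

static void sqr_3(uint64_t* r, const uint64_t* a)
{
    u128 t0 = (u128)a[0] * a[0];
    u128 t1 = ((u128)a[0] * a[1]) * 2;
    u128 t2 = ((u128)a[0] * a[2]) * 2 + (u128)a[1] * a[1];
    u128 t3 = ((u128)a[1] * a[2]) * 2;
    u128 t4 = (u128)a[2] * a[2];

    t1 += t0 >> 57; r[0] = (uint64_t)t0 & MASK_57;
    t2 += t1 >> 57; r[1] = (uint64_t)t1 & MASK_57;
    t3 += t2 >> 57; r[2] = (uint64_t)t2 & MASK_57;
    t4 += t3 >> 57; r[3] = (uint64_t)t3 & MASK_57;
    r[5] = (uint64_t)(t4 >> 57);
    r[4] = (uint64_t)t4 & MASK_57;
}

int main(void)
{
    uint64_t a[3] = { 3, 4, 5 };   /* value = 3 + 4*2^57 + 5*2^114 */
    uint64_t r[6];
    sqr_3(r, a);
    /* (3 + 4x + 5x^2)^2 = 9 + 24x + 46x^2 + 40x^3 + 25x^4 with x = 2^57 */
    printf("%llu %llu %llu %llu %llu\n",
           (unsigned long long)r[0], (unsigned long long)r[1],
           (unsigned long long)r[2], (unsigned long long)r[3],
           (unsigned long long)r[4]);   /* 9 24 46 40 25 */
    return 0;
}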
* mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_54(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_54(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_54(r, a); @@ -12834,6 +12848,7 @@ SP_NOINLINE static void sp_3072_mul_d_108(sp_digit* r, const sp_digit* a, r[108] = (sp_digit)(t & 0x1ffffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -12864,6 +12879,7 @@ static void sp_3072_cond_add_54(sp_digit* r, const sp_digit* a, r[52] = a[52] + (b[52] & m); r[53] = a[53] + (b[53] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_3072_rshift_54(sp_digit* r, const sp_digit* a, byte n) @@ -15269,7 +15285,7 @@ SP_NOINLINE static void sp_4096_mul_35(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_35(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_35(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_35(r, a, b); @@ -15324,7 +15340,7 @@ SP_NOINLINE static void sp_4096_sqr_35(sp_digit* r, const sp_digit* a) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_35(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_35(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_35(r, a); @@ -15352,6 +15368,7 @@ SP_NOINLINE static void sp_4096_mul_d_35(sp_digit* r, const sp_digit* a, r[35] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -15369,6 +15386,7 @@ static void sp_4096_cond_add_35(sp_digit* r, const sp_digit* a, r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -16200,7 +16218,7 @@ static void sp_4096_mont_reduce_70(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_70(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_70(r, a, b); @@ -16214,7 +16232,7 @@ static void sp_4096_mont_mul_70(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_70(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_70(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_70(r, a); @@ -16242,6 +16260,7 @@ SP_NOINLINE static void sp_4096_mul_d_140(sp_digit* r, const sp_digit* a, r[140] = (sp_digit)t; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -16255,10 +16274,11 @@ static void sp_4096_cond_add_70(sp_digit* r, const sp_digit* a, { int i; - for (i = 0; i < 35; i++) { + for (i = 0; i < 70; i++) { r[i] = a[i] + (b[i] & m); } } +#endif /* WOLFSSL_SP_SMALL */ /* Add b to a into r. (r = a + b) * @@ -18348,133 +18368,6 @@ SP_NOINLINE static void sp_4096_mul_13(sp_digit* r, const sp_digit* a, r[24] = t24 & 0x1fffffffffffffL; } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_sqr_13(sp_digit* r, const sp_digit* a) -{ - sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; - sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; - sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 - + ((sp_uint128)a[ 1]) * a[ 1]; - sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] - + ((sp_uint128)a[ 1]) * a[ 2]) * 2; - sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] - + ((sp_uint128)a[ 1]) * a[ 3]) * 2 - + ((sp_uint128)a[ 2]) * a[ 2]; - sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] - + ((sp_uint128)a[ 1]) * a[ 4] - + ((sp_uint128)a[ 2]) * a[ 3]) * 2; - sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] - + ((sp_uint128)a[ 1]) * a[ 5] - + ((sp_uint128)a[ 2]) * a[ 4]) * 2 - + ((sp_uint128)a[ 3]) * a[ 3]; - sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] - + ((sp_uint128)a[ 1]) * a[ 6] - + ((sp_uint128)a[ 2]) * a[ 5] - + ((sp_uint128)a[ 3]) * a[ 4]) * 2; - sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] - + ((sp_uint128)a[ 1]) * a[ 7] - + ((sp_uint128)a[ 2]) * a[ 6] - + ((sp_uint128)a[ 3]) * a[ 5]) * 2 - + ((sp_uint128)a[ 4]) * a[ 4]; - sp_uint128 t9 = (((sp_uint128)a[ 0]) * a[ 9] - + ((sp_uint128)a[ 1]) * a[ 8] - + ((sp_uint128)a[ 2]) * a[ 7] - + ((sp_uint128)a[ 3]) * a[ 6] - + ((sp_uint128)a[ 4]) * a[ 5]) * 2; - sp_uint128 t10 = (((sp_uint128)a[ 0]) * a[10] - + ((sp_uint128)a[ 1]) * a[ 9] - + ((sp_uint128)a[ 2]) * a[ 8] - + ((sp_uint128)a[ 3]) * a[ 7] - + ((sp_uint128)a[ 4]) * a[ 6]) * 2 - + ((sp_uint128)a[ 5]) * a[ 5]; - sp_uint128 t11 = (((sp_uint128)a[ 0]) * a[11] - + ((sp_uint128)a[ 1]) * a[10] - + ((sp_uint128)a[ 2]) * a[ 9] - + ((sp_uint128)a[ 3]) * a[ 8] - + ((sp_uint128)a[ 4]) * a[ 7] - + ((sp_uint128)a[ 5]) * a[ 6]) * 2; - sp_uint128 t12 = (((sp_uint128)a[ 0]) * a[12] - + ((sp_uint128)a[ 1]) * a[11] - + ((sp_uint128)a[ 2]) * a[10] - + ((sp_uint128)a[ 3]) * a[ 9] - + ((sp_uint128)a[ 4]) * a[ 8] - + ((sp_uint128)a[ 5]) * a[ 7]) * 2 - + ((sp_uint128)a[ 6]) * a[ 6]; - sp_uint128 t13 = (((sp_uint128)a[ 1]) * a[12] - + ((sp_uint128)a[ 2]) * a[11] - + ((sp_uint128)a[ 3]) * a[10] - + ((sp_uint128)a[ 4]) * a[ 9] - + ((sp_uint128)a[ 5]) * a[ 8] - + ((sp_uint128)a[ 6]) * a[ 7]) * 2; - sp_uint128 t14 = (((sp_uint128)a[ 2]) * a[12] - + ((sp_uint128)a[ 3]) * a[11] - + ((sp_uint128)a[ 4]) * a[10] - + ((sp_uint128)a[ 5]) * a[ 9] - + ((sp_uint128)a[ 6]) * a[ 8]) * 2 - + ((sp_uint128)a[ 7]) * a[ 7]; - sp_uint128 t15 = (((sp_uint128)a[ 3]) * a[12] - + ((sp_uint128)a[ 4]) * a[11] - + ((sp_uint128)a[ 5]) * a[10] - + ((sp_uint128)a[ 6]) * a[ 9] - + ((sp_uint128)a[ 7]) * a[ 8]) * 2; - sp_uint128 t16 = (((sp_uint128)a[ 4]) * a[12] - + ((sp_uint128)a[ 5]) * a[11] - + ((sp_uint128)a[ 6]) * a[10] - + ((sp_uint128)a[ 7]) * a[ 9]) * 2 - + ((sp_uint128)a[ 8]) * a[ 8]; - sp_uint128 t17 = (((sp_uint128)a[ 5]) * a[12] - + ((sp_uint128)a[ 6]) * a[11] - + ((sp_uint128)a[ 7]) * a[10] - + ((sp_uint128)a[ 8]) * a[ 9]) * 2; - sp_uint128 t18 = (((sp_uint128)a[ 6]) * a[12] - + ((sp_uint128)a[ 7]) * a[11] - + ((sp_uint128)a[ 8]) * a[10]) * 2 - + ((sp_uint128)a[ 9]) * a[ 9]; - sp_uint128 t19 = (((sp_uint128)a[ 7]) * a[12] - + ((sp_uint128)a[ 8]) * a[11] - + ((sp_uint128)a[ 9]) * a[10]) * 2; - sp_uint128 t20 = (((sp_uint128)a[ 8]) * a[12] - + ((sp_uint128)a[ 9]) * a[11]) * 2 - + ((sp_uint128)a[10]) * a[10]; - sp_uint128 t21 = (((sp_uint128)a[ 9]) * a[12] - + ((sp_uint128)a[10]) * a[11]) * 2; - sp_uint128 t22 = (((sp_uint128)a[10]) * a[12]) * 2 - + ((sp_uint128)a[11]) * a[11]; - sp_uint128 t23 = (((sp_uint128)a[11]) * a[12]) * 2; - sp_uint128 t24 = ((sp_uint128)a[12]) * a[12]; - - t1 += t0 >> 53; r[ 0] = t0 & 
0x1fffffffffffffL; - t2 += t1 >> 53; r[ 1] = t1 & 0x1fffffffffffffL; - t3 += t2 >> 53; r[ 2] = t2 & 0x1fffffffffffffL; - t4 += t3 >> 53; r[ 3] = t3 & 0x1fffffffffffffL; - t5 += t4 >> 53; r[ 4] = t4 & 0x1fffffffffffffL; - t6 += t5 >> 53; r[ 5] = t5 & 0x1fffffffffffffL; - t7 += t6 >> 53; r[ 6] = t6 & 0x1fffffffffffffL; - t8 += t7 >> 53; r[ 7] = t7 & 0x1fffffffffffffL; - t9 += t8 >> 53; r[ 8] = t8 & 0x1fffffffffffffL; - t10 += t9 >> 53; r[ 9] = t9 & 0x1fffffffffffffL; - t11 += t10 >> 53; r[10] = t10 & 0x1fffffffffffffL; - t12 += t11 >> 53; r[11] = t11 & 0x1fffffffffffffL; - t13 += t12 >> 53; r[12] = t12 & 0x1fffffffffffffL; - t14 += t13 >> 53; r[13] = t13 & 0x1fffffffffffffL; - t15 += t14 >> 53; r[14] = t14 & 0x1fffffffffffffL; - t16 += t15 >> 53; r[15] = t15 & 0x1fffffffffffffL; - t17 += t16 >> 53; r[16] = t16 & 0x1fffffffffffffL; - t18 += t17 >> 53; r[17] = t17 & 0x1fffffffffffffL; - t19 += t18 >> 53; r[18] = t18 & 0x1fffffffffffffL; - t20 += t19 >> 53; r[19] = t19 & 0x1fffffffffffffL; - t21 += t20 >> 53; r[20] = t20 & 0x1fffffffffffffL; - t22 += t21 >> 53; r[21] = t21 & 0x1fffffffffffffL; - t23 += t22 >> 53; r[22] = t22 & 0x1fffffffffffffL; - t24 += t23 >> 53; r[23] = t23 & 0x1fffffffffffffL; - r[25] = (sp_digit)(t24 >> 53); - r[24] = t24 & 0x1fffffffffffffL; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -18605,48 +18498,6 @@ SP_NOINLINE static void sp_4096_mul_39(sp_digit* r, const sp_digit* a, (void)sp_4096_add_26(&r[52], &r[52], p4); } -/* Square a into r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_39(sp_digit* r, const sp_digit* a) -{ - sp_digit p0[26]; - sp_digit p1[26]; - sp_digit p2[26]; - sp_digit p3[26]; - sp_digit p4[26]; - sp_digit p5[26]; - sp_digit t0[26]; - sp_digit t1[26]; - sp_digit t2[26]; - sp_digit a0[13]; - sp_digit a1[13]; - sp_digit a2[13]; - (void)sp_4096_add_13(a0, a, &a[13]); - (void)sp_4096_add_13(a1, &a[13], &a[26]); - (void)sp_4096_add_13(a2, a0, &a[26]); - sp_4096_sqr_13(p0, a); - sp_4096_sqr_13(p2, &a[13]); - sp_4096_sqr_13(p4, &a[26]); - sp_4096_sqr_13(p1, a0); - sp_4096_sqr_13(p3, a1); - sp_4096_sqr_13(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*39U); - (void)sp_4096_sub_26(t0, p3, p2); - (void)sp_4096_sub_26(t1, p1, p2); - (void)sp_4096_sub_26(t2, p5, t0); - (void)sp_4096_sub_26(t2, t2, t1); - (void)sp_4096_sub_26(t0, t0, p4); - (void)sp_4096_sub_26(t1, t1, p0); - (void)sp_4096_add_26(r, r, p0); - (void)sp_4096_add_26(&r[13], &r[13], t1); - (void)sp_4096_add_26(&r[26], &r[26], t2); - (void)sp_4096_add_26(&r[39], &r[39], t0); - (void)sp_4096_add_26(&r[52], &r[52], p4); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -18765,6 +18616,175 @@ SP_NOINLINE static void sp_4096_mul_78(sp_digit* r, const sp_digit* a, (void)sp_4096_add_78(r + 39, r + 39, z1); } +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_4096_sqr_13(sp_digit* r, const sp_digit* a) +{ + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 0]) * a[ 9] + + ((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 0]) * a[10] + + ((sp_uint128)a[ 1]) * a[ 9] + + ((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 0]) * a[11] + + ((sp_uint128)a[ 1]) * a[10] + + ((sp_uint128)a[ 2]) * a[ 9] + + ((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = (((sp_uint128)a[ 0]) * a[12] + + ((sp_uint128)a[ 1]) * a[11] + + ((sp_uint128)a[ 2]) * a[10] + + ((sp_uint128)a[ 3]) * a[ 9] + + ((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 1]) * a[12] + + ((sp_uint128)a[ 2]) * a[11] + + ((sp_uint128)a[ 3]) * a[10] + + ((sp_uint128)a[ 4]) * a[ 9] + + ((sp_uint128)a[ 5]) * a[ 8] + + ((sp_uint128)a[ 6]) * a[ 7]) * 2; + sp_uint128 t14 = (((sp_uint128)a[ 2]) * a[12] + + ((sp_uint128)a[ 3]) * a[11] + + ((sp_uint128)a[ 4]) * a[10] + + ((sp_uint128)a[ 5]) * a[ 9] + + ((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 3]) * a[12] + + ((sp_uint128)a[ 4]) * a[11] + + ((sp_uint128)a[ 5]) * a[10] + + ((sp_uint128)a[ 6]) * a[ 9] + + ((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = (((sp_uint128)a[ 4]) * a[12] + + ((sp_uint128)a[ 5]) * a[11] + + ((sp_uint128)a[ 6]) * a[10] + + ((sp_uint128)a[ 7]) * a[ 9]) * 2 + + ((sp_uint128)a[ 8]) * a[ 8]; + sp_uint128 t17 = (((sp_uint128)a[ 5]) * a[12] + + ((sp_uint128)a[ 6]) * a[11] + + ((sp_uint128)a[ 7]) * a[10] + + ((sp_uint128)a[ 8]) * a[ 9]) * 2; + sp_uint128 t18 = (((sp_uint128)a[ 6]) * a[12] + + ((sp_uint128)a[ 7]) * a[11] + + ((sp_uint128)a[ 8]) * a[10]) * 2 + + ((sp_uint128)a[ 9]) * a[ 9]; + sp_uint128 t19 = (((sp_uint128)a[ 7]) * a[12] + + ((sp_uint128)a[ 8]) * a[11] + + ((sp_uint128)a[ 9]) * a[10]) * 2; + sp_uint128 t20 = (((sp_uint128)a[ 8]) * a[12] + + ((sp_uint128)a[ 9]) * a[11]) * 2 + + ((sp_uint128)a[10]) * a[10]; + sp_uint128 t21 = (((sp_uint128)a[ 9]) * a[12] + + ((sp_uint128)a[10]) * a[11]) * 2; + sp_uint128 t22 = (((sp_uint128)a[10]) * a[12]) * 2 + + ((sp_uint128)a[11]) * a[11]; + sp_uint128 t23 = (((sp_uint128)a[11]) * a[12]) * 2; + sp_uint128 t24 = ((sp_uint128)a[12]) * a[12]; + + t1 += t0 >> 53; r[ 0] = t0 & 
0x1fffffffffffffL; + t2 += t1 >> 53; r[ 1] = t1 & 0x1fffffffffffffL; + t3 += t2 >> 53; r[ 2] = t2 & 0x1fffffffffffffL; + t4 += t3 >> 53; r[ 3] = t3 & 0x1fffffffffffffL; + t5 += t4 >> 53; r[ 4] = t4 & 0x1fffffffffffffL; + t6 += t5 >> 53; r[ 5] = t5 & 0x1fffffffffffffL; + t7 += t6 >> 53; r[ 6] = t6 & 0x1fffffffffffffL; + t8 += t7 >> 53; r[ 7] = t7 & 0x1fffffffffffffL; + t9 += t8 >> 53; r[ 8] = t8 & 0x1fffffffffffffL; + t10 += t9 >> 53; r[ 9] = t9 & 0x1fffffffffffffL; + t11 += t10 >> 53; r[10] = t10 & 0x1fffffffffffffL; + t12 += t11 >> 53; r[11] = t11 & 0x1fffffffffffffL; + t13 += t12 >> 53; r[12] = t12 & 0x1fffffffffffffL; + t14 += t13 >> 53; r[13] = t13 & 0x1fffffffffffffL; + t15 += t14 >> 53; r[14] = t14 & 0x1fffffffffffffL; + t16 += t15 >> 53; r[15] = t15 & 0x1fffffffffffffL; + t17 += t16 >> 53; r[16] = t16 & 0x1fffffffffffffL; + t18 += t17 >> 53; r[17] = t17 & 0x1fffffffffffffL; + t19 += t18 >> 53; r[18] = t18 & 0x1fffffffffffffL; + t20 += t19 >> 53; r[19] = t19 & 0x1fffffffffffffL; + t21 += t20 >> 53; r[20] = t20 & 0x1fffffffffffffL; + t22 += t21 >> 53; r[21] = t21 & 0x1fffffffffffffL; + t23 += t22 >> 53; r[22] = t22 & 0x1fffffffffffffL; + t24 += t23 >> 53; r[23] = t23 & 0x1fffffffffffffL; + r[25] = (sp_digit)(t24 >> 53); + r[24] = t24 & 0x1fffffffffffffL; +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_39(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[26]; + sp_digit p1[26]; + sp_digit p2[26]; + sp_digit p3[26]; + sp_digit p4[26]; + sp_digit p5[26]; + sp_digit t0[26]; + sp_digit t1[26]; + sp_digit t2[26]; + sp_digit a0[13]; + sp_digit a1[13]; + sp_digit a2[13]; + (void)sp_4096_add_13(a0, a, &a[13]); + (void)sp_4096_add_13(a1, &a[13], &a[26]); + (void)sp_4096_add_13(a2, a0, &a[26]); + sp_4096_sqr_13(p0, a); + sp_4096_sqr_13(p2, &a[13]); + sp_4096_sqr_13(p4, &a[26]); + sp_4096_sqr_13(p1, a0); + sp_4096_sqr_13(p3, a1); + sp_4096_sqr_13(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*39U); + (void)sp_4096_sub_26(t0, p3, p2); + (void)sp_4096_sub_26(t1, p1, p2); + (void)sp_4096_sub_26(t2, p5, t0); + (void)sp_4096_sub_26(t2, t2, t1); + (void)sp_4096_sub_26(t0, t0, p4); + (void)sp_4096_sub_26(t1, t1, p0); + (void)sp_4096_add_26(r, r, p0); + (void)sp_4096_add_26(&r[13], &r[13], t1); + (void)sp_4096_add_26(&r[26], &r[26], t2); + (void)sp_4096_add_26(&r[39], &r[39], t0); + (void)sp_4096_add_26(&r[52], &r[52], p4); +} + /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -19111,7 +19131,7 @@ static void sp_4096_mont_reduce_39(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_39(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_39(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_39(r, a, b); @@ -19125,7 +19145,7 @@ static void sp_4096_mont_mul_39(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_39(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_39(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_39(r, a); @@ -19181,6 +19201,7 @@ SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a, r[39] = (sp_digit)(t & 0x1fffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -19212,6 +19233,7 @@ static void sp_4096_cond_add_39(sp_digit* r, const sp_digit* a, r[37] = a[37] + (b[37] & m); r[38] = a[38] + (b[38] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_4096_rshift_39(sp_digit* r, const sp_digit* a, byte n) @@ -20012,7 +20034,7 @@ static void sp_4096_mont_reduce_78(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_78(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_78(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_78(r, a, b); @@ -20026,7 +20048,7 @@ static void sp_4096_mont_mul_78(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_78(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_78(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_78(r, a); @@ -20073,6 +20095,7 @@ SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a, r[156] = (sp_digit)(t & 0x1fffffffffffffL); } +#ifndef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -20103,6 +20126,7 @@ static void sp_4096_cond_add_78(sp_digit* r, const sp_digit* a, r[76] = a[76] + (b[76] & m); r[77] = a[77] + (b[77] & m); } +#endif /* !WOLFSSL_SP_SMALL */ SP_NOINLINE static void sp_4096_rshift_78(sp_digit* r, const sp_digit* a, byte n) @@ -22625,7 +22649,7 @@ static void sp_256_mont_reduce_5(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_mul_5(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_256_mul_5(r, a, b); @@ -22639,7 +22663,7 @@ static void sp_256_mont_mul_5(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_256_mont_sqr_5(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_256_mont_sqr_5(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_256_sqr_5(r, a); @@ -22837,6 +22861,7 @@ static void sp_256_mont_tpl_5(sp_digit* r, const sp_digit* a, const sp_digit* m) sp_256_norm_5(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -22848,20 +22873,33 @@ static void sp_256_mont_tpl_5(sp_digit* r, const sp_digit* a, const sp_digit* m) static void sp_256_cond_add_5(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 5; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_256_cond_add_5(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ r[ 0] = a[ 0] + (b[ 0] & m); r[ 1] = a[ 1] + (b[ 1] & m); r[ 2] = a[ 2] + (b[ 2] & m); r[ 3] = a[ 3] + (b[ 3] & m); r[ 4] = a[ 4] + (b[ 4] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -29348,7 +29386,7 @@ static void sp_384_mont_reduce_7(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). 
* mp Montgomery mulitplier. */ -static void sp_384_mont_mul_7(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_7(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_7(r, a, b); @@ -29362,7 +29400,7 @@ static void sp_384_mont_mul_7(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_7(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_7(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_7(r, a); @@ -29576,6 +29614,7 @@ static void sp_384_mont_tpl_7(sp_digit* r, const sp_digit* a, const sp_digit* m) sp_384_norm_7(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -29587,13 +29626,26 @@ static void sp_384_mont_tpl_7(sp_digit* r, const sp_digit* a, const sp_digit* m) static void sp_384_cond_add_7(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 7; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_384_cond_add_7(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ r[ 0] = a[ 0] + (b[ 0] & m); r[ 1] = a[ 1] + (b[ 1] & m); r[ 2] = a[ 2] + (b[ 2] & m); @@ -29601,8 +29653,8 @@ static void sp_384_cond_add_7(sp_digit* r, const sp_digit* a, r[ 4] = a[ 4] + (b[ 4] & m); r[ 5] = a[ 5] + (b[ 5] & m); r[ 6] = a[ 6] + (b[ 6] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -36701,7 +36753,7 @@ static void sp_521_mont_reduce_order_9(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_mul_9(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_521_mul_9(r, a, b); @@ -36715,7 +36767,7 @@ static void sp_521_mont_mul_9(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_521_sqr_9(r, a); @@ -36926,6 +36978,7 @@ static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) sp_521_norm_9(r); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -36937,13 +36990,26 @@ static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) static void sp_521_cond_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 9; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static void sp_521_cond_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ r[ 0] = a[ 0] + (b[ 0] & m); r[ 1] = a[ 1] + (b[ 1] & m); r[ 2] = a[ 2] + (b[ 2] & m); @@ -36953,8 +37019,8 @@ static void sp_521_cond_add_9(sp_digit* r, const sp_digit* a, r[ 6] = a[ 6] + (b[ 6] & m); r[ 7] = a[ 7] + (b[ 7] & m); r[ 8] = a[ 8] + (b[ 8] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * @@ -43766,6 +43832,7 @@ SP_NOINLINE static void sp_1024_mul_d_36(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -43777,13 +43844,26 @@ SP_NOINLINE static void sp_1024_mul_d_36(sp_digit* r, const sp_digit* a, static void sp_1024_cond_add_18(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; for (i = 0; i < 18; i++) { r[i] = a[i] + (b[i] & m); } -#else +} +#endif /* WOLFSSL_SP_SMALL */ + +#ifndef WOLFSSL_SP_SMALL +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_1024_cond_add_18(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ int i; for (i = 0; i < 16; i += 8) { @@ -43798,8 +43878,8 @@ static void sp_1024_cond_add_18(sp_digit* r, const sp_digit* a, } r[16] = a[16] + (b[16] & m); r[17] = a[17] + (b[17] & m); -#endif /* WOLFSSL_SP_SMALL */ } +#endif /* !WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Sub b from a into r. (r = a - b) @@ -44566,7 +44646,7 @@ static void sp_1024_mont_reduce_18(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_18(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_18(r, a, b); @@ -44580,7 +44660,7 @@ static void sp_1024_mont_mul_18(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
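/* Illustrative sketch only (not part of the patch): how the sp_*_cond_add_*
 * helpers being split into WOLFSSL_SP_SMALL / non-small variants use their
 * mask argument.  m is all ones (-1) to add b and all zeros to leave a
 * unchanged, so the selection is branch-free and constant-time.  Digit type
 * and length here are placeholders, not wolfSSL's sp_digit sizes. */
#include <stdint.h>
#include <stdio.h>

static void cond_add(uint32_t* r, const uint32_t* a, const uint32_t* b,
                     uint32_t m, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        r[i] = a[i] + (b[i] & m);   /* b[i] & 0 == 0, b[i] & ~0u == b[i] */
    }
}

int main(void)
{
    uint32_t a[2] = { 1, 2 };
    uint32_t b[2] = { 10, 20 };
    uint32_t r[2];

    cond_add(r, a, b, (uint32_t)0 - 1u, 2);   /* all-ones mask: r = a + b */
    printf("%u %u\n", r[0], r[1]);            /* 11 22 */
    cond_add(r, a, b, 0u, 2);                 /* zero mask: r = a */
    printf("%u %u\n", r[0], r[1]);            /* 1 2 */
    return 0;
}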
*/ -static void sp_1024_mont_sqr_18(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_18(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_18(r, a); diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index e373269c9..cbff69c33 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -57,19 +57,19 @@ #endif #ifdef WOLFSSL_SP_ARM_CORTEX_M_ASM -#define SP_PRINT_NUM(var, name, total, words, bits) \ - do { \ - int ii; \ - fprintf(stderr, name "=0x"); \ - for (ii = words - 1; ii >= 0; ii--) \ - fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ - fprintf(stderr, "\n"); \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + fprintf(stderr, name "=0x"); \ + for (ii = ((bits + 31) / 32) - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ } while (0) -#define SP_PRINT_VAL(var, name) \ +#define SP_PRINT_VAL(var, name) \ fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) -#define SP_PRINT_INT(var, name) \ +#define SP_PRINT_INT(var, name) \ fprintf(stderr, name "=%d\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) @@ -740,357 +740,6 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, ); } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) -{ - sp_digit tmp_arr[8]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - /* A[0] * A[0] */ - "ldr r6, [%[a], #0]\n\t" - "umull r3, r4, r6, r6\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[1] */ - "ldr r8, [%[a], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[2] */ - "ldr r6, [%[a], #8]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[5] */ - "ldr r6, [%[a], 
#0]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[3] */ - "ldr r6, [%[a], #12]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[4] */ - "ldr r6, [%[a], #16]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * A[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[3] * A[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull 
r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[5] * A[5] */ - "ldr r6, [%[a], #20]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * A[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * A[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * A[6] */ - "ldr r6, [%[a], #24]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * A[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * A[7] */ - "ldr r6, [%[a], #28]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" - : - : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" - ); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. 
@@ -1295,7 +944,7 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, sp_digit z1[16]; sp_digit a1[8]; sp_digit b1[8]; - sp_digit z2[16]; + sp_digit* z2 = r + 16; sp_digit u; sp_digit ca; sp_digit cb; @@ -1303,45 +952,22 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, ca = sp_2048_add_8(a1, a, &a[8]); cb = sp_2048_add_8(b1, b, &b[8]); u = ca & cb; - sp_2048_mul_8(z1, a1, b1); + sp_2048_mul_8(z2, &a[8], &b[8]); sp_2048_mul_8(z0, a, b); - sp_2048_mask_8(r + 16, a1, 0 - cb); + sp_2048_mul_8(z1, a1, b1); + + u += sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(z1, z2); + sp_2048_mask_8(a1, a1, 0 - cb); + u += sp_2048_add_8(z1 + 8, z1 + 8, a1); sp_2048_mask_8(b1, b1, 0 - ca); - u += sp_2048_add_8(r + 16, r + 16, b1); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); - u += sp_2048_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - (void)sp_2048_add_16(r + 16, r + 16, z2); -} + u += sp_2048_add_8(z1 + 8, z1 + 8, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_2048_add_8(a1, a, &a[8]); - sp_2048_sqr_8(z1, a1); - sp_2048_sqr_8(z2, &a[8]); - sp_2048_sqr_8(z0, a); - sp_2048_mask_8(r + 16, a1, 0 - u); - u += sp_2048_add_8(r + 16, r + 16, r + 16); - u += sp_2048_sub_in_place_16(z1, z2); - u += sp_2048_sub_in_place_16(z1, z0); u += sp_2048_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - (void)sp_2048_add_16(r + 16, r + 16, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (8 - 1)); + a1[0] = u; + (void)sp_2048_add_8(r + 24, r + 24, a1); } /* Sub b from a into r. (r = a - b) @@ -1590,7 +1216,7 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit u; sp_digit ca; sp_digit cb; @@ -1598,45 +1224,22 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, ca = sp_2048_add_16(a1, a, &a[16]); cb = sp_2048_add_16(b1, b, &b[16]); u = ca & cb; - sp_2048_mul_16(z1, a1, b1); + sp_2048_mul_16(z2, &a[16], &b[16]); sp_2048_mul_16(z0, a, b); - sp_2048_mask_16(r + 32, a1, 0 - cb); + sp_2048_mul_16(z1, a1, b1); + + u += sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(z1, z2); + sp_2048_mask_16(a1, a1, 0 - cb); + u += sp_2048_add_16(z1 + 16, z1 + 16, a1); sp_2048_mask_16(b1, b1, 0 - ca); - u += sp_2048_add_16(r + 32, r + 32, b1); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); - u += sp_2048_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_2048_add_32(r + 32, r + 32, z2); -} + u += sp_2048_add_16(z1 + 16, z1 + 16, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[32]; - sp_digit z1[32]; - sp_digit a1[16]; - sp_digit u; - - u = sp_2048_add_16(a1, a, &a[16]); - sp_2048_sqr_16(z1, a1); - sp_2048_sqr_16(z2, &a[16]); - sp_2048_sqr_16(z0, a); - sp_2048_mask_16(r + 32, a1, 0 - u); - u += sp_2048_add_16(r + 32, r + 32, r + 32); - u += sp_2048_sub_in_place_32(z1, z2); - u += sp_2048_sub_in_place_32(z1, z0); u += sp_2048_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_2048_add_32(r + 32, r + 32, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (16 - 1)); + a1[0] = u; + (void)sp_2048_add_16(r + 48, r + 48, a1); } /* Sub b from a into r. (r = a - b) @@ -2045,7 +1648,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, sp_digit z1[64]; sp_digit a1[32]; sp_digit b1[32]; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit u; sp_digit ca; sp_digit cb; @@ -2053,18 +1656,648 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, ca = sp_2048_add_32(a1, a, &a[32]); cb = sp_2048_add_32(b1, b, &b[32]); u = ca & cb; - sp_2048_mul_32(z1, a1, b1); + sp_2048_mul_32(z2, &a[32], &b[32]); sp_2048_mul_32(z0, a, b); - sp_2048_mask_32(r + 64, a1, 0 - cb); - sp_2048_mask_32(b1, b1, 0 - ca); - u += sp_2048_add_32(r + 64, r + 64, b1); - u += sp_2048_sub_in_place_64(z1, z2); + sp_2048_mul_32(z1, a1, b1); + u += sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(z1, z2); + sp_2048_mask_32(a1, a1, 0 - cb); + u += sp_2048_add_32(z1 + 32, z1 + 32, a1); + sp_2048_mask_32(b1, b1, 0 - ca); + u += sp_2048_add_32(z1 + 32, z1 + 32, b1); + u += sp_2048_add_64(r + 32, r + 32, z1); - r[96] = u; - XMEMSET(r + 96 + 1, 0, sizeof(sp_digit) * (32 - 1)); - (void)sp_2048_add_64(r + 64, r + 64, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (32 - 1)); + a1[0] = u; + (void)sp_2048_add_32(r + 96, r + 96, a1); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
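/* Illustrative sketch only (not part of the patch): the Karatsuba
 * recombination order used by the rewritten sp_2048_mul_16/_32/_64 above,
 * shown on 32-bit operands split into 16-bit halves so the arithmetic fits
 * in plain C.  The high product z2 is written straight into the top of the
 * result, the middle term is reduced by z0 and z2, then added in at the
 * half-width offset; the real code additionally folds the one-word carries
 * of (a0+a1) and (b0+b1) back in with masked adds.  Names are illustrative. */
#include <stdint.h>
#include <stdio.h>

static uint64_t karatsuba_mul32(uint32_t a, uint32_t b)
{
    uint32_t a0 = a & 0xffffu, a1 = a >> 16;
    uint32_t b0 = b & 0xffffu, b1 = b >> 16;
    uint64_t z0 = (uint64_t)a0 * b0;                 /* low  half * low  half */
    uint64_t z2 = (uint64_t)a1 * b1;                 /* high half * high half */
    uint64_t z1 = (uint64_t)(a0 + a1) * (b0 + b1);   /* at most 34 bits */

    z1 -= z0;                                        /* z1 = a0*b1 + a1*b0 */
    z1 -= z2;
    return (z2 << 32) + (z1 << 16) + z0;             /* z2 top, z1 middle, z0 low */
}

int main(void)
{
    uint32_t a = 0x12345678u, b = 0x9abcdef0u;
    printf("%d\n", karatsuba_mul32(a, b) == (uint64_t)a * b);   /* prints 1 */
    return 0;
}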
+ */ +SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) +{ + sp_digit tmp_arr[8]; + sp_digit* tmp = tmp_arr; + __asm__ __volatile__ ( + /* A[0] * A[0] */ + "ldr r6, [%[a], #0]\n\t" + "umull r3, r4, r6, r6\n\t" + "mov r5, #0\n\t" + "str r3, [%[tmp], #0]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[1] */ + "ldr r8, [%[a], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[tmp], #4]\n\t" + "mov r4, #0\n\t" + /* A[0] * A[2] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[1] */ + "ldr r6, [%[a], #4]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[tmp], #8]\n\t" + "mov r5, #0\n\t" + /* A[0] * A[3] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[2] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc r5, r5, r11\n\t" + "str r3, [%[tmp], #12]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[4] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[3] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[2] */ + "ldr r6, [%[a], #8]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[tmp], #16]\n\t" + "mov r4, #0\n\t" + /* A[0] * A[5] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[4] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[3] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r5, r5, r9\n\t" + "adcs r3, r3, r10\n\t" + "adc r4, r4, r11\n\t" + "str r5, [%[tmp], #20]\n\t" + "mov r5, #0\n\t" + /* A[0] * A[6] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[5] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[4] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[3] */ + "ldr r6, [%[a], #12]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc 
r5, r5, r11\n\t" + "str r3, [%[tmp], #24]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[7] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[6] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[5] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[4] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[tmp], #28]\n\t" + "mov r4, #0\n\t" + /* A[1] * A[7] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[2] * A[6] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[5] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[4] * A[4] */ + "ldr r6, [%[a], #16]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r5, r5, r9\n\t" + "adcs r3, r3, r10\n\t" + "adc r4, r4, r11\n\t" + "str r5, [%[r], #32]\n\t" + "mov r5, #0\n\t" + /* A[2] * A[7] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[3] * A[6] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[4] * A[5] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc r5, r5, r11\n\t" + "str r3, [%[r], #36]\n\t" + "mov r3, #0\n\t" + /* A[3] * A[7] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[4] * A[6] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[5] * A[5] */ + "ldr r6, [%[a], #20]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[r], #40]\n\t" + "mov r4, #0\n\t" + /* A[4] * A[7] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * A[6] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, 
r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[r], #44]\n\t" + "mov r5, #0\n\t" + /* A[5] * A[7] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * A[6] */ + "ldr r6, [%[a], #24]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[r], #48]\n\t" + "mov r3, #0\n\t" + /* A[6] * A[7] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[r], #52]\n\t" + "mov r4, #0\n\t" + /* A[7] * A[7] */ + "ldr r6, [%[a], #28]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r8\n\t" + "str r5, [%[r], #56]\n\t" + "str r3, [%[r], #60]\n\t" + /* Transfer tmp to r */ + "ldr r3, [%[tmp], #0]\n\t" + "ldr r4, [%[tmp], #4]\n\t" + "ldr r5, [%[tmp], #8]\n\t" + "ldr r6, [%[tmp], #12]\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[tmp], #16]\n\t" + "ldr r4, [%[tmp], #20]\n\t" + "ldr r5, [%[tmp], #24]\n\t" + "ldr r6, [%[tmp], #28]\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + : + : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) + : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" + ); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 16; + sp_digit z1[16]; + sp_digit* a1 = z1; + sp_digit zero[8]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 8); + + mask = sp_2048_sub_8(a1, a, &a[8]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_8(a1, p1, p2); + + sp_2048_sqr_8(z2, &a[8]); + sp_2048_sqr_8(z0, a); + sp_2048_sqr_8(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_16(z1, z2); + u -= sp_2048_sub_in_place_16(z1, z0); + u += sp_2048_sub_in_place_16(r + 8, z1); + zero[0] = u; + (void)sp_2048_add_8(r + 24, r + 24, zero); +} + +/* Sub b from a into r. 
(r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 32; + sp_digit z1[32]; + sp_digit* a1 = z1; + sp_digit zero[16]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_2048_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_16(a1, p1, p2); + + sp_2048_sqr_16(z2, &a[16]); + sp_2048_sqr_16(z0, a); + sp_2048_sqr_16(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_32(z1, z2); + u -= sp_2048_sub_in_place_32(z1, z0); + u += sp_2048_sub_in_place_32(r + 16, z1); + zero[0] = u; + (void)sp_2048_add_16(r + 48, r + 48, zero); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; } /* Square a and put result in r. 
(r = a * a) @@ -2075,23 +2308,32 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[64]; + sp_digit* z2 = r + 64; sp_digit z1[64]; - sp_digit a1[32]; + sp_digit* a1 = z1; + sp_digit zero[32]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 32); + + mask = sp_2048_sub_32(a1, a, &a[32]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_32(a1, p1, p2); - u = sp_2048_add_32(a1, a, &a[32]); - sp_2048_sqr_32(z1, a1); sp_2048_sqr_32(z2, &a[32]); sp_2048_sqr_32(z0, a); - sp_2048_mask_32(r + 64, a1, 0 - u); - u += sp_2048_add_32(r + 64, r + 64, r + 64); - u += sp_2048_sub_in_place_64(z1, z2); - u += sp_2048_sub_in_place_64(z1, z0); - u += sp_2048_add_64(r + 32, r + 32, z1); - r[96] = u; - XMEMSET(r + 96 + 1, 0, sizeof(sp_digit) * (32 - 1)); - (void)sp_2048_add_64(r + 64, r + 64, z2); + sp_2048_sqr_32(z1, a1); + + u = 0; + u -= sp_2048_sub_in_place_64(z1, z2); + u -= sp_2048_sub_in_place_64(z1, z0); + u += sp_2048_sub_in_place_64(r + 32, z1); + zero[0] = u; + (void)sp_2048_add_32(r + 96, r + 96, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -2948,7 +3190,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_32(r, a, b); @@ -2962,7 +3204,7 @@ static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_32(r, a); @@ -3011,11 +3253,11 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -3130,7 +3372,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_2048_word_32(hi, t1[32 + i - 1], div); @@ -3661,7 +3903,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_64(r, a, b); @@ -3675,7 +3917,7 @@ static void sp_2048_mont_mul_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_64(r, a); @@ -3907,11 +4149,11 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -3979,9 +4221,13 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + for (i = 63; i >= 0; i--) { + if (t1[64 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_2048_word_64(t1[64 + i], t1[64 + i - 1], div); + } sp_2048_mul_d_64(t2, d, r1); t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); @@ -4117,7 +4363,7 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { + for (i = 63; i >= 0; i--) { sp_digit hi = t1[64 + i] - (t1[64 + i] == div); r1 = div_2048_word_64(hi, t1[64 + i - 1], div); @@ -4503,9 +4749,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 64; r = a + 64 * 2; m = r + 64 * 2; - ah = a + 64; sp_2048_from_bin(ah, 64, in, inLen); #if DIGIT_BIT >= 32 @@ -4523,7 +4769,38 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_2048_from_mp(m, 64, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 64); + err = sp_2048_mod_64_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_64(r, r, ah, m, mp); + + for (i = 63; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_2048_sub_in_place_64(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_2048_sqr_64(r, ah); err = sp_2048_mod_64_cond(r, r, m); @@ -4551,7 +4828,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 64); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_2048_mont_sqr_64(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_2048_mont_mul_64(r, r, a, m, mp); @@ -5830,130 +6107,6 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); } -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #96\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #44\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" -#ifdef __GNUC__ - "beq 4f\n\t" -#else - "beq.n 4f\n\t" -#endif /* __GNUC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ -#ifdef __GNUC__ - "bal 5f\n\t" -#else - "bal.n 5f\n\t" -#endif /* __GNUC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #48\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#ifdef __GNUC__ - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ */ - "cmp %[a], r2\n\t" -#ifdef __GNUC__ - "bgt 3f\n\t" -#else - "bgt.n 3f\n\t" -#endif /* __GNUC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" -#ifdef __GNUC__ - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #88\n\t" - "cmp r8, r6\n\t" -#ifdef __GNUC__ - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #92\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#ifdef __GNUC__ - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ */ - "mov r6, #96\n\t" - "add sp, sp, r6\n\t" - : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" - ); -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. 
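Note on the e[0] == 0x10001 branch added to sp_RsaPublic_2048() above: it replaces the generic bit-by-bit exponentiation for the common RSA public exponent 65537 = 2^16 + 1. The input ah is converted to Montgomery form, squared sixteen times, and then Montgomery-multiplied once by the plain (non-Montgomery) ah, which also strips the remaining R factor, followed by one conditional subtraction of the modulus. A minimal sketch of that sequence using the 2048-bit helpers defined in this file; the wrapper name and the precondition that r already holds ah.R mod m are illustrative only, the error handling of the real function is omitted.

/* Sketch only: assumes r = ah.R mod m on entry (as set up by
 * sp_2048_mod_64_cond() in sp_RsaPublic_2048()). */
static void exp_65537_sketch(sp_digit* r, const sp_digit* ah,
    const sp_digit* m, sp_digit mp)
{
    int i;

    /* r = r ^ (2^16): sixteen Montgomery squarings keep one factor of R. */
    for (i = 15; i >= 0; i--) {
        sp_2048_mont_sqr_64(r, r, m, mp);
    }
    /* Multiply by the non-Montgomery base: the reduction inside removes the
     * remaining R, leaving r = ah ^ 0x10001 mod m up to one extra m. */
    sp_2048_mont_mul_64(r, r, ah, m, mp);

    /* Final conditional reduction: subtract m once if r >= m. */
    for (i = 63; i > 0; i--) {
        if (r[i] != m[i]) {
            break;
        }
    }
    if (r[i] >= m[i]) {
        sp_2048_sub_in_place_64(r, m);
    }
}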
@@ -6212,7 +6365,7 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, sp_digit z1[24]; sp_digit a1[12]; sp_digit b1[12]; - sp_digit z2[24]; + sp_digit* z2 = r + 24; sp_digit u; sp_digit ca; sp_digit cb; @@ -6220,45 +6373,22 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, ca = sp_3072_add_12(a1, a, &a[12]); cb = sp_3072_add_12(b1, b, &b[12]); u = ca & cb; - sp_3072_mul_12(z1, a1, b1); + sp_3072_mul_12(z2, &a[12], &b[12]); sp_3072_mul_12(z0, a, b); - sp_3072_mask_12(r + 24, a1, 0 - cb); + sp_3072_mul_12(z1, a1, b1); + + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(z1, z2); + sp_3072_mask_12(a1, a1, 0 - cb); + u += sp_3072_add_12(z1 + 12, z1 + 12, a1); sp_3072_mask_12(b1, b1, 0 - ca); - u += sp_3072_add_12(r + 24, r + 24, b1); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); - u += sp_3072_add_24(r + 12, r + 12, z1); - r[36] = u; - XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); - (void)sp_3072_add_24(r + 24, r + 24, z2); -} + u += sp_3072_add_12(z1 + 12, z1 + 12, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[24]; - sp_digit z1[24]; - sp_digit a1[12]; - sp_digit u; - - u = sp_3072_add_12(a1, a, &a[12]); - sp_3072_sqr_12(z1, a1); - sp_3072_sqr_12(z2, &a[12]); - sp_3072_sqr_12(z0, a); - sp_3072_mask_12(r + 24, a1, 0 - u); - u += sp_3072_add_12(r + 24, r + 24, r + 24); - u += sp_3072_sub_in_place_24(z1, z2); - u += sp_3072_sub_in_place_24(z1, z0); u += sp_3072_add_24(r + 12, r + 12, z1); - r[36] = u; - XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); - (void)sp_3072_add_24(r + 24, r + 24, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (12 - 1)); + a1[0] = u; + (void)sp_3072_add_12(r + 36, r + 36, a1); } /* Sub b from a into r. (r = a - b) @@ -6587,7 +6717,7 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, sp_digit z1[48]; sp_digit a1[24]; sp_digit b1[24]; - sp_digit z2[48]; + sp_digit* z2 = r + 48; sp_digit u; sp_digit ca; sp_digit cb; @@ -6595,45 +6725,22 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, ca = sp_3072_add_24(a1, a, &a[24]); cb = sp_3072_add_24(b1, b, &b[24]); u = ca & cb; - sp_3072_mul_24(z1, a1, b1); + sp_3072_mul_24(z2, &a[24], &b[24]); sp_3072_mul_24(z0, a, b); - sp_3072_mask_24(r + 48, a1, 0 - cb); + sp_3072_mul_24(z1, a1, b1); + + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(z1, z2); + sp_3072_mask_24(a1, a1, 0 - cb); + u += sp_3072_add_24(z1 + 24, z1 + 24, a1); sp_3072_mask_24(b1, b1, 0 - ca); - u += sp_3072_add_24(r + 48, r + 48, b1); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); - u += sp_3072_add_48(r + 24, r + 24, z1); - r[72] = u; - XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); - (void)sp_3072_add_48(r + 48, r + 48, z2); -} + u += sp_3072_add_24(z1 + 24, z1 + 24, b1); -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[48]; - sp_digit z1[48]; - sp_digit a1[24]; - sp_digit u; - - u = sp_3072_add_24(a1, a, &a[24]); - sp_3072_sqr_24(z1, a1); - sp_3072_sqr_24(z2, &a[24]); - sp_3072_sqr_24(z0, a); - sp_3072_mask_24(r + 48, a1, 0 - u); - u += sp_3072_add_24(r + 48, r + 48, r + 48); - u += sp_3072_sub_in_place_48(z1, z2); - u += sp_3072_sub_in_place_48(z1, z0); u += sp_3072_add_48(r + 24, r + 24, z1); - r[72] = u; - XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); - (void)sp_3072_add_48(r + 48, r + 48, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (24 - 1)); + a1[0] = u; + (void)sp_3072_add_24(r + 72, r + 72, a1); } /* Sub b from a into r. (r = a - b) @@ -7202,7 +7309,7 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, sp_digit z1[96]; sp_digit a1[48]; sp_digit b1[48]; - sp_digit z2[96]; + sp_digit* z2 = r + 96; sp_digit u; sp_digit ca; sp_digit cb; @@ -7210,18 +7317,491 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, ca = sp_3072_add_48(a1, a, &a[48]); cb = sp_3072_add_48(b1, b, &b[48]); u = ca & cb; - sp_3072_mul_48(z1, a1, b1); + sp_3072_mul_48(z2, &a[48], &b[48]); sp_3072_mul_48(z0, a, b); - sp_3072_mask_48(r + 96, a1, 0 - cb); - sp_3072_mask_48(b1, b1, 0 - ca); - u += sp_3072_add_48(r + 96, r + 96, b1); - u += sp_3072_sub_in_place_96(z1, z2); + sp_3072_mul_48(z1, a1, b1); + u += sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(z1, z2); + sp_3072_mask_48(a1, a1, 0 - cb); + u += sp_3072_add_48(z1 + 48, z1 + 48, a1); + sp_3072_mask_48(b1, b1, 0 - ca); + u += sp_3072_add_48(z1 + 48, z1 + 48, b1); + u += sp_3072_add_96(r + 48, r + 48, z1); - r[144] = u; - XMEMSET(r + 144 + 1, 0, sizeof(sp_digit) * (48 - 1)); - (void)sp_3072_add_96(r + 96, r + 96, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (48 - 1)); + a1[0] = u; + (void)sp_3072_add_48(r + 144, r + 144, a1); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) +{ + __asm__ __volatile__ ( + "mov r3, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r9, r3\n\t" + "mov r12, %[r]\n\t" + "mov r6, #96\n\t" + "neg r6, r6\n\t" + "add sp, sp, r6\n\t" + "mov r11, sp\n\t" + "mov r10, %[a]\n\t" + "\n1:\n\t" + "mov %[r], #0\n\t" + "mov r6, #44\n\t" + "mov %[a], r9\n\t" + "subs %[a], %[a], r6\n\t" + "sbc r6, r6, r6\n\t" + "mvn r6, r6\n\t" + "and %[a], %[a], r6\n\t" + "mov r2, r9\n\t" + "sub r2, r2, %[a]\n\t" + "add %[a], %[a], r10\n\t" + "add r2, r2, r10\n\t" + "\n2:\n\t" + "cmp r2, %[a]\n\t" +#ifdef __GNUC__ + "beq 4f\n\t" +#else + "beq.n 4f\n\t" +#endif /* __GNUC__ */ + /* Multiply * 2: Start */ + "ldr r6, [%[a]]\n\t" + "ldr r8, [r2]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, %[r]\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, %[r]\n\t" + /* Multiply * 2: Done */ +#ifdef __GNUC__ + "bal 5f\n\t" +#else + "bal.n 5f\n\t" +#endif /* __GNUC__ */ + "\n4:\n\t" + /* Square: Start */ + "ldr r6, [%[a]]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, %[r]\n\t" + /* Square: Done */ + "\n5:\n\t" + "add %[a], %[a], #4\n\t" + "sub r2, r2, #4\n\t" + "mov r6, #48\n\t" + "add r6, r6, r10\n\t" + "cmp %[a], r6\n\t" +#ifdef __GNUC__ + "beq 3f\n\t" +#else + "beq.n 3f\n\t" +#endif /* __GNUC__ */ + "cmp %[a], r2\n\t" +#ifdef __GNUC__ + "bgt 3f\n\t" +#else + "bgt.n 3f\n\t" +#endif /* __GNUC__ */ + "mov r8, r9\n\t" + "add r8, r8, r10\n\t" + "cmp %[a], r8\n\t" +#ifdef __GNUC__ + "ble 2b\n\t" +#else + "ble.n 2b\n\t" +#endif /* __GNUC__ */ + "\n3:\n\t" + "mov %[r], r11\n\t" + "mov r8, r9\n\t" + "str r3, [%[r], r8]\n\t" + "mov r3, r4\n\t" + "mov r4, r5\n\t" + "mov r5, #0\n\t" + "add r8, r8, #4\n\t" + "mov r9, r8\n\t" + "mov r6, #88\n\t" + "cmp r8, r6\n\t" +#ifdef __GNUC__ + "ble 1b\n\t" +#else + "ble.n 1b\n\t" +#endif /* __GNUC__ */ + "mov %[a], r10\n\t" + "str r3, [%[r], r8]\n\t" + "mov %[r], r12\n\t" + "mov %[a], r11\n\t" + "mov r3, #92\n\t" + "\n4:\n\t" + "ldr r6, [%[a], r3]\n\t" + "str r6, [%[r], r3]\n\t" + "subs r3, r3, #4\n\t" +#ifdef __GNUC__ + "bge 4b\n\t" +#else + "bge.n 4b\n\t" +#endif /* __GNUC__ */ + "mov r6, #96\n\t" + "add sp, sp, r6\n\t" + : + : [r] "r" (r), [a] "r" (a) + : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + ); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 24; + sp_digit z1[24]; + sp_digit* a1 = z1; + sp_digit zero[12]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 12); + + mask = sp_3072_sub_12(a1, a, &a[12]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_12(a1, p1, p2); + + sp_3072_sqr_12(z2, &a[12]); + sp_3072_sqr_12(z0, a); + sp_3072_sqr_12(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_24(z1, z2); + u -= sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_sub_in_place_24(r + 12, z1); + zero[0] = u; + (void)sp_3072_add_12(r + 36, r + 36, zero); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit* z2 = r + 48; + sp_digit z1[48]; + sp_digit* a1 = z1; + sp_digit zero[24]; + sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 24); + + mask = sp_3072_sub_24(a1, a, &a[24]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_24(a1, p1, p2); + + sp_3072_sqr_24(z2, &a[24]); + sp_3072_sqr_24(z0, a); + sp_3072_sqr_24(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_48(z1, z2); + u -= sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_sub_in_place_48(r + 24, z1); + zero[0] = u; + (void)sp_3072_add_24(r + 72, r + 72, zero); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; } /* Square a and put result in r. 
(r = a * a) @@ -7232,23 +7812,32 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[96]; + sp_digit* z2 = r + 96; sp_digit z1[96]; - sp_digit a1[48]; + sp_digit* a1 = z1; + sp_digit zero[48]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 48); + + mask = sp_3072_sub_48(a1, a, &a[48]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_3072_sub_48(a1, p1, p2); - u = sp_3072_add_48(a1, a, &a[48]); - sp_3072_sqr_48(z1, a1); sp_3072_sqr_48(z2, &a[48]); sp_3072_sqr_48(z0, a); - sp_3072_mask_48(r + 96, a1, 0 - u); - u += sp_3072_add_48(r + 96, r + 96, r + 96); - u += sp_3072_sub_in_place_96(z1, z2); - u += sp_3072_sub_in_place_96(z1, z0); - u += sp_3072_add_96(r + 48, r + 48, z1); - r[144] = u; - XMEMSET(r + 144 + 1, 0, sizeof(sp_digit) * (48 - 1)); - (void)sp_3072_add_96(r + 96, r + 96, z2); + sp_3072_sqr_48(z1, a1); + + u = 0; + u -= sp_3072_sub_in_place_96(z1, z2); + u -= sp_3072_sub_in_place_96(z1, z0); + u += sp_3072_sub_in_place_96(r + 48, z1); + zero[0] = u; + (void)sp_3072_add_48(r + 144, r + 144, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -8119,7 +8708,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_48(r, a, b); @@ -8133,7 +8722,7 @@ static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_48(r, a); @@ -8182,11 +8771,11 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -8301,7 +8890,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { + for (i = 47; i >= 0; i--) { sp_digit hi = t1[48 + i] - (t1[48 + i] == div); r1 = div_3072_word_48(hi, t1[48 + i - 1], div); @@ -8833,7 +9422,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_96(r, a, b); @@ -8847,7 +9436,7 @@ static void sp_3072_mont_mul_96(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_96(r, a); @@ -9160,11 +9749,11 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -9232,9 +9821,13 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + for (i = 95; i >= 0; i--) { + if (t1[96 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_3072_word_96(t1[96 + i], t1[96 + i - 1], div); + } sp_3072_mul_d_96(t2, d, r1); t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); @@ -9372,7 +9965,7 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { + for (i = 95; i >= 0; i--) { sp_digit hi = t1[96 + i] - (t1[96 + i] == div); r1 = div_3072_word_96(hi, t1[96 + i - 1], div); @@ -9758,9 +10351,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 96; r = a + 96 * 2; m = r + 96 * 2; - ah = a + 96; sp_3072_from_bin(ah, 96, in, inLen); #if DIGIT_BIT >= 32 @@ -9778,7 +10371,38 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_3072_from_mp(m, 96, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + + /* Convert to Montgomery form. 
*/ + XMEMSET(a, 0, sizeof(sp_digit) * 96); + err = sp_3072_mod_96_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_96(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_96(r, r, ah, m, mp); + + for (i = 95; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_3072_sub_in_place_96(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_3072_sqr_96(r, ah); err = sp_3072_mod_96_cond(r, r, m); @@ -9806,7 +10430,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 96); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_3072_mont_sqr_96(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_3072_mont_mul_96(r, r, a, m, mp); @@ -11892,7 +12516,7 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, sp_digit z1[128]; sp_digit a1[64]; sp_digit b1[64]; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit u; sp_digit ca; sp_digit cb; @@ -11900,18 +12524,22 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, ca = sp_2048_add_64(a1, a, &a[64]); cb = sp_2048_add_64(b1, b, &b[64]); u = ca & cb; - sp_2048_mul_64(z1, a1, b1); + sp_2048_mul_64(z2, &a[64], &b[64]); sp_2048_mul_64(z0, a, b); - sp_2048_mask_64(r + 128, a1, 0 - cb); - sp_2048_mask_64(b1, b1, 0 - ca); - u += sp_2048_add_64(r + 128, r + 128, b1); - u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mul_64(z1, a1, b1); + u += sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(z1, z2); + sp_2048_mask_64(a1, a1, 0 - cb); + u += sp_2048_add_64(z1 + 64, z1 + 64, a1); + sp_2048_mask_64(b1, b1, 0 - ca); + u += sp_2048_add_64(z1 + 64, z1 + 64, b1); + u += sp_4096_add_128(r + 64, r + 64, z1); - r[192] = u; - XMEMSET(r + 192 + 1, 0, sizeof(sp_digit) * (64 - 1)); - (void)sp_4096_add_128(r + 128, r + 128, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (64 - 1)); + a1[0] = u; + (void)sp_4096_add_64(r + 192, r + 192, a1); } /* Square a and put result in r. 
(r = a * a) @@ -11922,23 +12550,32 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[128]; + sp_digit* z2 = r + 128; sp_digit z1[128]; - sp_digit a1[64]; + sp_digit* a1 = z1; + sp_digit zero[64]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 64); + + mask = sp_2048_sub_64(a1, a, &a[64]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_2048_sub_64(a1, p1, p2); - u = sp_2048_add_64(a1, a, &a[64]); - sp_2048_sqr_64(z1, a1); sp_2048_sqr_64(z2, &a[64]); sp_2048_sqr_64(z0, a); - sp_2048_mask_64(r + 128, a1, 0 - u); - u += sp_2048_add_64(r + 128, r + 128, r + 128); - u += sp_4096_sub_in_place_128(z1, z2); - u += sp_4096_sub_in_place_128(z1, z0); - u += sp_4096_add_128(r + 64, r + 64, z1); - r[192] = u; - XMEMSET(r + 192 + 1, 0, sizeof(sp_digit) * (64 - 1)); - (void)sp_4096_add_128(r + 128, r + 128, z2); + sp_2048_sqr_64(z1, a1); + + u = 0; + u -= sp_4096_sub_in_place_128(z1, z2); + u -= sp_4096_sub_in_place_128(z1, z0); + u += sp_4096_sub_in_place_128(r + 64, z1); + zero[0] = u; + (void)sp_2048_add_64(r + 192, r + 192, zero); } #endif /* !WOLFSSL_SP_SMALL */ @@ -12483,7 +13120,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_128(r, a, b); @@ -12497,7 +13134,7 @@ static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_128(r, a); @@ -12889,11 +13526,11 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_SP_SMALL */ -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
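Note on the reworked Karatsuba squaring above (sp_2048_sqr_16/32/64 through sp_4096_sqr_128): instead of forming a_lo + a_hi with a carry, the new code squares the constant-time absolute difference |a_lo - a_hi|, writes a_hi^2 straight into the top half of r, and recovers the cross term via 2*a_lo*a_hi = a_lo^2 + a_hi^2 - (a_lo - a_hi)^2: z0 and z2 are subtracted from z1 in place and the result is then subtracted from the middle of r, removing one temporary and the final wide addition. The absolute difference itself is selected without a data-dependent branch. A sketch of that selection follows; sub_n() is a placeholder for the concrete sp_<size>_sub_<n> helper (returns 0 on no borrow, all-ones on borrow), zero points at enough zeroed digits, and the pointer-to-integer select relies on sp_digit being pointer sized, as on the 32-bit targets in this file.

/* Branch-free d = |lo - hi|, mirroring the p1/p2 selection used above. */
static void abs_diff_sketch(sp_digit* d, const sp_digit* lo,
    const sp_digit* hi, const sp_digit* zero)
{
    sp_digit mask;
    const sp_digit* p1;
    const sp_digit* p2;

    mask = sub_n(d, lo, hi);   /* d = lo - hi; mask = borrow ? ~0 : 0 */
    /* No borrow: p1 = d, p2 = zero. Borrow: p1 = zero, p2 = d. */
    p1 = (const sp_digit*)(((sp_digit)zero &  mask) | ((sp_digit)d & ~mask));
    p2 = (const sp_digit*)(((sp_digit)zero & ~mask) | ((sp_digit)d &  mask));
    (void)sub_n(d, p1, p2);    /* d = |lo - hi| without a secret-dependent branch */
}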
@@ -12961,9 +13598,13 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + for (i = 127; i >= 0; i--) { + if (t1[128 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_4096_word_128(t1[128 + i], t1[128 + i - 1], div); + } sp_4096_mul_d_128(t2, d, r1); t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); @@ -13101,7 +13742,7 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di div = d[127]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { + for (i = 127; i >= 0; i--) { sp_digit hi = t1[128 + i] - (t1[128 + i] == div); r1 = div_4096_word_128(hi, t1[128 + i - 1], div); @@ -13487,9 +14128,9 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { + ah = a + 128; r = a + 128 * 2; m = r + 128 * 2; - ah = a + 128; sp_4096_from_bin(ah, 128, in, inLen); #if DIGIT_BIT >= 32 @@ -13507,7 +14148,38 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_4096_from_mp(m, 128, mm); - if (e[0] == 0x3) { + if (e[0] == 0x10001) { + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 128); + err = sp_4096_mod_128_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_128(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_128(r, r, ah, m, mp); + + for (i = 127; i > 0; i--) { + if (r[i] != m[i]) { + break; + } + } + if (r[i] >= m[i]) { + sp_4096_sub_in_place_128(r, m); + } + } + } + else if (e[0] == 0x3) { if (err == MP_OKAY) { sp_4096_sqr_128(r, ah); err = sp_4096_mod_128_cond(r, r, m); @@ -13535,7 +14207,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } XMEMCPY(r, a, sizeof(sp_digit) * 128); - for (i--; i>=0; i--) { + for (i--; i >= 0; i--) { sp_4096_mont_sqr_128(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { sp_4096_mont_mul_128(r, r, a, m, mp); @@ -22329,11 +23001,11 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -22427,7 +23099,7 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit div = d[7]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 8); - for (i=7; i>=0; i--) { + for (i = 7; i >= 0; i--) { sp_digit hi = t1[8 + i] - (t1[8 + i] == div); r1 = div_256_word_8(hi, t1[8 + i - 1], div); @@ -25148,7 +25820,7 @@ SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_12(r, a, b); @@ -25162,7 +25834,7 @@ static void sp_384_mont_mul_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_12(r, a); @@ -29507,11 +30179,11 @@ SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -29609,7 +30281,7 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi div = d[11]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 12); - for (i=11; i>=0; i--) { + for (i = 11; i >= 0; i--) { sp_digit hi = t1[12 + i] - (t1[12 + i] == div); r1 = div_384_word_12(hi, t1[12 + i - 1], div); @@ -32514,7 +33186,7 @@ SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_521_mul_17(r, a, b); @@ -32528,7 +33200,7 @@ static void sp_521_mont_mul_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_521_sqr_17(r, a); @@ -38265,11 +38937,11 @@ SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. 
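Note on the division changes in this patch: the word-division comments keep the caveat that div_<bits>_word_<n>() is only approximate and may return a digit one too large, which the rest of each loop corrects. The public-key (_cond) loops additionally clamp the estimate to SP_DIGIT_MAX when the two top words are equal, since the true quotient digit cannot exceed a single digit, while the constant-time loops keep the branch-free adjustment of the high word. A hedged sketch of one quotient-digit step; div_word(), mul_d(), sub_in_place() and the digit count n are placeholders for the size-specific helpers, not functions defined in this file, and the add-back correction that completes the loop is omitted.

/* One estimation step as used by the _cond division loops. */
static void div_digit_step_sketch(sp_digit* t1, const sp_digit* d,
    sp_digit* t2, sp_digit div, int n, int i)
{
    sp_digit r1;

    if (t1[n + i] == div) {
        /* The estimate would overflow a digit: clamp to the largest digit. */
        r1 = SP_DIGIT_MAX;
    }
    else {
        /* Approximate quotient digit; may be one too large and is fixed by
         * the correction that follows in the full loop (not shown). */
        r1 = div_word(t1[n + i], t1[n + i - 1], div);
    }

    mul_d(t2, d, r1);                        /* t2 = d * r1              */
    t1[n + i] += sub_in_place(&t1[i], t2);   /* t1 -= t2 shifted i digits */

    /* The constant-time variants use the branch-free form instead:
     *   sp_digit hi = t1[n + i] - (t1[n + i] == div);
     *   r1 = div_word(hi, t1[n + i - 1], div);
     */
}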
@@ -38377,7 +39049,7 @@ static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digi sp_521_lshift_17(sd, d, 23); sp_521_lshift_34(t1, t1, 23); - for (i=16; i>=0; i--) { + for (i = 16; i >= 0; i--) { sp_digit hi = t1[17 + i] - (t1[17 + i] == div); r1 = div_521_word_17(hi, t1[17 + i - 1], div); @@ -40824,7 +41496,7 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, sp_digit z1[32]; sp_digit a1[16]; sp_digit b1[16]; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit u; sp_digit ca; sp_digit cb; @@ -40832,18 +41504,83 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, ca = sp_1024_add_16(a1, a, &a[16]); cb = sp_1024_add_16(b1, b, &b[16]); u = ca & cb; - sp_1024_mul_16(z1, a1, b1); + sp_1024_mul_16(z2, &a[16], &b[16]); sp_1024_mul_16(z0, a, b); - sp_1024_mask_16(r + 32, a1, 0 - cb); - sp_1024_mask_16(b1, b1, 0 - ca); - u += sp_1024_add_16(r + 32, r + 32, b1); - u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mul_16(z1, a1, b1); + u += sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(z1, z2); + sp_1024_mask_16(a1, a1, 0 - cb); + u += sp_1024_add_16(z1 + 16, z1 + 16, a1); + sp_1024_mask_16(b1, b1, 0 - ca); + u += sp_1024_add_16(z1 + 16, z1 + 16, b1); + u += sp_1024_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_1024_add_32(r + 32, r + 32, z2); + XMEMSET(a1 + 1, 0, sizeof(sp_digit) * (16 - 1)); + a1[0] = u; + (void)sp_1024_add_16(r + 48, r + 48, a1); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "ldm %[a]!, {r4, r5}\n\t" + "ldm %[b]!, {r6, r8}\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "stm %[r]!, {r4, r5}\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; } /* Square a and put result in r. 
(r = a * a) @@ -40854,23 +41591,32 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z2[32]; + sp_digit* z2 = r + 32; sp_digit z1[32]; - sp_digit a1[16]; + sp_digit* a1 = z1; + sp_digit zero[16]; sp_digit u; + sp_digit mask; + sp_digit* p1; + sp_digit* p2; + + XMEMSET(zero, 0, sizeof(sp_digit) * 16); + + mask = sp_1024_sub_16(a1, a, &a[16]); + p1 = (sp_digit*)(((sp_digit)zero & mask ) | ((sp_digit)a1 & (~mask))); + p2 = (sp_digit*)(((sp_digit)zero & (~mask)) | ((sp_digit)a1 & mask )); + (void)sp_1024_sub_16(a1, p1, p2); - u = sp_1024_add_16(a1, a, &a[16]); - sp_1024_sqr_16(z1, a1); sp_1024_sqr_16(z2, &a[16]); sp_1024_sqr_16(z0, a); - sp_1024_mask_16(r + 32, a1, 0 - u); - u += sp_1024_add_16(r + 32, r + 32, r + 32); - u += sp_1024_sub_in_place_32(z1, z2); - u += sp_1024_sub_in_place_32(z1, z0); - u += sp_1024_add_32(r + 16, r + 16, z1); - r[48] = u; - XMEMSET(r + 48 + 1, 0, sizeof(sp_digit) * (16 - 1)); - (void)sp_1024_add_32(r + 32, r + 32, z2); + sp_1024_sqr_16(z1, a1); + + u = 0; + u -= sp_1024_sub_in_place_32(z1, z2); + u -= sp_1024_sub_in_place_32(z1, z0); + u += sp_1024_sub_in_place_32(r + 16, z1); + zero[0] = u; + (void)sp_1024_add_16(r + 48, r + 48, zero); } #else @@ -41339,11 +42085,11 @@ SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, ); } -/* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. - * div The dividend. + * div The divisor. * returns the result of the division. * * Note that this is an approximate div. It may give an answer 1 larger. @@ -41488,7 +42234,7 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_1024_word_32(hi, t1[32 + i - 1], div); @@ -41902,7 +42648,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_32(r, a, b); @@ -41916,7 +42662,7 @@ static void sp_1024_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_32(r, a); diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 8826657b0..57c4f9fff 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -48,19 +48,19 @@ #include #ifdef WOLFSSL_SP_X86_64_ASM -#define SP_PRINT_NUM(var, name, total, words, bits) \ - do { \ - int ii; \ - fprintf(stderr, name "=0x"); \ - for (ii = words - 1; ii >= 0; ii--) \ - fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ - fprintf(stderr, "\n"); \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + fprintf(stderr, name "=0x"); \ + for (ii = ((bits + 63) / 64) - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ } while (0) -#define SP_PRINT_VAL(var, name) \ +#define SP_PRINT_VAL(var, name) \ fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) -#define SP_PRINT_INT(var, name) \ +#define SP_PRINT_INT(var, name) \ fprintf(stderr, name "=%d\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) @@ -212,19 +212,19 @@ static void sp_2048_to_bin_32(sp_digit* r, byte* a) #define sp_2048_norm_32(a) extern void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern void sp_2048_sqr_16(sp_digit* r, const sp_digit* a); extern void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a); extern sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b); extern sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b); extern sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b); +extern void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* b); + +extern void sp_2048_sqr_16(sp_digit* r, const sp_digit* a); +extern void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a); extern sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a); extern void sp_2048_sqr_32(sp_digit* r, const sp_digit* a); -extern void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* b); - extern void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a); #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -281,7 +281,7 @@ extern void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_16(r, a, b); @@ -295,7 +295,7 @@ static void sp_2048_mont_mul_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_16(r, a); @@ -413,7 +413,7 @@ static WC_INLINE int sp_2048_div_16(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_2048_cond_sub_16(&t1[16], &t1[16], d, (sp_digit)0 - r1); - for (i=15; i>=0; i--) { + for (i = 15; i >= 0; i--) { sp_digit hi = t1[16 + i] - (t1[16 + i] == div); r1 = div_2048_word_16(hi, t1[16 + i - 1], div); @@ -658,7 +658,7 @@ extern void sp_2048_mont_reduce_avx2_16(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_avx2_16(r, a, b); @@ -674,7 +674,7 @@ static void sp_2048_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_avx2_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_avx2_16(r, a); @@ -906,7 +906,7 @@ extern void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_32(r, a, b); @@ -920,7 +920,7 @@ static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_32(r, a); @@ -1006,9 +1006,13 @@ static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, s if (t1[i + 32] >= d[i]) { sp_2048_sub_in_place_32(&t1[32], d); } - for (i=31; i>=0; i--) { - sp_digit hi = t1[32 + i] - (t1[32 + i] == div); - r1 = div_2048_word_32(hi, t1[32 + i - 1], div); + for (i = 31; i >= 0; i--) { + if (t1[32 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_2048_word_32(t1[32 + i], t1[32 + i - 1], div); + } #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) @@ -1120,7 +1124,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_2048_cond_sub_32(&t1[32], &t1[32], d, (sp_digit)0 - r1); - for (i=31; i>=0; i--) { + for (i = 31; i >= 0; i--) { sp_digit hi = t1[32 + i] - (t1[32 + i] == div); r1 = div_2048_word_32(hi, t1[32 + i - 1], div); @@ -1350,7 +1354,7 @@ extern void sp_2048_mont_reduce_avx2_32(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_2048_mont_mul_avx2_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_mul_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_2048_mul_avx2_32(r, a, b); @@ -1366,7 +1370,7 @@ static void sp_2048_mont_mul_avx2_32(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
*/ -static void sp_2048_mont_sqr_avx2_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mont_sqr_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_2048_sqr_avx2_32(r, a); @@ -1622,7 +1626,50 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_2048_from_mp(m, 32, mm); - if (e == 0x3) { + if (e == 0x10001) { + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 32); + err = sp_2048_mod_32_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_avx2_32(r, r, ah, m, mp); + } + else +#endif + { + for (i = 15; i >= 0; i--) { + sp_2048_mont_sqr_32(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_2048_mont_mul_32(r, r, ah, m, mp); + } + + for (i = 31; i > 0; i--) { + if (r[i] != m[i]) + break; + } + if (r[i] >= m[i]) + sp_2048_sub_in_place_32(r, m); + } + } + else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { if (err == MP_OKAY) { @@ -2751,30 +2798,30 @@ static void sp_3072_to_bin_48(sp_digit* r, byte* a) #define sp_3072_norm_48(a) extern void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern void sp_3072_sqr_12(sp_digit* r, const sp_digit* a); extern void sp_3072_mul_avx2_12(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern void sp_3072_sqr_avx2_12(sp_digit* r, const sp_digit* a); extern sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b); extern sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b); extern sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a); -extern void sp_3072_sqr_24(sp_digit* r, const sp_digit* a); - extern void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a); - extern sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b); extern sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b); +extern void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* b); + +extern void sp_3072_sqr_12(sp_digit* r, const sp_digit* a); +extern void sp_3072_sqr_avx2_12(sp_digit* r, const sp_digit* a); +extern sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a); +extern void sp_3072_sqr_24(sp_digit* r, const sp_digit* a); + +extern void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a); + extern sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_48(sp_digit* r, const sp_digit* a); -extern void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* b); - extern void sp_3072_sqr_avx2_48(sp_digit* r, const sp_digit* a); #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -2830,7 +2877,7 @@ 
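The new sp_RsaPublic_2048 fast path above handles e == 0x10001 (65537 = 2^16 + 1) directly: the base is put into Montgomery form, squared sixteen times, and then multiplied once by the plain (non-Montgomery) base, which supplies the final +1 in the exponent and cancels the R factor in the same reduction, exactly as the in-line comments derive. A condensed C rendering of that flow; the sp_2048_* names are the helpers used in the hunk, ah is the 32-word input (in the surrounding function it sits in the top half of the 64-word work buffer a, so zeroing the low half makes a == ah * R with R = 2^2048), and the glue here is illustrative, not the exact in-tree code:

static int rsa_public_65537_2048(sp_digit* r, sp_digit* a, const sp_digit* ah,
                                 const sp_digit* m)
{
    sp_digit mp;
    int i;
    int err;

    sp_2048_mont_setup(m, &mp);              /* mp = -1/m mod 2^64        */
    XMEMSET(a, 0, sizeof(sp_digit) * 32);    /* a = ah * R                */
    err = sp_2048_mod_32_cond(r, a, m);      /* r = ah * R mod m          */
    if (err == MP_OKAY) {
        for (i = 15; i >= 0; i--)            /* r = ah^(2^16) * R mod m   */
            sp_2048_mont_sqr_32(r, r, m, mp);
        /* One multiply by plain ah: mont_red(r * ah) = ah^0x10001 mod m,
         * with the Montgomery form removed in the same step. */
        sp_2048_mont_mul_32(r, r, ah, m, mp);

        for (i = 31; i > 0; i--)             /* at most one final subtract */
            if (r[i] != m[i])
                break;
        if (r[i] >= m[i])
            sp_2048_sub_in_place_32(r, m);
    }
    return err;
}

Relative to sending 0x10001 through the generic exponentiation path, this costs only sixteen squarings and a single multiply.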
extern void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_24(r, a, b); @@ -2844,7 +2891,7 @@ static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_24(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_24(r, a); @@ -2962,7 +3009,7 @@ static WC_INLINE int sp_3072_div_24(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_3072_cond_sub_24(&t1[24], &t1[24], d, (sp_digit)0 - r1); - for (i=23; i>=0; i--) { + for (i = 23; i >= 0; i--) { sp_digit hi = t1[24 + i] - (t1[24 + i] == div); r1 = div_3072_word_24(hi, t1[24 + i - 1], div); @@ -3207,7 +3254,7 @@ extern void sp_3072_mont_reduce_avx2_24(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_avx2_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_avx2_24(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_avx2_24(r, a, b); @@ -3223,7 +3270,7 @@ static void sp_3072_mont_mul_avx2_24(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_avx2_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_avx2_24(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_avx2_24(r, a); @@ -3455,7 +3502,7 @@ extern void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_48(r, a, b); @@ -3469,7 +3516,7 @@ static void sp_3072_mont_mul_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_48(r, a); @@ -3555,9 +3602,13 @@ static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, s if (t1[i + 48] >= d[i]) { sp_3072_sub_in_place_48(&t1[48], d); } - for (i=47; i>=0; i--) { - sp_digit hi = t1[48 + i] - (t1[48 + i] == div); - r1 = div_3072_word_48(hi, t1[48 + i - 1], div); + for (i = 47; i >= 0; i--) { + if (t1[48 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_3072_word_48(t1[48 + i], t1[48 + i - 1], div); + } #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) @@ -3669,7 +3720,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1); - for (i=47; i>=0; i--) { + for (i = 47; i >= 0; i--) { sp_digit hi = t1[48 + i] - (t1[48 + i] == div); r1 = div_3072_word_48(hi, t1[48 + i - 1], div); @@ -3899,7 +3950,7 @@ extern void sp_3072_mont_reduce_avx2_48(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. 
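Every sp_*_mont_mul_* / sp_*_mont_sqr_* wrapper touched in this file has the same two-step shape, now marked SP_NOINLINE so each wrapper is emitted once rather than being expanded at every call site in the exponentiation loops (the patch does not state the rationale; that is the apparent intent). The shape, annotated:

/* r = a * b mod m in Montgomery form: full double-width product,
 * then one Montgomery reduction back to single width. */
SP_NOINLINE static void sp_3072_mont_mul_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_3072_mul_24(r, a, b);           /* 48-word product             */
    sp_3072_mont_reduce_24(r, m, mp);  /* divide by R = 2^1536 mod m  */
}

The squaring wrappers are identical except that they call the dedicated sp_*_sqr_* routine instead of the multiply.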
*/ -static void sp_3072_mont_mul_avx2_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_mul_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_3072_mul_avx2_48(r, a, b); @@ -3915,7 +3966,7 @@ static void sp_3072_mont_mul_avx2_48(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_3072_mont_sqr_avx2_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mont_sqr_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_3072_sqr_avx2_48(r, a); @@ -4171,7 +4222,50 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_3072_from_mp(m, 48, mm); - if (e == 0x3) { + if (e == 0x10001) { + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 48); + err = sp_3072_mod_48_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_avx2_48(r, r, ah, m, mp); + } + else +#endif + { + for (i = 15; i >= 0; i--) { + sp_3072_mont_sqr_48(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_3072_mont_mul_48(r, r, ah, m, mp); + } + + for (i = 47; i > 0; i--) { + if (r[i] != m[i]) + break; + } + if (r[i] >= m[i]) + sp_3072_sub_in_place_48(r, m); + } + } + else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { if (err == MP_OKAY) { @@ -5303,11 +5397,11 @@ extern sp_digit sp_4096_sub_in_place_64(sp_digit* a, const sp_digit* b); extern sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_4096_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b); +extern void sp_4096_mul_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b); + extern sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a); extern void sp_4096_sqr_64(sp_digit* r, const sp_digit* a); -extern void sp_4096_mul_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b); - extern void sp_4096_sqr_avx2_64(sp_digit* r, const sp_digit* a); /* Caclulate the bottom digit of -1/a mod 2^n. @@ -5361,7 +5455,7 @@ extern void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_64(r, a, b); @@ -5375,7 +5469,7 @@ static void sp_4096_mont_mul_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
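After the final Montgomery multiply in the 0x10001 path the result is known to be below 2m, so instead of a full reduction the code scans from the most significant word for the first place r and m differ and subtracts m at most once. The same compare-from-the-top loop appears in the 2048-, 3072- and 4096-bit versions; here it is in portable C, with the word-wise subtraction written out in place of the sp_*_sub_in_place_* assembly:

#include <stdint.h>

typedef uint64_t sp_digit;

/* Reduce r from [0, 2m) to [0, m): find the highest word where r and m
 * differ, and subtract m once if r >= m. */
static void cond_sub_mod(sp_digit* r, const sp_digit* m, int n)
{
    sp_digit borrow = 0;
    int i;

    for (i = n - 1; i > 0; i--) {
        if (r[i] != m[i])
            break;
    }
    if (r[i] >= m[i]) {                      /* then r >= m as a whole */
        for (i = 0; i < n; i++) {
            sp_digit d = r[i] - m[i];
            sp_digit b = (sp_digit)(r[i] < m[i]);
            r[i] = d - borrow;
            borrow = b | (sp_digit)(d < borrow);
        }
    }
}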
*/ -static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_64(r, a); @@ -5461,9 +5555,13 @@ static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, s if (t1[i + 64] >= d[i]) { sp_4096_sub_in_place_64(&t1[64], d); } - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_4096_word_64(hi, t1[64 + i - 1], div); + for (i = 63; i >= 0; i--) { + if (t1[64 + i] == div) { + r1 = SP_DIGIT_MAX; + } + else { + r1 = div_4096_word_64(t1[64 + i], t1[64 + i - 1], div); + } #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) @@ -5575,7 +5673,7 @@ static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_4096_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1); - for (i=63; i>=0; i--) { + for (i = 63; i >= 0; i--) { sp_digit hi = t1[64 + i] - (t1[64 + i] == div); r1 = div_4096_word_64(hi, t1[64 + i - 1], div); @@ -5805,7 +5903,7 @@ extern void sp_4096_mont_reduce_avx2_64(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_mul_avx2_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_mul_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_4096_mul_avx2_64(r, a, b); @@ -5821,7 +5919,7 @@ static void sp_4096_mont_mul_avx2_64(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_4096_mont_sqr_avx2_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mont_sqr_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_4096_sqr_avx2_64(r, a); @@ -6077,7 +6175,50 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { sp_4096_from_mp(m, 64, mm); - if (e == 0x3) { + if (e == 0x10001) { + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + + /* Convert to Montgomery form. */ + XMEMSET(a, 0, sizeof(sp_digit) * 64); + err = sp_4096_mod_64_cond(r, a, m); + /* Montgomery form: r = a.R mod m */ + + if (err == MP_OKAY) { + /* r = a ^ 0x10000 => r = a squared 16 times */ +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_avx2_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_avx2_64(r, r, ah, m, mp); + } + else +#endif + { + for (i = 15; i >= 0; i--) { + sp_4096_mont_sqr_64(r, r, m, mp); + } + /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m + * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m + */ + sp_4096_mont_mul_64(r, r, ah, m, mp); + } + + for (i = 63; i > 0; i--) { + if (r[i] != m[i]) + break; + } + if (r[i] >= m[i]) + sp_4096_sub_in_place_64(r, m); + } + } + else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) { if (err == MP_OKAY) { @@ -23667,7 +23808,7 @@ static WC_INLINE int sp_256_div_4(const sp_digit* a, const sp_digit* d, sp_digit else #endif sp_256_cond_sub_4(&t1[4], &t1[4], d, (sp_digit)0 - r1); - for (i=3; i>=0; i--) { + for (i = 3; i >= 0; i--) { sp_digit hi = t1[4 + i] - (t1[4 + i] == div); r1 = div_256_word_4(hi, t1[4 + i - 1], div); @@ -25984,7 +26125,7 @@ extern void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). 
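The "bottom digit of -1/a mod 2^n" mentioned in the comment above is the Montgomery multiplier mp passed to every mont_mul/mont_sqr/mont_reduce call in these hunks. The patch does not show sp_4096_mont_setup itself (it is assembly elsewhere in the file), but the quantity it produces can be computed with the standard Hensel/Newton word inverse; the sketch below is that textbook method, not the in-tree code:

#include <stdint.h>

/* mp = -1/m0 mod 2^64 for an odd low modulus word m0.  Starting from
 * x = m0 (a correct inverse of m0 mod 8, since odd^2 == 1 mod 8), each
 * step x *= 2 - m0*x doubles the number of correct low bits. */
static uint64_t mont_multiplier(uint64_t m0)
{
    uint64_t x = m0;
    int i;

    for (i = 0; i < 5; i++)      /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        x *= 2 - m0 * x;
    return (uint64_t)0 - x;      /* negate: m0 * mp == -1 mod 2^64      */
}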
* mp Montgomery mulitplier. */ -static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_6(r, a, b); @@ -25998,7 +26139,7 @@ static void sp_384_mont_mul_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_6(r, a); @@ -27218,7 +27359,7 @@ extern void sp_384_mont_reduce_order_avx2_6(sp_digit* a, const sp_digit* m, sp_d * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_mul_avx2_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_mul_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_384_mul_avx2_6(r, a, b); @@ -27234,7 +27375,7 @@ static void sp_384_mont_mul_avx2_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_384_mont_sqr_avx2_6(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_384_mont_sqr_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_384_sqr_avx2_6(r, a); @@ -48285,7 +48426,7 @@ static WC_INLINE int sp_384_div_6(const sp_digit* a, const sp_digit* d, sp_digit else #endif sp_384_cond_sub_6(&t1[6], &t1[6], d, (sp_digit)0 - r1); - for (i=5; i>=0; i--) { + for (i = 5; i >= 0; i--) { sp_digit hi = t1[6 + i] - (t1[6 + i] == div); r1 = div_384_word_6(hi, t1[6 + i - 1], div); @@ -88947,7 +89088,7 @@ static WC_INLINE int sp_521_div_9(const sp_digit* a, const sp_digit* d, sp_digit sp_521_lshift_9(sd, d, 55); sp_521_lshift_18(t1, t1, 55); - for (i=8; i>=0; i--) { + for (i = 8; i >= 0; i--) { sp_digit hi = t1[9 + i] - (t1[9 + i] == div); r1 = div_521_word_9(hi, t1[9 + i - 1], div); @@ -91079,7 +91220,7 @@ static WC_INLINE int sp_1024_div_16(const sp_digit* a, const sp_digit* d, sp_dig else #endif sp_1024_cond_sub_16(&t1[16], &t1[16], d, (sp_digit)0 - r1); - for (i=15; i>=0; i--) { + for (i = 15; i >= 0; i--) { sp_digit hi = t1[16 + i] - (t1[16 + i] == div); r1 = div_1024_word_16(hi, t1[16 + i - 1], div); @@ -91414,7 +91555,7 @@ extern void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m, sp_digit mp); * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_16(r, a, b); @@ -91428,7 +91569,7 @@ static void sp_1024_mont_mul_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_sqr_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_16(r, a); @@ -92538,7 +92679,7 @@ extern void sp_1024_mont_reduce_avx2_16(sp_digit* a, const sp_digit* m, sp_digit * m Modulus (prime). * mp Montgomery mulitplier. */ -static void sp_1024_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { sp_1024_mul_avx2_16(r, a, b); @@ -92554,7 +92695,7 @@ static void sp_1024_mont_mul_avx2_16(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery mulitplier. 
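The division loops that are only reformatted here (sp_256_div_4, sp_384_div_6, sp_521_div_9, sp_1024_div_16 and the non-_cond RSA ones) keep the branch-free form of the quotient estimate: subtracting the boolean (t1[k] == div) nudges the high word down by one exactly when it equals the divisor word, so the two-word division never overflows and no data-dependent branch is introduced. A sketch of that one line, with div_word again standing in for the div_*_word_* helpers:

#include <stdint.h>

typedef uint64_t sp_digit;

extern sp_digit div_word(sp_digit hi, sp_digit lo, sp_digit d); /* assumed */

/* Branch-free quotient-digit estimate: the word division needs hi < d,
 * so clamp by subtracting the 0/1 result of the compare.  A slightly
 * low estimate is fixed by the caller's multiply-and-subtract step. */
static sp_digit quotient_estimate_ct(const sp_digit* t, int i, sp_digit div)
{
    sp_digit hi = t[i] - (sp_digit)(t[i] == div);
    return div_word(hi, t[i - 1], div);
}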
*/ -static void sp_1024_mont_sqr_avx2_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_1024_mont_sqr_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { sp_1024_sqr_avx2_16(r, a); diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index 10b5c98ba..b0ad90f63 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -2069,1100 +2069,6 @@ _sp_2048_mul_16: #ifndef __APPLE__ .size sp_2048_mul_16,.-sp_2048_mul_16 #endif /* __APPLE__ */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_sqr_16 -.type sp_2048_sqr_16,@function -.align 16 -sp_2048_sqr_16: -#else -.section __TEXT,__text -.globl _sp_2048_sqr_16 -.p2align 4 -_sp_2048_sqr_16: -#endif /* __APPLE__ */ - pushq %r12 - subq $0x80, %rsp - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - xorq %r9, %r9 - movq %rax, (%rsp) - movq %rdx, %r8 - # A[0] * A[1] - movq 8(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 8(%rsp) - # A[0] * A[2] - movq 16(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - movq %r9, 16(%rsp) - # A[0] * A[3] - movq 24(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - # A[1] * A[2] - movq 16(%rsi), %rax - mulq 8(%rsi) - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - movq %rcx, 24(%rsp) - # A[0] * A[4] - movq 32(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - # A[1] * A[3] - movq 24(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 32(%rsp) - # A[0] * A[5] - movq 40(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[4] - movq 32(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[3] - movq 24(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 40(%rsp) - # A[0] * A[6] - movq 48(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[5] - movq 40(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[4] - movq 32(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 48(%rsp) - # A[0] * A[7] - movq 56(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[6] - movq 48(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - 
adcq $0x00, %r12 - # A[2] * A[5] - movq 40(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[4] - movq 32(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 56(%rsp) - # A[0] * A[8] - movq 64(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[7] - movq 56(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[6] - movq 48(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[5] - movq 40(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[4] - movq 32(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 64(%rsp) - # A[0] * A[9] - movq 72(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[8] - movq 64(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[7] - movq 56(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[6] - movq 48(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[5] - movq 40(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 72(%rsp) - # A[0] * A[10] - movq 80(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[9] - movq 72(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[8] - movq 64(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[7] - movq 56(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[6] - movq 48(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[5] - movq 40(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 80(%rsp) - # A[0] * A[11] - movq 88(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[10] - movq 80(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[9] - movq 72(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[8] - movq 64(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[7] - movq 56(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[6] - movq 48(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 88(%rsp) - # A[0] * A[12] - movq 96(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[11] - movq 88(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] 
* A[10] - movq 80(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[9] - movq 72(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[8] - movq 64(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[7] - movq 56(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[6] - movq 48(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 96(%rsp) - # A[0] * A[13] - movq 104(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[12] - movq 96(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[11] - movq 88(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[10] - movq 80(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[9] - movq 72(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[8] - movq 64(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[7] - movq 56(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 104(%rsp) - # A[0] * A[14] - movq 112(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[13] - movq 104(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[12] - movq 96(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[11] - movq 88(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[10] - movq 80(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[9] - movq 72(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[8] - movq 64(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[7] - movq 56(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 112(%rsp) - # A[0] * A[15] - movq 120(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[14] - movq 112(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[13] - movq 104(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[12] - movq 96(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[11] - movq 88(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[10] - movq 80(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[9] - movq 72(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[8] - movq 64(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %rcx - 
adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 120(%rsp) - # A[1] * A[15] - movq 120(%rsi), %rax - mulq 8(%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[2] * A[14] - movq 112(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[13] - movq 104(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[12] - movq 96(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[11] - movq 88(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[10] - movq 80(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[9] - movq 72(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[8] - movq 64(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 128(%rdi) - # A[2] * A[15] - movq 120(%rsi), %rax - mulq 16(%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[3] * A[14] - movq 112(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[13] - movq 104(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[12] - movq 96(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[11] - movq 88(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[10] - movq 80(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[9] - movq 72(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 136(%rdi) - # A[3] * A[15] - movq 120(%rsi), %rax - mulq 24(%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[4] * A[14] - movq 112(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[13] - movq 104(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[12] - movq 96(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[11] - movq 88(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[10] - movq 80(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[9] * A[9] - movq 72(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 144(%rdi) - # A[4] * A[15] - movq 120(%rsi), %rax - mulq 32(%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[5] * A[14] - movq 112(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[13] - movq 104(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[12] - movq 96(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[11] - movq 88(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[9] * A[10] - movq 80(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r10 - adcq 
%rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 152(%rdi) - # A[5] * A[15] - movq 120(%rsi), %rax - mulq 40(%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[6] * A[14] - movq 112(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[13] - movq 104(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[12] - movq 96(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[9] * A[11] - movq 88(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[10] * A[10] - movq 80(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 160(%rdi) - # A[6] * A[15] - movq 120(%rsi), %rax - mulq 48(%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[7] * A[14] - movq 112(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[13] - movq 104(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[9] * A[12] - movq 96(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[10] * A[11] - movq 88(%rsi), %rax - mulq 80(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 168(%rdi) - # A[7] * A[15] - movq 120(%rsi), %rax - mulq 56(%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[8] * A[14] - movq 112(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[9] * A[13] - movq 104(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[10] * A[12] - movq 96(%rsi), %rax - mulq 80(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[11] * A[11] - movq 88(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 176(%rdi) - # A[8] * A[15] - movq 120(%rsi), %rax - mulq 64(%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[9] * A[14] - movq 112(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[10] * A[13] - movq 104(%rsi), %rax - mulq 80(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[11] * A[12] - movq 96(%rsi), %rax - mulq 88(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 184(%rdi) - # A[9] * A[15] - movq 120(%rsi), %rax - mulq 72(%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[10] * A[14] - movq 112(%rsi), %rax - mulq 80(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[11] * A[13] - movq 104(%rsi), %rax - mulq 88(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[12] * A[12] - movq 96(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 192(%rdi) - # A[10] * A[15] - movq 
120(%rsi), %rax - mulq 80(%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[11] * A[14] - movq 112(%rsi), %rax - mulq 88(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[12] * A[13] - movq 104(%rsi), %rax - mulq 96(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 200(%rdi) - # A[11] * A[15] - movq 120(%rsi), %rax - mulq 88(%rsi) - xorq %r8, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - # A[12] * A[14] - movq 112(%rsi), %rax - mulq 96(%rsi) - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - # A[13] * A[13] - movq 104(%rsi), %rax - mulq %rax - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - movq %r9, 208(%rdi) - # A[12] * A[15] - movq 120(%rsi), %rax - mulq 96(%rsi) - xorq %r9, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - # A[13] * A[14] - movq 112(%rsi), %rax - mulq 104(%rsi) - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - movq %rcx, 216(%rdi) - # A[13] * A[15] - movq 120(%rsi), %rax - mulq 104(%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - # A[14] * A[14] - movq 112(%rsi), %rax - mulq %rax - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 224(%rdi) - # A[14] * A[15] - movq 120(%rsi), %rax - mulq 112(%rsi) - xorq %r8, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - movq %r9, 232(%rdi) - # A[15] * A[15] - movq 120(%rsi), %rax - mulq %rax - addq %rax, %rcx - adcq %rdx, %r8 - movq %rcx, 240(%rdi) - movq %r8, 248(%rdi) - movq (%rsp), %rax - movq 8(%rsp), %rdx - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - movq %rax, (%rdi) - movq %rdx, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 32(%rsp), %rax - movq 40(%rsp), %rdx - movq 48(%rsp), %r10 - movq 56(%rsp), %r11 - movq %rax, 32(%rdi) - movq %rdx, 40(%rdi) - movq %r10, 48(%rdi) - movq %r11, 56(%rdi) - movq 64(%rsp), %rax - movq 72(%rsp), %rdx - movq 80(%rsp), %r10 - movq 88(%rsp), %r11 - movq %rax, 64(%rdi) - movq %rdx, 72(%rdi) - movq %r10, 80(%rdi) - movq %r11, 88(%rdi) - movq 96(%rsp), %rax - movq 104(%rsp), %rdx - movq 112(%rsp), %r10 - movq 120(%rsp), %r11 - movq %rax, 96(%rdi) - movq %rdx, 104(%rdi) - movq %r10, 112(%rdi) - movq %r11, 120(%rdi) - addq $0x80, %rsp - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_2048_sqr_16,.-sp_2048_sqr_16 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Multiply a and b into r. (r = a * b) * @@ -4840,6 +3746,2819 @@ L_end_2048_mul_avx2_16: .size sp_2048_mul_avx2_16,.-sp_2048_mul_avx2_16 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
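The assembly removed above is the product-scanning sp_2048_sqr_16; its extern declaration is still present (merely regrouped) in sp_x86_64.c, so the routine itself survives elsewhere in the regenerated file. What it computes is spelled out by its own comments: every off-diagonal product A[i]*A[j] is accumulated twice and every diagonal A[i]*A[i] once. For reference, the same column arithmetic in portable C, assuming a 64-bit sp_digit and the GCC/Clang __int128 extension (an illustration of the algorithm, not the replacement code):

#include <stdint.h>
#include <string.h>

typedef uint64_t sp_digit;

/* r[0..31] = a[0..15]^2: sum each cross product a[i]*a[j] (i < j) once,
 * double the whole partial result, then add the squares a[i]*a[i]. */
static void sqr_16_ref(sp_digit r[32], const sp_digit a[16])
{
    const int n = 16;
    unsigned __int128 t;
    sp_digit c;
    int i, j;

    memset(r, 0, 2 * n * sizeof(sp_digit));

    for (i = 0; i < n; i++) {               /* off-diagonal products */
        c = 0;
        for (j = i + 1; j < n; j++) {
            t = (unsigned __int128)a[i] * a[j] + r[i + j] + c;
            r[i + j] = (sp_digit)t;
            c = (sp_digit)(t >> 64);
        }
        r[i + n] = c;
    }

    c = 0;                                  /* double them */
    for (i = 0; i < 2 * n; i++) {
        sp_digit d = (sp_digit)((r[i] << 1) | c);
        c = r[i] >> 63;
        r[i] = d;
    }

    c = 0;                                  /* add the diagonal squares */
    for (i = 0; i < n; i++) {
        t = (unsigned __int128)a[i] * a[i] + r[2 * i] + c;
        r[2 * i] = (sp_digit)t;
        t = (unsigned __int128)r[2 * i + 1] + (sp_digit)(t >> 64);
        r[2 * i + 1] = (sp_digit)t;
        c = (sp_digit)(t >> 64);
    }
}

Counting each cross product once and doubling, as the assembly's paired adds do per column, needs roughly half the 64x64 multiplies of a general 16x16 multiply.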
+ */ +#ifndef __APPLE__ +.text +.globl sp_2048_add_16 +.type sp_2048_add_16,@function +.align 16 +sp_2048_add_16: +#else +.section __TEXT,__text +.globl _sp_2048_add_16 +.p2align 4 +_sp_2048_add_16: +#endif /* __APPLE__ */ + # Add + movq (%rsi), %rcx + xorq %rax, %rax + addq (%rdx), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + adcq 8(%rdx), %r8 + movq 16(%rsi), %rcx + movq %r8, 8(%rdi) + adcq 16(%rdx), %rcx + movq 24(%rsi), %r8 + movq %rcx, 16(%rdi) + adcq 24(%rdx), %r8 + movq 32(%rsi), %rcx + movq %r8, 24(%rdi) + adcq 32(%rdx), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%rdi) + adcq 40(%rdx), %r8 + movq 48(%rsi), %rcx + movq %r8, 40(%rdi) + adcq 48(%rdx), %rcx + movq 56(%rsi), %r8 + movq %rcx, 48(%rdi) + adcq 56(%rdx), %r8 + movq 64(%rsi), %rcx + movq %r8, 56(%rdi) + adcq 64(%rdx), %rcx + movq 72(%rsi), %r8 + movq %rcx, 64(%rdi) + adcq 72(%rdx), %r8 + movq 80(%rsi), %rcx + movq %r8, 72(%rdi) + adcq 80(%rdx), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%rdi) + adcq 88(%rdx), %r8 + movq 96(%rsi), %rcx + movq %r8, 88(%rdi) + adcq 96(%rdx), %rcx + movq 104(%rsi), %r8 + movq %rcx, 96(%rdi) + adcq 104(%rdx), %r8 + movq 112(%rsi), %rcx + movq %r8, 104(%rdi) + adcq 112(%rdx), %rcx + movq 120(%rsi), %r8 + movq %rcx, 112(%rdi) + adcq 120(%rdx), %r8 + movq %r8, 120(%rdi) + adcq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_2048_add_16,.-sp_2048_add_16 +#endif /* __APPLE__ */ +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_2048_sub_in_place_32 +.type sp_2048_sub_in_place_32,@function +.align 16 +sp_2048_sub_in_place_32: +#else +.section __TEXT,__text +.globl _sp_2048_sub_in_place_32 +.p2align 4 +_sp_2048_sub_in_place_32: +#endif /* __APPLE__ */ + movq (%rdi), %rdx + xorq %rax, %rax + subq (%rsi), %rdx + movq 8(%rdi), %rcx + movq %rdx, (%rdi) + sbbq 8(%rsi), %rcx + movq 16(%rdi), %rdx + movq %rcx, 8(%rdi) + sbbq 16(%rsi), %rdx + movq 24(%rdi), %rcx + movq %rdx, 16(%rdi) + sbbq 24(%rsi), %rcx + movq 32(%rdi), %rdx + movq %rcx, 24(%rdi) + sbbq 32(%rsi), %rdx + movq 40(%rdi), %rcx + movq %rdx, 32(%rdi) + sbbq 40(%rsi), %rcx + movq 48(%rdi), %rdx + movq %rcx, 40(%rdi) + sbbq 48(%rsi), %rdx + movq 56(%rdi), %rcx + movq %rdx, 48(%rdi) + sbbq 56(%rsi), %rcx + movq 64(%rdi), %rdx + movq %rcx, 56(%rdi) + sbbq 64(%rsi), %rdx + movq 72(%rdi), %rcx + movq %rdx, 64(%rdi) + sbbq 72(%rsi), %rcx + movq 80(%rdi), %rdx + movq %rcx, 72(%rdi) + sbbq 80(%rsi), %rdx + movq 88(%rdi), %rcx + movq %rdx, 80(%rdi) + sbbq 88(%rsi), %rcx + movq 96(%rdi), %rdx + movq %rcx, 88(%rdi) + sbbq 96(%rsi), %rdx + movq 104(%rdi), %rcx + movq %rdx, 96(%rdi) + sbbq 104(%rsi), %rcx + movq 112(%rdi), %rdx + movq %rcx, 104(%rdi) + sbbq 112(%rsi), %rdx + movq 120(%rdi), %rcx + movq %rdx, 112(%rdi) + sbbq 120(%rsi), %rcx + movq 128(%rdi), %rdx + movq %rcx, 120(%rdi) + sbbq 128(%rsi), %rdx + movq 136(%rdi), %rcx + movq %rdx, 128(%rdi) + sbbq 136(%rsi), %rcx + movq 144(%rdi), %rdx + movq %rcx, 136(%rdi) + sbbq 144(%rsi), %rdx + movq 152(%rdi), %rcx + movq %rdx, 144(%rdi) + sbbq 152(%rsi), %rcx + movq 160(%rdi), %rdx + movq %rcx, 152(%rdi) + sbbq 160(%rsi), %rdx + movq 168(%rdi), %rcx + movq %rdx, 160(%rdi) + sbbq 168(%rsi), %rcx + movq 176(%rdi), %rdx + movq %rcx, 168(%rdi) + sbbq 176(%rsi), %rdx + movq 184(%rdi), %rcx + movq %rdx, 176(%rdi) + sbbq 184(%rsi), %rcx + movq 192(%rdi), %rdx + movq %rcx, 184(%rdi) + sbbq 192(%rsi), %rdx + movq 200(%rdi), %rcx + movq %rdx, 192(%rdi) + sbbq 200(%rsi), %rcx + movq 208(%rdi), %rdx + movq 
%rcx, 200(%rdi) + sbbq 208(%rsi), %rdx + movq 216(%rdi), %rcx + movq %rdx, 208(%rdi) + sbbq 216(%rsi), %rcx + movq 224(%rdi), %rdx + movq %rcx, 216(%rdi) + sbbq 224(%rsi), %rdx + movq 232(%rdi), %rcx + movq %rdx, 224(%rdi) + sbbq 232(%rsi), %rcx + movq 240(%rdi), %rdx + movq %rcx, 232(%rdi) + sbbq 240(%rsi), %rdx + movq 248(%rdi), %rcx + movq %rdx, 240(%rdi) + sbbq 248(%rsi), %rcx + movq %rcx, 248(%rdi) + sbbq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_2048_sub_in_place_32,.-sp_2048_sub_in_place_32 +#endif /* __APPLE__ */ +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_2048_add_32 +.type sp_2048_add_32,@function +.align 16 +sp_2048_add_32: +#else +.section __TEXT,__text +.globl _sp_2048_add_32 +.p2align 4 +_sp_2048_add_32: +#endif /* __APPLE__ */ + # Add + movq (%rsi), %rcx + xorq %rax, %rax + addq (%rdx), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + adcq 8(%rdx), %r8 + movq 16(%rsi), %rcx + movq %r8, 8(%rdi) + adcq 16(%rdx), %rcx + movq 24(%rsi), %r8 + movq %rcx, 16(%rdi) + adcq 24(%rdx), %r8 + movq 32(%rsi), %rcx + movq %r8, 24(%rdi) + adcq 32(%rdx), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%rdi) + adcq 40(%rdx), %r8 + movq 48(%rsi), %rcx + movq %r8, 40(%rdi) + adcq 48(%rdx), %rcx + movq 56(%rsi), %r8 + movq %rcx, 48(%rdi) + adcq 56(%rdx), %r8 + movq 64(%rsi), %rcx + movq %r8, 56(%rdi) + adcq 64(%rdx), %rcx + movq 72(%rsi), %r8 + movq %rcx, 64(%rdi) + adcq 72(%rdx), %r8 + movq 80(%rsi), %rcx + movq %r8, 72(%rdi) + adcq 80(%rdx), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%rdi) + adcq 88(%rdx), %r8 + movq 96(%rsi), %rcx + movq %r8, 88(%rdi) + adcq 96(%rdx), %rcx + movq 104(%rsi), %r8 + movq %rcx, 96(%rdi) + adcq 104(%rdx), %r8 + movq 112(%rsi), %rcx + movq %r8, 104(%rdi) + adcq 112(%rdx), %rcx + movq 120(%rsi), %r8 + movq %rcx, 112(%rdi) + adcq 120(%rdx), %r8 + movq 128(%rsi), %rcx + movq %r8, 120(%rdi) + adcq 128(%rdx), %rcx + movq 136(%rsi), %r8 + movq %rcx, 128(%rdi) + adcq 136(%rdx), %r8 + movq 144(%rsi), %rcx + movq %r8, 136(%rdi) + adcq 144(%rdx), %rcx + movq 152(%rsi), %r8 + movq %rcx, 144(%rdi) + adcq 152(%rdx), %r8 + movq 160(%rsi), %rcx + movq %r8, 152(%rdi) + adcq 160(%rdx), %rcx + movq 168(%rsi), %r8 + movq %rcx, 160(%rdi) + adcq 168(%rdx), %r8 + movq 176(%rsi), %rcx + movq %r8, 168(%rdi) + adcq 176(%rdx), %rcx + movq 184(%rsi), %r8 + movq %rcx, 176(%rdi) + adcq 184(%rdx), %r8 + movq 192(%rsi), %rcx + movq %r8, 184(%rdi) + adcq 192(%rdx), %rcx + movq 200(%rsi), %r8 + movq %rcx, 192(%rdi) + adcq 200(%rdx), %r8 + movq 208(%rsi), %rcx + movq %r8, 200(%rdi) + adcq 208(%rdx), %rcx + movq 216(%rsi), %r8 + movq %rcx, 208(%rdi) + adcq 216(%rdx), %r8 + movq 224(%rsi), %rcx + movq %r8, 216(%rdi) + adcq 224(%rdx), %rcx + movq 232(%rsi), %r8 + movq %rcx, 224(%rdi) + adcq 232(%rdx), %r8 + movq 240(%rsi), %rcx + movq %r8, 232(%rdi) + adcq 240(%rdx), %rcx + movq 248(%rsi), %r8 + movq %rcx, 240(%rdi) + adcq 248(%rdx), %r8 + movq %r8, 248(%rdi) + adcq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_2048_add_32,.-sp_2048_add_32 +#endif /* __APPLE__ */ +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
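The sp_2048_add_16 / sp_2048_add_32 / sp_2048_sub_in_place_32 routines added above are straight carry- and borrow-chain loops, fully unrolled so the adcq/sbbq flag dependency is never broken by a loop branch, with the final carry or borrow folded into %rax as the return value. Their arithmetic in portable C (the unrolling and load/store interleaving are the assembly's concern; this sketch only shows what is computed):

#include <stdint.h>

typedef uint64_t sp_digit;

/* r = a + b over n words; returns the carry out (0 or 1), like the
 * adcq $0x00, %rax at the end of the assembly versions. */
static sp_digit add_n(sp_digit* r, const sp_digit* a, const sp_digit* b,
                      int n)
{
    sp_digit c = 0;
    int i;

    for (i = 0; i < n; i++) {
        sp_digit s = a[i] + b[i];
        sp_digit c1 = (sp_digit)(s < a[i]);  /* carry from a[i] + b[i] */
        r[i] = s + c;
        c = c1 | (sp_digit)(r[i] < s);       /* carry from adding c in */
    }
    return c;
}

These carries are exactly what the Karatsuba multiply below consumes (the r13/r14 values it saves on its stack frame).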
+ */ +#ifndef __APPLE__ +.text +.globl sp_2048_mul_32 +.type sp_2048_mul_32,@function +.align 16 +sp_2048_mul_32: +#else +.section __TEXT,__text +.globl _sp_2048_mul_32 +.p2align 4 +_sp_2048_mul_32: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x328, %rsp + movq %rdi, 768(%rsp) + movq %rsi, 776(%rsp) + movq %rdx, 784(%rsp) + leaq 512(%rsp), %r10 + leaq 128(%rsi), %r12 + # Add + movq (%rsi), %rax + xorq %r13, %r13 + addq (%r12), %rax + movq 8(%rsi), %rcx + movq %rax, (%r10) + adcq 8(%r12), %rcx + movq 16(%rsi), %r8 + movq %rcx, 8(%r10) + adcq 16(%r12), %r8 + movq 24(%rsi), %rax + movq %r8, 16(%r10) + adcq 24(%r12), %rax + movq 32(%rsi), %rcx + movq %rax, 24(%r10) + adcq 32(%r12), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%r10) + adcq 40(%r12), %r8 + movq 48(%rsi), %rax + movq %r8, 40(%r10) + adcq 48(%r12), %rax + movq 56(%rsi), %rcx + movq %rax, 48(%r10) + adcq 56(%r12), %rcx + movq 64(%rsi), %r8 + movq %rcx, 56(%r10) + adcq 64(%r12), %r8 + movq 72(%rsi), %rax + movq %r8, 64(%r10) + adcq 72(%r12), %rax + movq 80(%rsi), %rcx + movq %rax, 72(%r10) + adcq 80(%r12), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%r10) + adcq 88(%r12), %r8 + movq 96(%rsi), %rax + movq %r8, 88(%r10) + adcq 96(%r12), %rax + movq 104(%rsi), %rcx + movq %rax, 96(%r10) + adcq 104(%r12), %rcx + movq 112(%rsi), %r8 + movq %rcx, 104(%r10) + adcq 112(%r12), %r8 + movq 120(%rsi), %rax + movq %r8, 112(%r10) + adcq 120(%r12), %rax + movq %rax, 120(%r10) + adcq $0x00, %r13 + movq %r13, 792(%rsp) + leaq 640(%rsp), %r11 + leaq 128(%rdx), %r12 + # Add + movq (%rdx), %rax + xorq %r14, %r14 + addq (%r12), %rax + movq 8(%rdx), %rcx + movq %rax, (%r11) + adcq 8(%r12), %rcx + movq 16(%rdx), %r8 + movq %rcx, 8(%r11) + adcq 16(%r12), %r8 + movq 24(%rdx), %rax + movq %r8, 16(%r11) + adcq 24(%r12), %rax + movq 32(%rdx), %rcx + movq %rax, 24(%r11) + adcq 32(%r12), %rcx + movq 40(%rdx), %r8 + movq %rcx, 32(%r11) + adcq 40(%r12), %r8 + movq 48(%rdx), %rax + movq %r8, 40(%r11) + adcq 48(%r12), %rax + movq 56(%rdx), %rcx + movq %rax, 48(%r11) + adcq 56(%r12), %rcx + movq 64(%rdx), %r8 + movq %rcx, 56(%r11) + adcq 64(%r12), %r8 + movq 72(%rdx), %rax + movq %r8, 64(%r11) + adcq 72(%r12), %rax + movq 80(%rdx), %rcx + movq %rax, 72(%r11) + adcq 80(%r12), %rcx + movq 88(%rdx), %r8 + movq %rcx, 80(%r11) + adcq 88(%r12), %r8 + movq 96(%rdx), %rax + movq %r8, 88(%r11) + adcq 96(%r12), %rax + movq 104(%rdx), %rcx + movq %rax, 96(%r11) + adcq 104(%r12), %rcx + movq 112(%rdx), %r8 + movq %rcx, 104(%r11) + adcq 112(%r12), %r8 + movq 120(%rdx), %rax + movq %r8, 112(%r11) + adcq 120(%r12), %rax + movq %rax, 120(%r11) + adcq $0x00, %r14 + movq %r14, 800(%rsp) + movq %r11, %rdx + movq %r10, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_2048_mul_16@plt +#else + callq _sp_2048_mul_16 +#endif /* __APPLE__ */ + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + leaq 256(%rsp), %rdi + addq $0x80, %rdx + addq $0x80, %rsi +#ifndef __APPLE__ + callq sp_2048_mul_16@plt +#else + callq _sp_2048_mul_16 +#endif /* __APPLE__ */ + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + movq 768(%rsp), %rdi +#ifndef __APPLE__ + callq sp_2048_mul_16@plt +#else + callq _sp_2048_mul_16 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + movq 768(%rsp), %rdi +#endif /* _WIN64 */ + movq 792(%rsp), %r13 + movq 800(%rsp), %r14 + movq 768(%rsp), %r15 + movq %r13, %r9 + leaq 512(%rsp), %r10 + leaq 640(%rsp), %r11 + andq %r14, %r9 + negq %r13 + negq %r14 + addq $0x100, %r15 + movq (%r10), %rax + movq (%r11), %rcx + andq 
%r14, %rax + andq %r13, %rcx + movq %rax, (%r10) + movq %rcx, (%r11) + movq 8(%r10), %rax + movq 8(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 8(%r10) + movq %rcx, 8(%r11) + movq 16(%r10), %rax + movq 16(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 16(%r10) + movq %rcx, 16(%r11) + movq 24(%r10), %rax + movq 24(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 24(%r10) + movq %rcx, 24(%r11) + movq 32(%r10), %rax + movq 32(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 32(%r10) + movq %rcx, 32(%r11) + movq 40(%r10), %rax + movq 40(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 40(%r10) + movq %rcx, 40(%r11) + movq 48(%r10), %rax + movq 48(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 48(%r10) + movq %rcx, 48(%r11) + movq 56(%r10), %rax + movq 56(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 56(%r10) + movq %rcx, 56(%r11) + movq 64(%r10), %rax + movq 64(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 64(%r10) + movq %rcx, 64(%r11) + movq 72(%r10), %rax + movq 72(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 72(%r10) + movq %rcx, 72(%r11) + movq 80(%r10), %rax + movq 80(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 80(%r10) + movq %rcx, 80(%r11) + movq 88(%r10), %rax + movq 88(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 88(%r10) + movq %rcx, 88(%r11) + movq 96(%r10), %rax + movq 96(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 96(%r10) + movq %rcx, 96(%r11) + movq 104(%r10), %rax + movq 104(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 104(%r10) + movq %rcx, 104(%r11) + movq 112(%r10), %rax + movq 112(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 112(%r10) + movq %rcx, 112(%r11) + movq 120(%r10), %rax + movq 120(%r11), %rcx + andq %r14, %rax + andq %r13, %rcx + movq %rax, 120(%r10) + movq %rcx, 120(%r11) + movq (%r10), %rax + addq (%r11), %rax + movq 8(%r10), %rcx + movq %rax, (%r15) + adcq 8(%r11), %rcx + movq 16(%r10), %r8 + movq %rcx, 8(%r15) + adcq 16(%r11), %r8 + movq 24(%r10), %rax + movq %r8, 16(%r15) + adcq 24(%r11), %rax + movq 32(%r10), %rcx + movq %rax, 24(%r15) + adcq 32(%r11), %rcx + movq 40(%r10), %r8 + movq %rcx, 32(%r15) + adcq 40(%r11), %r8 + movq 48(%r10), %rax + movq %r8, 40(%r15) + adcq 48(%r11), %rax + movq 56(%r10), %rcx + movq %rax, 48(%r15) + adcq 56(%r11), %rcx + movq 64(%r10), %r8 + movq %rcx, 56(%r15) + adcq 64(%r11), %r8 + movq 72(%r10), %rax + movq %r8, 64(%r15) + adcq 72(%r11), %rax + movq 80(%r10), %rcx + movq %rax, 72(%r15) + adcq 80(%r11), %rcx + movq 88(%r10), %r8 + movq %rcx, 80(%r15) + adcq 88(%r11), %r8 + movq 96(%r10), %rax + movq %r8, 88(%r15) + adcq 96(%r11), %rax + movq 104(%r10), %rcx + movq %rax, 96(%r15) + adcq 104(%r11), %rcx + movq 112(%r10), %r8 + movq %rcx, 104(%r15) + adcq 112(%r11), %r8 + movq 120(%r10), %rax + movq %r8, 112(%r15) + adcq 120(%r11), %rax + movq %rax, 120(%r15) + adcq $0x00, %r9 + leaq 256(%rsp), %r11 + movq %rsp, %r10 + movq (%r10), %rax + subq (%r11), %rax + movq 8(%r10), %rcx + movq %rax, (%r10) + sbbq 8(%r11), %rcx + movq 16(%r10), %r8 + movq %rcx, 8(%r10) + sbbq 16(%r11), %r8 + movq 24(%r10), %rax + movq %r8, 16(%r10) + sbbq 24(%r11), %rax + movq 32(%r10), %rcx + movq %rax, 24(%r10) + sbbq 32(%r11), %rcx + movq 40(%r10), %r8 + movq %rcx, 32(%r10) + sbbq 40(%r11), %r8 + movq 48(%r10), %rax + movq %r8, 40(%r10) + sbbq 48(%r11), %rax + movq 56(%r10), %rcx + movq %rax, 48(%r10) + sbbq 56(%r11), %rcx + movq 64(%r10), %r8 + 
movq %rcx, 56(%r10) + sbbq 64(%r11), %r8 + movq 72(%r10), %rax + movq %r8, 64(%r10) + sbbq 72(%r11), %rax + movq 80(%r10), %rcx + movq %rax, 72(%r10) + sbbq 80(%r11), %rcx + movq 88(%r10), %r8 + movq %rcx, 80(%r10) + sbbq 88(%r11), %r8 + movq 96(%r10), %rax + movq %r8, 88(%r10) + sbbq 96(%r11), %rax + movq 104(%r10), %rcx + movq %rax, 96(%r10) + sbbq 104(%r11), %rcx + movq 112(%r10), %r8 + movq %rcx, 104(%r10) + sbbq 112(%r11), %r8 + movq 120(%r10), %rax + movq %r8, 112(%r10) + sbbq 120(%r11), %rax + movq 128(%r10), %rcx + movq %rax, 120(%r10) + sbbq 128(%r11), %rcx + movq 136(%r10), %r8 + movq %rcx, 128(%r10) + sbbq 136(%r11), %r8 + movq 144(%r10), %rax + movq %r8, 136(%r10) + sbbq 144(%r11), %rax + movq 152(%r10), %rcx + movq %rax, 144(%r10) + sbbq 152(%r11), %rcx + movq 160(%r10), %r8 + movq %rcx, 152(%r10) + sbbq 160(%r11), %r8 + movq 168(%r10), %rax + movq %r8, 160(%r10) + sbbq 168(%r11), %rax + movq 176(%r10), %rcx + movq %rax, 168(%r10) + sbbq 176(%r11), %rcx + movq 184(%r10), %r8 + movq %rcx, 176(%r10) + sbbq 184(%r11), %r8 + movq 192(%r10), %rax + movq %r8, 184(%r10) + sbbq 192(%r11), %rax + movq 200(%r10), %rcx + movq %rax, 192(%r10) + sbbq 200(%r11), %rcx + movq 208(%r10), %r8 + movq %rcx, 200(%r10) + sbbq 208(%r11), %r8 + movq 216(%r10), %rax + movq %r8, 208(%r10) + sbbq 216(%r11), %rax + movq 224(%r10), %rcx + movq %rax, 216(%r10) + sbbq 224(%r11), %rcx + movq 232(%r10), %r8 + movq %rcx, 224(%r10) + sbbq 232(%r11), %r8 + movq 240(%r10), %rax + movq %r8, 232(%r10) + sbbq 240(%r11), %rax + movq 248(%r10), %rcx + movq %rax, 240(%r10) + sbbq 248(%r11), %rcx + movq %rcx, 248(%r10) + sbbq $0x00, %r9 + movq (%r10), %rax + subq (%rdi), %rax + movq 8(%r10), %rcx + movq %rax, (%r10) + sbbq 8(%rdi), %rcx + movq 16(%r10), %r8 + movq %rcx, 8(%r10) + sbbq 16(%rdi), %r8 + movq 24(%r10), %rax + movq %r8, 16(%r10) + sbbq 24(%rdi), %rax + movq 32(%r10), %rcx + movq %rax, 24(%r10) + sbbq 32(%rdi), %rcx + movq 40(%r10), %r8 + movq %rcx, 32(%r10) + sbbq 40(%rdi), %r8 + movq 48(%r10), %rax + movq %r8, 40(%r10) + sbbq 48(%rdi), %rax + movq 56(%r10), %rcx + movq %rax, 48(%r10) + sbbq 56(%rdi), %rcx + movq 64(%r10), %r8 + movq %rcx, 56(%r10) + sbbq 64(%rdi), %r8 + movq 72(%r10), %rax + movq %r8, 64(%r10) + sbbq 72(%rdi), %rax + movq 80(%r10), %rcx + movq %rax, 72(%r10) + sbbq 80(%rdi), %rcx + movq 88(%r10), %r8 + movq %rcx, 80(%r10) + sbbq 88(%rdi), %r8 + movq 96(%r10), %rax + movq %r8, 88(%r10) + sbbq 96(%rdi), %rax + movq 104(%r10), %rcx + movq %rax, 96(%r10) + sbbq 104(%rdi), %rcx + movq 112(%r10), %r8 + movq %rcx, 104(%r10) + sbbq 112(%rdi), %r8 + movq 120(%r10), %rax + movq %r8, 112(%r10) + sbbq 120(%rdi), %rax + movq 128(%r10), %rcx + movq %rax, 120(%r10) + sbbq 128(%rdi), %rcx + movq 136(%r10), %r8 + movq %rcx, 128(%r10) + sbbq 136(%rdi), %r8 + movq 144(%r10), %rax + movq %r8, 136(%r10) + sbbq 144(%rdi), %rax + movq 152(%r10), %rcx + movq %rax, 144(%r10) + sbbq 152(%rdi), %rcx + movq 160(%r10), %r8 + movq %rcx, 152(%r10) + sbbq 160(%rdi), %r8 + movq 168(%r10), %rax + movq %r8, 160(%r10) + sbbq 168(%rdi), %rax + movq 176(%r10), %rcx + movq %rax, 168(%r10) + sbbq 176(%rdi), %rcx + movq 184(%r10), %r8 + movq %rcx, 176(%r10) + sbbq 184(%rdi), %r8 + movq 192(%r10), %rax + movq %r8, 184(%r10) + sbbq 192(%rdi), %rax + movq 200(%r10), %rcx + movq %rax, 192(%r10) + sbbq 200(%rdi), %rcx + movq 208(%r10), %r8 + movq %rcx, 200(%r10) + sbbq 208(%rdi), %r8 + movq 216(%r10), %rax + movq %r8, 208(%r10) + sbbq 216(%rdi), %rax + movq 224(%r10), %rcx + movq %rax, 216(%r10) + sbbq 224(%rdi), %rcx + movq 232(%r10), 
%r8 + movq %rcx, 224(%r10) + sbbq 232(%rdi), %r8 + movq 240(%r10), %rax + movq %r8, 232(%r10) + sbbq 240(%rdi), %rax + movq 248(%r10), %rcx + movq %rax, 240(%r10) + sbbq 248(%rdi), %rcx + movq %rcx, 248(%r10) + sbbq $0x00, %r9 + subq $0x80, %r15 + # Add + movq (%r15), %rax + addq (%r10), %rax + movq 8(%r15), %rcx + movq %rax, (%r15) + adcq 8(%r10), %rcx + movq 16(%r15), %r8 + movq %rcx, 8(%r15) + adcq 16(%r10), %r8 + movq 24(%r15), %rax + movq %r8, 16(%r15) + adcq 24(%r10), %rax + movq 32(%r15), %rcx + movq %rax, 24(%r15) + adcq 32(%r10), %rcx + movq 40(%r15), %r8 + movq %rcx, 32(%r15) + adcq 40(%r10), %r8 + movq 48(%r15), %rax + movq %r8, 40(%r15) + adcq 48(%r10), %rax + movq 56(%r15), %rcx + movq %rax, 48(%r15) + adcq 56(%r10), %rcx + movq 64(%r15), %r8 + movq %rcx, 56(%r15) + adcq 64(%r10), %r8 + movq 72(%r15), %rax + movq %r8, 64(%r15) + adcq 72(%r10), %rax + movq 80(%r15), %rcx + movq %rax, 72(%r15) + adcq 80(%r10), %rcx + movq 88(%r15), %r8 + movq %rcx, 80(%r15) + adcq 88(%r10), %r8 + movq 96(%r15), %rax + movq %r8, 88(%r15) + adcq 96(%r10), %rax + movq 104(%r15), %rcx + movq %rax, 96(%r15) + adcq 104(%r10), %rcx + movq 112(%r15), %r8 + movq %rcx, 104(%r15) + adcq 112(%r10), %r8 + movq 120(%r15), %rax + movq %r8, 112(%r15) + adcq 120(%r10), %rax + movq 128(%r15), %rcx + movq %rax, 120(%r15) + adcq 128(%r10), %rcx + movq 136(%r15), %r8 + movq %rcx, 128(%r15) + adcq 136(%r10), %r8 + movq 144(%r15), %rax + movq %r8, 136(%r15) + adcq 144(%r10), %rax + movq 152(%r15), %rcx + movq %rax, 144(%r15) + adcq 152(%r10), %rcx + movq 160(%r15), %r8 + movq %rcx, 152(%r15) + adcq 160(%r10), %r8 + movq 168(%r15), %rax + movq %r8, 160(%r15) + adcq 168(%r10), %rax + movq 176(%r15), %rcx + movq %rax, 168(%r15) + adcq 176(%r10), %rcx + movq 184(%r15), %r8 + movq %rcx, 176(%r15) + adcq 184(%r10), %r8 + movq 192(%r15), %rax + movq %r8, 184(%r15) + adcq 192(%r10), %rax + movq 200(%r15), %rcx + movq %rax, 192(%r15) + adcq 200(%r10), %rcx + movq 208(%r15), %r8 + movq %rcx, 200(%r15) + adcq 208(%r10), %r8 + movq 216(%r15), %rax + movq %r8, 208(%r15) + adcq 216(%r10), %rax + movq 224(%r15), %rcx + movq %rax, 216(%r15) + adcq 224(%r10), %rcx + movq 232(%r15), %r8 + movq %rcx, 224(%r15) + adcq 232(%r10), %r8 + movq 240(%r15), %rax + movq %r8, 232(%r15) + adcq 240(%r10), %rax + movq 248(%r15), %rcx + movq %rax, 240(%r15) + adcq 248(%r10), %rcx + movq %rcx, 248(%r15) + adcq $0x00, %r9 + movq %r9, 384(%rdi) + addq $0x80, %r15 + # Add + movq (%r15), %rax + xorq %r9, %r9 + addq (%r11), %rax + movq 8(%r15), %rcx + movq %rax, (%r15) + adcq 8(%r11), %rcx + movq 16(%r15), %r8 + movq %rcx, 8(%r15) + adcq 16(%r11), %r8 + movq 24(%r15), %rax + movq %r8, 16(%r15) + adcq 24(%r11), %rax + movq 32(%r15), %rcx + movq %rax, 24(%r15) + adcq 32(%r11), %rcx + movq 40(%r15), %r8 + movq %rcx, 32(%r15) + adcq 40(%r11), %r8 + movq 48(%r15), %rax + movq %r8, 40(%r15) + adcq 48(%r11), %rax + movq 56(%r15), %rcx + movq %rax, 48(%r15) + adcq 56(%r11), %rcx + movq 64(%r15), %r8 + movq %rcx, 56(%r15) + adcq 64(%r11), %r8 + movq 72(%r15), %rax + movq %r8, 64(%r15) + adcq 72(%r11), %rax + movq 80(%r15), %rcx + movq %rax, 72(%r15) + adcq 80(%r11), %rcx + movq 88(%r15), %r8 + movq %rcx, 80(%r15) + adcq 88(%r11), %r8 + movq 96(%r15), %rax + movq %r8, 88(%r15) + adcq 96(%r11), %rax + movq 104(%r15), %rcx + movq %rax, 96(%r15) + adcq 104(%r11), %rcx + movq 112(%r15), %r8 + movq %rcx, 104(%r15) + adcq 112(%r11), %r8 + movq 120(%r15), %rax + movq %r8, 112(%r15) + adcq 120(%r11), %rax + movq 128(%r15), %rcx + movq %rax, 120(%r15) + adcq 128(%r11), %rcx 
+ movq %rcx, 128(%r15) + adcq $0x00, %r9 + # Add to zero + movq 136(%r11), %rax + adcq $0x00, %rax + movq 144(%r11), %rcx + movq %rax, 136(%r15) + adcq $0x00, %rcx + movq 152(%r11), %r8 + movq %rcx, 144(%r15) + adcq $0x00, %r8 + movq 160(%r11), %rax + movq %r8, 152(%r15) + adcq $0x00, %rax + movq 168(%r11), %rcx + movq %rax, 160(%r15) + adcq $0x00, %rcx + movq 176(%r11), %r8 + movq %rcx, 168(%r15) + adcq $0x00, %r8 + movq 184(%r11), %rax + movq %r8, 176(%r15) + adcq $0x00, %rax + movq 192(%r11), %rcx + movq %rax, 184(%r15) + adcq $0x00, %rcx + movq 200(%r11), %r8 + movq %rcx, 192(%r15) + adcq $0x00, %r8 + movq 208(%r11), %rax + movq %r8, 200(%r15) + adcq $0x00, %rax + movq 216(%r11), %rcx + movq %rax, 208(%r15) + adcq $0x00, %rcx + movq 224(%r11), %r8 + movq %rcx, 216(%r15) + adcq $0x00, %r8 + movq 232(%r11), %rax + movq %r8, 224(%r15) + adcq $0x00, %rax + movq 240(%r11), %rcx + movq %rax, 232(%r15) + adcq $0x00, %rcx + movq 248(%r11), %r8 + movq %rcx, 240(%r15) + adcq $0x00, %r8 + movq %r8, 248(%r15) + addq $0x328, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_2048_mul_32,.-sp_2048_mul_32 +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_2048_mul_avx2_32 +.type sp_2048_mul_avx2_32,@function +.align 16 +sp_2048_mul_avx2_32: +#else +.section __TEXT,__text +.globl _sp_2048_mul_avx2_32 +.p2align 4 +_sp_2048_mul_avx2_32: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x328, %rsp + movq %rdi, 768(%rsp) + movq %rsi, 776(%rsp) + movq %rdx, 784(%rsp) + leaq 512(%rsp), %r10 + leaq 128(%rsi), %r12 + # Add + movq (%rsi), %rax + xorq %r13, %r13 + addq (%r12), %rax + movq 8(%rsi), %rcx + movq %rax, (%r10) + adcq 8(%r12), %rcx + movq 16(%rsi), %r8 + movq %rcx, 8(%r10) + adcq 16(%r12), %r8 + movq 24(%rsi), %rax + movq %r8, 16(%r10) + adcq 24(%r12), %rax + movq 32(%rsi), %rcx + movq %rax, 24(%r10) + adcq 32(%r12), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%r10) + adcq 40(%r12), %r8 + movq 48(%rsi), %rax + movq %r8, 40(%r10) + adcq 48(%r12), %rax + movq 56(%rsi), %rcx + movq %rax, 48(%r10) + adcq 56(%r12), %rcx + movq 64(%rsi), %r8 + movq %rcx, 56(%r10) + adcq 64(%r12), %r8 + movq 72(%rsi), %rax + movq %r8, 64(%r10) + adcq 72(%r12), %rax + movq 80(%rsi), %rcx + movq %rax, 72(%r10) + adcq 80(%r12), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%r10) + adcq 88(%r12), %r8 + movq 96(%rsi), %rax + movq %r8, 88(%r10) + adcq 96(%r12), %rax + movq 104(%rsi), %rcx + movq %rax, 96(%r10) + adcq 104(%r12), %rcx + movq 112(%rsi), %r8 + movq %rcx, 104(%r10) + adcq 112(%r12), %r8 + movq 120(%rsi), %rax + movq %r8, 112(%r10) + adcq 120(%r12), %rax + movq %rax, 120(%r10) + adcq $0x00, %r13 + movq %r13, 792(%rsp) + leaq 640(%rsp), %r11 + leaq 128(%rdx), %r12 + # Add + movq (%rdx), %rax + xorq %r14, %r14 + addq (%r12), %rax + movq 8(%rdx), %rcx + movq %rax, (%r11) + adcq 8(%r12), %rcx + movq 16(%rdx), %r8 + movq %rcx, 8(%r11) + adcq 16(%r12), %r8 + movq 24(%rdx), %rax + movq %r8, 16(%r11) + adcq 24(%r12), %rax + movq 32(%rdx), %rcx + movq %rax, 24(%r11) + adcq 32(%r12), %rcx + movq 40(%rdx), %r8 + movq %rcx, 32(%r11) + adcq 40(%r12), %r8 + movq 48(%rdx), %rax + movq %r8, 40(%r11) + adcq 48(%r12), %rax + movq 56(%rdx), %rcx + movq %rax, 48(%r11) + adcq 56(%r12), %rcx + movq 64(%rdx), %r8 + movq %rcx, 56(%r11) + adcq 64(%r12), %r8 + movq 72(%rdx), %rax + movq %r8, 
64(%r11) + adcq 72(%r12), %rax + movq 80(%rdx), %rcx + movq %rax, 72(%r11) + adcq 80(%r12), %rcx + movq 88(%rdx), %r8 + movq %rcx, 80(%r11) + adcq 88(%r12), %r8 + movq 96(%rdx), %rax + movq %r8, 88(%r11) + adcq 96(%r12), %rax + movq 104(%rdx), %rcx + movq %rax, 96(%r11) + adcq 104(%r12), %rcx + movq 112(%rdx), %r8 + movq %rcx, 104(%r11) + adcq 112(%r12), %r8 + movq 120(%rdx), %rax + movq %r8, 112(%r11) + adcq 120(%r12), %rax + movq %rax, 120(%r11) + adcq $0x00, %r14 + movq %r14, 800(%rsp) + movq %r11, %rdx + movq %r10, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_2048_mul_avx2_16@plt +#else + callq _sp_2048_mul_avx2_16 +#endif /* __APPLE__ */ + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + leaq 256(%rsp), %rdi + addq $0x80, %rdx + addq $0x80, %rsi +#ifndef __APPLE__ + callq sp_2048_mul_avx2_16@plt +#else + callq _sp_2048_mul_avx2_16 +#endif /* __APPLE__ */ + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + movq 768(%rsp), %rdi +#ifndef __APPLE__ + callq sp_2048_mul_avx2_16@plt +#else + callq _sp_2048_mul_avx2_16 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 784(%rsp), %rdx + movq 776(%rsp), %rsi + movq 768(%rsp), %rdi +#endif /* _WIN64 */ + movq 792(%rsp), %r13 + movq 800(%rsp), %r14 + movq 768(%rsp), %r15 + movq %r13, %r9 + leaq 512(%rsp), %r10 + leaq 640(%rsp), %r11 + andq %r14, %r9 + negq %r13 + negq %r14 + addq $0x100, %r15 + movq (%r10), %rax + movq (%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + addq %rcx, %rax + movq 8(%r10), %rcx + movq 8(%r11), %r8 + pextq %r14, %rcx, %rcx + pextq %r13, %r8, %r8 + movq %rax, (%r15) + adcq %r8, %rcx + movq 16(%r10), %r8 + movq 16(%r11), %rax + pextq %r14, %r8, %r8 + pextq %r13, %rax, %rax + movq %rcx, 8(%r15) + adcq %rax, %r8 + movq 24(%r10), %rax + movq 24(%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + movq %r8, 16(%r15) + adcq %rcx, %rax + movq 32(%r10), %rcx + movq 32(%r11), %r8 + pextq %r14, %rcx, %rcx + pextq %r13, %r8, %r8 + movq %rax, 24(%r15) + adcq %r8, %rcx + movq 40(%r10), %r8 + movq 40(%r11), %rax + pextq %r14, %r8, %r8 + pextq %r13, %rax, %rax + movq %rcx, 32(%r15) + adcq %rax, %r8 + movq 48(%r10), %rax + movq 48(%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + movq %r8, 40(%r15) + adcq %rcx, %rax + movq 56(%r10), %rcx + movq 56(%r11), %r8 + pextq %r14, %rcx, %rcx + pextq %r13, %r8, %r8 + movq %rax, 48(%r15) + adcq %r8, %rcx + movq 64(%r10), %r8 + movq 64(%r11), %rax + pextq %r14, %r8, %r8 + pextq %r13, %rax, %rax + movq %rcx, 56(%r15) + adcq %rax, %r8 + movq 72(%r10), %rax + movq 72(%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + movq %r8, 64(%r15) + adcq %rcx, %rax + movq 80(%r10), %rcx + movq 80(%r11), %r8 + pextq %r14, %rcx, %rcx + pextq %r13, %r8, %r8 + movq %rax, 72(%r15) + adcq %r8, %rcx + movq 88(%r10), %r8 + movq 88(%r11), %rax + pextq %r14, %r8, %r8 + pextq %r13, %rax, %rax + movq %rcx, 80(%r15) + adcq %rax, %r8 + movq 96(%r10), %rax + movq 96(%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + movq %r8, 88(%r15) + adcq %rcx, %rax + movq 104(%r10), %rcx + movq 104(%r11), %r8 + pextq %r14, %rcx, %rcx + pextq %r13, %r8, %r8 + movq %rax, 96(%r15) + adcq %r8, %rcx + movq 112(%r10), %r8 + movq 112(%r11), %rax + pextq %r14, %r8, %r8 + pextq %r13, %rax, %rax + movq %rcx, 104(%r15) + adcq %rax, %r8 + movq 120(%r10), %rax + movq 120(%r11), %rcx + pextq %r14, %rax, %rax + pextq %r13, %rcx, %rcx + movq %r8, 112(%r15) + adcq %rcx, %rax + movq %rax, 120(%r15) + adcq $0x00, %r9 + leaq 256(%rsp), %r11 + movq %rsp, %r10 + movq (%r10), %rax + subq (%r11), %rax + 
movq 8(%r10), %rcx + movq %rax, (%r10) + sbbq 8(%r11), %rcx + movq 16(%r10), %r8 + movq %rcx, 8(%r10) + sbbq 16(%r11), %r8 + movq 24(%r10), %rax + movq %r8, 16(%r10) + sbbq 24(%r11), %rax + movq 32(%r10), %rcx + movq %rax, 24(%r10) + sbbq 32(%r11), %rcx + movq 40(%r10), %r8 + movq %rcx, 32(%r10) + sbbq 40(%r11), %r8 + movq 48(%r10), %rax + movq %r8, 40(%r10) + sbbq 48(%r11), %rax + movq 56(%r10), %rcx + movq %rax, 48(%r10) + sbbq 56(%r11), %rcx + movq 64(%r10), %r8 + movq %rcx, 56(%r10) + sbbq 64(%r11), %r8 + movq 72(%r10), %rax + movq %r8, 64(%r10) + sbbq 72(%r11), %rax + movq 80(%r10), %rcx + movq %rax, 72(%r10) + sbbq 80(%r11), %rcx + movq 88(%r10), %r8 + movq %rcx, 80(%r10) + sbbq 88(%r11), %r8 + movq 96(%r10), %rax + movq %r8, 88(%r10) + sbbq 96(%r11), %rax + movq 104(%r10), %rcx + movq %rax, 96(%r10) + sbbq 104(%r11), %rcx + movq 112(%r10), %r8 + movq %rcx, 104(%r10) + sbbq 112(%r11), %r8 + movq 120(%r10), %rax + movq %r8, 112(%r10) + sbbq 120(%r11), %rax + movq 128(%r10), %rcx + movq %rax, 120(%r10) + sbbq 128(%r11), %rcx + movq 136(%r10), %r8 + movq %rcx, 128(%r10) + sbbq 136(%r11), %r8 + movq 144(%r10), %rax + movq %r8, 136(%r10) + sbbq 144(%r11), %rax + movq 152(%r10), %rcx + movq %rax, 144(%r10) + sbbq 152(%r11), %rcx + movq 160(%r10), %r8 + movq %rcx, 152(%r10) + sbbq 160(%r11), %r8 + movq 168(%r10), %rax + movq %r8, 160(%r10) + sbbq 168(%r11), %rax + movq 176(%r10), %rcx + movq %rax, 168(%r10) + sbbq 176(%r11), %rcx + movq 184(%r10), %r8 + movq %rcx, 176(%r10) + sbbq 184(%r11), %r8 + movq 192(%r10), %rax + movq %r8, 184(%r10) + sbbq 192(%r11), %rax + movq 200(%r10), %rcx + movq %rax, 192(%r10) + sbbq 200(%r11), %rcx + movq 208(%r10), %r8 + movq %rcx, 200(%r10) + sbbq 208(%r11), %r8 + movq 216(%r10), %rax + movq %r8, 208(%r10) + sbbq 216(%r11), %rax + movq 224(%r10), %rcx + movq %rax, 216(%r10) + sbbq 224(%r11), %rcx + movq 232(%r10), %r8 + movq %rcx, 224(%r10) + sbbq 232(%r11), %r8 + movq 240(%r10), %rax + movq %r8, 232(%r10) + sbbq 240(%r11), %rax + movq 248(%r10), %rcx + movq %rax, 240(%r10) + sbbq 248(%r11), %rcx + movq %rcx, 248(%r10) + sbbq $0x00, %r9 + movq (%r10), %rax + subq (%rdi), %rax + movq 8(%r10), %rcx + movq %rax, (%r10) + sbbq 8(%rdi), %rcx + movq 16(%r10), %r8 + movq %rcx, 8(%r10) + sbbq 16(%rdi), %r8 + movq 24(%r10), %rax + movq %r8, 16(%r10) + sbbq 24(%rdi), %rax + movq 32(%r10), %rcx + movq %rax, 24(%r10) + sbbq 32(%rdi), %rcx + movq 40(%r10), %r8 + movq %rcx, 32(%r10) + sbbq 40(%rdi), %r8 + movq 48(%r10), %rax + movq %r8, 40(%r10) + sbbq 48(%rdi), %rax + movq 56(%r10), %rcx + movq %rax, 48(%r10) + sbbq 56(%rdi), %rcx + movq 64(%r10), %r8 + movq %rcx, 56(%r10) + sbbq 64(%rdi), %r8 + movq 72(%r10), %rax + movq %r8, 64(%r10) + sbbq 72(%rdi), %rax + movq 80(%r10), %rcx + movq %rax, 72(%r10) + sbbq 80(%rdi), %rcx + movq 88(%r10), %r8 + movq %rcx, 80(%r10) + sbbq 88(%rdi), %r8 + movq 96(%r10), %rax + movq %r8, 88(%r10) + sbbq 96(%rdi), %rax + movq 104(%r10), %rcx + movq %rax, 96(%r10) + sbbq 104(%rdi), %rcx + movq 112(%r10), %r8 + movq %rcx, 104(%r10) + sbbq 112(%rdi), %r8 + movq 120(%r10), %rax + movq %r8, 112(%r10) + sbbq 120(%rdi), %rax + movq 128(%r10), %rcx + movq %rax, 120(%r10) + sbbq 128(%rdi), %rcx + movq 136(%r10), %r8 + movq %rcx, 128(%r10) + sbbq 136(%rdi), %r8 + movq 144(%r10), %rax + movq %r8, 136(%r10) + sbbq 144(%rdi), %rax + movq 152(%r10), %rcx + movq %rax, 144(%r10) + sbbq 152(%rdi), %rcx + movq 160(%r10), %r8 + movq %rcx, 152(%r10) + sbbq 160(%rdi), %r8 + movq 168(%r10), %rax + movq %r8, 160(%r10) + sbbq 168(%rdi), %rax + movq 176(%r10), %rcx 
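The negq/pextq sequence above is a branch-free conditional add: each saved carry bit is negated into an all-zero or all-one mask, and pext with such a mask returns either zero or the full word, so each reduced half-sum is added only when the other operand's half-sum overflowed. Roughly the same constant-time pattern in C (hypothetical helper name, sketch only):

/* Add x into r only when 'carry' (0 or 1) is set, without branching.
 * The mask 0 - carry is 0x000...0 or 0xfff...f, so x[i] & mask is either
 * the full word or zero.  r must have n + 1 words for the final carry. */
#include <stdint.h>

static void cond_add_n(uint64_t* r, const uint64_t* x, uint64_t carry, int n)
{
    uint64_t mask = (uint64_t)0 - carry;          /* 0 or all-ones */
    unsigned __int128 acc = 0;
    int i;
    for (i = 0; i < n; i++) {
        acc += (unsigned __int128)r[i] + (x[i] & mask);
        r[i] = (uint64_t)acc;                     /* low word back to r */
        acc >>= 64;                               /* keep the ripple carry */
    }
    r[n] += (uint64_t)acc;                        /* final carry word */
}

Keeping the masking in registers is what lets this version avoid the andq-and-store round trips visible in the older code removed further down in this patch.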
+ movq %rax, 168(%r10) + sbbq 176(%rdi), %rcx + movq 184(%r10), %r8 + movq %rcx, 176(%r10) + sbbq 184(%rdi), %r8 + movq 192(%r10), %rax + movq %r8, 184(%r10) + sbbq 192(%rdi), %rax + movq 200(%r10), %rcx + movq %rax, 192(%r10) + sbbq 200(%rdi), %rcx + movq 208(%r10), %r8 + movq %rcx, 200(%r10) + sbbq 208(%rdi), %r8 + movq 216(%r10), %rax + movq %r8, 208(%r10) + sbbq 216(%rdi), %rax + movq 224(%r10), %rcx + movq %rax, 216(%r10) + sbbq 224(%rdi), %rcx + movq 232(%r10), %r8 + movq %rcx, 224(%r10) + sbbq 232(%rdi), %r8 + movq 240(%r10), %rax + movq %r8, 232(%r10) + sbbq 240(%rdi), %rax + movq 248(%r10), %rcx + movq %rax, 240(%r10) + sbbq 248(%rdi), %rcx + movq %rcx, 248(%r10) + sbbq $0x00, %r9 + subq $0x80, %r15 + # Add + movq (%r15), %rax + addq (%r10), %rax + movq 8(%r15), %rcx + movq %rax, (%r15) + adcq 8(%r10), %rcx + movq 16(%r15), %r8 + movq %rcx, 8(%r15) + adcq 16(%r10), %r8 + movq 24(%r15), %rax + movq %r8, 16(%r15) + adcq 24(%r10), %rax + movq 32(%r15), %rcx + movq %rax, 24(%r15) + adcq 32(%r10), %rcx + movq 40(%r15), %r8 + movq %rcx, 32(%r15) + adcq 40(%r10), %r8 + movq 48(%r15), %rax + movq %r8, 40(%r15) + adcq 48(%r10), %rax + movq 56(%r15), %rcx + movq %rax, 48(%r15) + adcq 56(%r10), %rcx + movq 64(%r15), %r8 + movq %rcx, 56(%r15) + adcq 64(%r10), %r8 + movq 72(%r15), %rax + movq %r8, 64(%r15) + adcq 72(%r10), %rax + movq 80(%r15), %rcx + movq %rax, 72(%r15) + adcq 80(%r10), %rcx + movq 88(%r15), %r8 + movq %rcx, 80(%r15) + adcq 88(%r10), %r8 + movq 96(%r15), %rax + movq %r8, 88(%r15) + adcq 96(%r10), %rax + movq 104(%r15), %rcx + movq %rax, 96(%r15) + adcq 104(%r10), %rcx + movq 112(%r15), %r8 + movq %rcx, 104(%r15) + adcq 112(%r10), %r8 + movq 120(%r15), %rax + movq %r8, 112(%r15) + adcq 120(%r10), %rax + movq 128(%r15), %rcx + movq %rax, 120(%r15) + adcq 128(%r10), %rcx + movq 136(%r15), %r8 + movq %rcx, 128(%r15) + adcq 136(%r10), %r8 + movq 144(%r15), %rax + movq %r8, 136(%r15) + adcq 144(%r10), %rax + movq 152(%r15), %rcx + movq %rax, 144(%r15) + adcq 152(%r10), %rcx + movq 160(%r15), %r8 + movq %rcx, 152(%r15) + adcq 160(%r10), %r8 + movq 168(%r15), %rax + movq %r8, 160(%r15) + adcq 168(%r10), %rax + movq 176(%r15), %rcx + movq %rax, 168(%r15) + adcq 176(%r10), %rcx + movq 184(%r15), %r8 + movq %rcx, 176(%r15) + adcq 184(%r10), %r8 + movq 192(%r15), %rax + movq %r8, 184(%r15) + adcq 192(%r10), %rax + movq 200(%r15), %rcx + movq %rax, 192(%r15) + adcq 200(%r10), %rcx + movq 208(%r15), %r8 + movq %rcx, 200(%r15) + adcq 208(%r10), %r8 + movq 216(%r15), %rax + movq %r8, 208(%r15) + adcq 216(%r10), %rax + movq 224(%r15), %rcx + movq %rax, 216(%r15) + adcq 224(%r10), %rcx + movq 232(%r15), %r8 + movq %rcx, 224(%r15) + adcq 232(%r10), %r8 + movq 240(%r15), %rax + movq %r8, 232(%r15) + adcq 240(%r10), %rax + movq 248(%r15), %rcx + movq %rax, 240(%r15) + adcq 248(%r10), %rcx + movq %rcx, 248(%r15) + adcq $0x00, %r9 + movq %r9, 384(%rdi) + addq $0x80, %r15 + # Add + movq (%r15), %rax + xorq %r9, %r9 + addq (%r11), %rax + movq 8(%r15), %rcx + movq %rax, (%r15) + adcq 8(%r11), %rcx + movq 16(%r15), %r8 + movq %rcx, 8(%r15) + adcq 16(%r11), %r8 + movq 24(%r15), %rax + movq %r8, 16(%r15) + adcq 24(%r11), %rax + movq 32(%r15), %rcx + movq %rax, 24(%r15) + adcq 32(%r11), %rcx + movq 40(%r15), %r8 + movq %rcx, 32(%r15) + adcq 40(%r11), %r8 + movq 48(%r15), %rax + movq %r8, 40(%r15) + adcq 48(%r11), %rax + movq 56(%r15), %rcx + movq %rax, 48(%r15) + adcq 56(%r11), %rcx + movq 64(%r15), %r8 + movq %rcx, 56(%r15) + adcq 64(%r11), %r8 + movq 72(%r15), %rax + movq %r8, 64(%r15) + adcq 72(%r11), 
%rax + movq 80(%r15), %rcx + movq %rax, 72(%r15) + adcq 80(%r11), %rcx + movq 88(%r15), %r8 + movq %rcx, 80(%r15) + adcq 88(%r11), %r8 + movq 96(%r15), %rax + movq %r8, 88(%r15) + adcq 96(%r11), %rax + movq 104(%r15), %rcx + movq %rax, 96(%r15) + adcq 104(%r11), %rcx + movq 112(%r15), %r8 + movq %rcx, 104(%r15) + adcq 112(%r11), %r8 + movq 120(%r15), %rax + movq %r8, 112(%r15) + adcq 120(%r11), %rax + movq 128(%r15), %rcx + movq %rax, 120(%r15) + adcq 128(%r11), %rcx + movq %rcx, 128(%r15) + adcq $0x00, %r9 + # Add to zero + movq 136(%r11), %rax + adcq $0x00, %rax + movq 144(%r11), %rcx + movq %rax, 136(%r15) + adcq $0x00, %rcx + movq 152(%r11), %r8 + movq %rcx, 144(%r15) + adcq $0x00, %r8 + movq 160(%r11), %rax + movq %r8, 152(%r15) + adcq $0x00, %rax + movq 168(%r11), %rcx + movq %rax, 160(%r15) + adcq $0x00, %rcx + movq 176(%r11), %r8 + movq %rcx, 168(%r15) + adcq $0x00, %r8 + movq 184(%r11), %rax + movq %r8, 176(%r15) + adcq $0x00, %rax + movq 192(%r11), %rcx + movq %rax, 184(%r15) + adcq $0x00, %rcx + movq 200(%r11), %r8 + movq %rcx, 192(%r15) + adcq $0x00, %r8 + movq 208(%r11), %rax + movq %r8, 200(%r15) + adcq $0x00, %rax + movq 216(%r11), %rcx + movq %rax, 208(%r15) + adcq $0x00, %rcx + movq 224(%r11), %r8 + movq %rcx, 216(%r15) + adcq $0x00, %r8 + movq 232(%r11), %rax + movq %r8, 224(%r15) + adcq $0x00, %rax + movq 240(%r11), %rcx + movq %rax, 232(%r15) + adcq $0x00, %rcx + movq 248(%r11), %r8 + movq %rcx, 240(%r15) + adcq $0x00, %r8 + movq %r8, 248(%r15) + addq $0x328, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_2048_mul_avx2_32,.-sp_2048_mul_avx2_32 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +#ifndef __APPLE__ +.text +.globl sp_2048_sqr_16 +.type sp_2048_sqr_16,@function +.align 16 +sp_2048_sqr_16: +#else +.section __TEXT,__text +.globl _sp_2048_sqr_16 +.p2align 4 +_sp_2048_sqr_16: +#endif /* __APPLE__ */ + pushq %r12 + subq $0x80, %rsp + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + xorq %r9, %r9 + movq %rax, (%rsp) + movq %rdx, %r8 + # A[0] * A[1] + movq 8(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 8(%rsp) + # A[0] * A[2] + movq 16(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + movq %r9, 16(%rsp) + # A[0] * A[3] + movq 24(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + # A[1] * A[2] + movq 16(%rsi), %rax + mulq 8(%rsi) + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + movq %rcx, 24(%rsp) + # A[0] * A[4] + movq 32(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[1] * A[3] + movq 24(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 32(%rsp) + # A[0] * A[5] + movq 40(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[4] + movq 32(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[3] + movq 24(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 40(%rsp) + # A[0] * A[6] + movq 48(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[5] + movq 40(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[4] + movq 32(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 48(%rsp) + # A[0] * A[7] + movq 56(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[6] + movq 48(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[5] + movq 40(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[4] + movq 32(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 56(%rsp) + # A[0] * A[8] + movq 64(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[7] + movq 56(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[6] + movq 48(%rsi), %rax + mulq 
16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[5] + movq 40(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[4] + movq 32(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 64(%rsp) + # A[0] * A[9] + movq 72(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[8] + movq 64(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[7] + movq 56(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[6] + movq 48(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[5] + movq 40(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 72(%rsp) + # A[0] * A[10] + movq 80(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[9] + movq 72(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[8] + movq 64(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[7] + movq 56(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[6] + movq 48(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[5] + movq 40(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 80(%rsp) + # A[0] * A[11] + movq 88(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[10] + movq 80(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[9] + movq 72(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[8] + movq 64(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[7] + movq 56(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[6] + movq 48(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 88(%rsp) + # A[0] * A[12] + movq 96(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[11] + movq 88(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[10] + movq 80(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[9] + movq 72(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[8] + movq 64(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[7] + movq 56(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[6] + movq 48(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, 
%r9 + movq %rcx, 96(%rsp) + # A[0] * A[13] + movq 104(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[12] + movq 96(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[11] + movq 88(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[10] + movq 80(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[9] + movq 72(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[8] + movq 64(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[7] + movq 56(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 104(%rsp) + # A[0] * A[14] + movq 112(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[13] + movq 104(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[12] + movq 96(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[11] + movq 88(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[10] + movq 80(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[9] + movq 72(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[8] + movq 64(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[7] + movq 56(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 112(%rsp) + # A[0] * A[15] + movq 120(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[14] + movq 112(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[13] + movq 104(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[12] + movq 96(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[11] + movq 88(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[10] + movq 80(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[9] + movq 72(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[8] + movq 64(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 120(%rsp) + # A[1] * A[15] + movq 120(%rsi), %rax + mulq 8(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[2] * A[14] + movq 112(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[13] + movq 104(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[12] + movq 96(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[11] + movq 88(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * 
A[10] + movq 80(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[9] + movq 72(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[8] + movq 64(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 128(%rdi) + # A[2] * A[15] + movq 120(%rsi), %rax + mulq 16(%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[3] * A[14] + movq 112(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[13] + movq 104(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[12] + movq 96(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[11] + movq 88(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[10] + movq 80(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[9] + movq 72(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 136(%rdi) + # A[3] * A[15] + movq 120(%rsi), %rax + mulq 24(%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[4] * A[14] + movq 112(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[13] + movq 104(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[12] + movq 96(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[11] + movq 88(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[10] + movq 80(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[9] * A[9] + movq 72(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 144(%rdi) + # A[4] * A[15] + movq 120(%rsi), %rax + mulq 32(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[5] * A[14] + movq 112(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[13] + movq 104(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[12] + movq 96(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[11] + movq 88(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[9] * A[10] + movq 80(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 152(%rdi) + # A[5] * A[15] + movq 120(%rsi), %rax + mulq 40(%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[6] * A[14] + movq 112(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[13] + movq 104(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[12] + movq 96(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[9] * A[11] + 
movq 88(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[10] * A[10] + movq 80(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 160(%rdi) + # A[6] * A[15] + movq 120(%rsi), %rax + mulq 48(%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[7] * A[14] + movq 112(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[13] + movq 104(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[9] * A[12] + movq 96(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[10] * A[11] + movq 88(%rsi), %rax + mulq 80(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 168(%rdi) + # A[7] * A[15] + movq 120(%rsi), %rax + mulq 56(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[8] * A[14] + movq 112(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[9] * A[13] + movq 104(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[10] * A[12] + movq 96(%rsi), %rax + mulq 80(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[11] * A[11] + movq 88(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 176(%rdi) + # A[8] * A[15] + movq 120(%rsi), %rax + mulq 64(%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[9] * A[14] + movq 112(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[10] * A[13] + movq 104(%rsi), %rax + mulq 80(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[11] * A[12] + movq 96(%rsi), %rax + mulq 88(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 184(%rdi) + # A[9] * A[15] + movq 120(%rsi), %rax + mulq 72(%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[10] * A[14] + movq 112(%rsi), %rax + mulq 80(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[11] * A[13] + movq 104(%rsi), %rax + mulq 88(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[12] * A[12] + movq 96(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 192(%rdi) + # A[10] * A[15] + movq 120(%rsi), %rax + mulq 80(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[11] * A[14] + movq 112(%rsi), %rax + mulq 88(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[12] * A[13] + movq 104(%rsi), %rax + mulq 96(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 200(%rdi) + # A[11] * A[15] + movq 120(%rsi), %rax + mulq 88(%rsi) + xorq %r8, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 
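The column sums in this squaring routine use the usual symmetry shortcut: only products A[i]*A[j] with i <= j are formed, the off-diagonal ones are doubled (the addq/adcq triples that add the r10:r11:r12 accumulator to itself), and the diagonal A[i]*A[i] terms are added once. A generic C sketch of the same column-wise squaring (hypothetical helper, arbitrary word count; not part of the patch):

/* r[0..2n-1] = a[0..n-1]^2, forming each a[i]*a[j] once and doubling
 * the off-diagonal terms, as the assembly does per result column. */
#include <stdint.h>

static void sqr_n(uint64_t* r, const uint64_t* a, int n)
{
    int i, j, k, reps;
    for (k = 0; k < 2 * n; k++)
        r[k] = 0;
    for (i = 0; i < n; i++) {
        for (j = i; j < n; j++) {
            unsigned __int128 p = (unsigned __int128)a[i] * a[j];
            reps = (i == j) ? 1 : 2;              /* cross terms counted twice */
            while (reps--) {
                unsigned __int128 acc = p;
                for (k = i + j; acc != 0; k++) {  /* ripple the carry upward */
                    acc += r[k];
                    r[k] = (uint64_t)acc;
                    acc >>= 64;
                }
            }
        }
    }
}

Since the partial sums never exceed a^2 < 2^(128n), the carry never ripples past r[2n-1].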
+ # A[12] * A[14] + movq 112(%rsi), %rax + mulq 96(%rsi) + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + # A[13] * A[13] + movq 104(%rsi), %rax + mulq %rax + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + movq %r9, 208(%rdi) + # A[12] * A[15] + movq 120(%rsi), %rax + mulq 96(%rsi) + xorq %r9, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + # A[13] * A[14] + movq 112(%rsi), %rax + mulq 104(%rsi) + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + movq %rcx, 216(%rdi) + # A[13] * A[15] + movq 120(%rsi), %rax + mulq 104(%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[14] * A[14] + movq 112(%rsi), %rax + mulq %rax + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 224(%rdi) + # A[14] * A[15] + movq 120(%rsi), %rax + mulq 112(%rsi) + xorq %r8, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + movq %r9, 232(%rdi) + # A[15] * A[15] + movq 120(%rsi), %rax + mulq %rax + addq %rax, %rcx + adcq %rdx, %r8 + movq %rcx, 240(%rdi) + movq %r8, 248(%rdi) + movq (%rsp), %rax + movq 8(%rsp), %rdx + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq %rax, (%rdi) + movq %rdx, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 32(%rsp), %rax + movq 40(%rsp), %rdx + movq 48(%rsp), %r10 + movq 56(%rsp), %r11 + movq %rax, 32(%rdi) + movq %rdx, 40(%rdi) + movq %r10, 48(%rdi) + movq %r11, 56(%rdi) + movq 64(%rsp), %rax + movq 72(%rsp), %rdx + movq 80(%rsp), %r10 + movq 88(%rsp), %r11 + movq %rax, 64(%rdi) + movq %rdx, 72(%rdi) + movq %r10, 80(%rdi) + movq %r11, 88(%rdi) + movq 96(%rsp), %rax + movq 104(%rsp), %rdx + movq 112(%rsp), %r10 + movq 120(%rsp), %r11 + movq %rax, 96(%rdi) + movq %rdx, 104(%rdi) + movq %r10, 112(%rdi) + movq %r11, 120(%rdi) + addq $0x80, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_2048_sqr_16,.-sp_2048_sqr_16 +#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) * @@ -5899,1045 +7618,6 @@ L_end_2048_sqr_avx2_16: .size sp_2048_sqr_avx2_16,.-sp_2048_sqr_avx2_16 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_2048_add_16 -.type sp_2048_add_16,@function -.align 16 -sp_2048_add_16: -#else -.section __TEXT,__text -.globl _sp_2048_add_16 -.p2align 4 -_sp_2048_add_16: -#endif /* __APPLE__ */ - # Add - movq (%rsi), %rcx - xorq %rax, %rax - addq (%rdx), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - adcq 8(%rdx), %r8 - movq 16(%rsi), %rcx - movq %r8, 8(%rdi) - adcq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - adcq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - adcq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - adcq 40(%rdx), %r8 - movq 48(%rsi), %rcx - movq %r8, 40(%rdi) - adcq 48(%rdx), %rcx - movq 56(%rsi), %r8 - movq %rcx, 48(%rdi) - adcq 56(%rdx), %r8 - movq 64(%rsi), %rcx - movq %r8, 56(%rdi) - adcq 64(%rdx), %rcx - movq 72(%rsi), %r8 - movq %rcx, 64(%rdi) - adcq 72(%rdx), %r8 - movq 80(%rsi), %rcx - movq %r8, 72(%rdi) - adcq 80(%rdx), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%rdi) - adcq 88(%rdx), %r8 - movq 96(%rsi), %rcx - movq %r8, 88(%rdi) - adcq 96(%rdx), %rcx - movq 104(%rsi), %r8 - movq %rcx, 96(%rdi) - adcq 104(%rdx), %r8 - movq 112(%rsi), %rcx - movq %r8, 104(%rdi) - adcq 112(%rdx), %rcx - movq 120(%rsi), %r8 - movq %rcx, 112(%rdi) - adcq 120(%rdx), %r8 - movq %r8, 120(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_add_16,.-sp_2048_add_16 -#endif /* __APPLE__ */ -/* Sub b from a into a. (a -= b) - * - * a A single precision integer and result. - * b A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_sub_in_place_32 -.type sp_2048_sub_in_place_32,@function -.align 16 -sp_2048_sub_in_place_32: -#else -.section __TEXT,__text -.globl _sp_2048_sub_in_place_32 -.p2align 4 -_sp_2048_sub_in_place_32: -#endif /* __APPLE__ */ - movq (%rdi), %rdx - xorq %rax, %rax - subq (%rsi), %rdx - movq 8(%rdi), %rcx - movq %rdx, (%rdi) - sbbq 8(%rsi), %rcx - movq 16(%rdi), %rdx - movq %rcx, 8(%rdi) - sbbq 16(%rsi), %rdx - movq 24(%rdi), %rcx - movq %rdx, 16(%rdi) - sbbq 24(%rsi), %rcx - movq 32(%rdi), %rdx - movq %rcx, 24(%rdi) - sbbq 32(%rsi), %rdx - movq 40(%rdi), %rcx - movq %rdx, 32(%rdi) - sbbq 40(%rsi), %rcx - movq 48(%rdi), %rdx - movq %rcx, 40(%rdi) - sbbq 48(%rsi), %rdx - movq 56(%rdi), %rcx - movq %rdx, 48(%rdi) - sbbq 56(%rsi), %rcx - movq 64(%rdi), %rdx - movq %rcx, 56(%rdi) - sbbq 64(%rsi), %rdx - movq 72(%rdi), %rcx - movq %rdx, 64(%rdi) - sbbq 72(%rsi), %rcx - movq 80(%rdi), %rdx - movq %rcx, 72(%rdi) - sbbq 80(%rsi), %rdx - movq 88(%rdi), %rcx - movq %rdx, 80(%rdi) - sbbq 88(%rsi), %rcx - movq 96(%rdi), %rdx - movq %rcx, 88(%rdi) - sbbq 96(%rsi), %rdx - movq 104(%rdi), %rcx - movq %rdx, 96(%rdi) - sbbq 104(%rsi), %rcx - movq 112(%rdi), %rdx - movq %rcx, 104(%rdi) - sbbq 112(%rsi), %rdx - movq 120(%rdi), %rcx - movq %rdx, 112(%rdi) - sbbq 120(%rsi), %rcx - movq 128(%rdi), %rdx - movq %rcx, 120(%rdi) - sbbq 128(%rsi), %rdx - movq 136(%rdi), %rcx - movq %rdx, 128(%rdi) - sbbq 136(%rsi), %rcx - movq 144(%rdi), %rdx - movq %rcx, 136(%rdi) - sbbq 144(%rsi), %rdx - movq 152(%rdi), %rcx - movq %rdx, 144(%rdi) - sbbq 152(%rsi), %rcx - movq 160(%rdi), %rdx - movq %rcx, 152(%rdi) - sbbq 160(%rsi), %rdx - movq 168(%rdi), %rcx - movq %rdx, 160(%rdi) - sbbq 168(%rsi), %rcx - movq 176(%rdi), %rdx - movq %rcx, 168(%rdi) - sbbq 176(%rsi), %rdx - movq 184(%rdi), %rcx - movq %rdx, 176(%rdi) - sbbq 184(%rsi), %rcx - movq 192(%rdi), %rdx - movq %rcx, 184(%rdi) - sbbq 192(%rsi), %rdx - movq 200(%rdi), %rcx - movq %rdx, 192(%rdi) - sbbq 200(%rsi), %rcx - movq 208(%rdi), %rdx - movq 
%rcx, 200(%rdi) - sbbq 208(%rsi), %rdx - movq 216(%rdi), %rcx - movq %rdx, 208(%rdi) - sbbq 216(%rsi), %rcx - movq 224(%rdi), %rdx - movq %rcx, 216(%rdi) - sbbq 224(%rsi), %rdx - movq 232(%rdi), %rcx - movq %rdx, 224(%rdi) - sbbq 232(%rsi), %rcx - movq 240(%rdi), %rdx - movq %rcx, 232(%rdi) - sbbq 240(%rsi), %rdx - movq 248(%rdi), %rcx - movq %rdx, 240(%rdi) - sbbq 248(%rsi), %rcx - movq %rcx, 248(%rdi) - sbbq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_sub_in_place_32,.-sp_2048_sub_in_place_32 -#endif /* __APPLE__ */ -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_add_32 -.type sp_2048_add_32,@function -.align 16 -sp_2048_add_32: -#else -.section __TEXT,__text -.globl _sp_2048_add_32 -.p2align 4 -_sp_2048_add_32: -#endif /* __APPLE__ */ - # Add - movq (%rsi), %rcx - xorq %rax, %rax - addq (%rdx), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - adcq 8(%rdx), %r8 - movq 16(%rsi), %rcx - movq %r8, 8(%rdi) - adcq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - adcq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - adcq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - adcq 40(%rdx), %r8 - movq 48(%rsi), %rcx - movq %r8, 40(%rdi) - adcq 48(%rdx), %rcx - movq 56(%rsi), %r8 - movq %rcx, 48(%rdi) - adcq 56(%rdx), %r8 - movq 64(%rsi), %rcx - movq %r8, 56(%rdi) - adcq 64(%rdx), %rcx - movq 72(%rsi), %r8 - movq %rcx, 64(%rdi) - adcq 72(%rdx), %r8 - movq 80(%rsi), %rcx - movq %r8, 72(%rdi) - adcq 80(%rdx), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%rdi) - adcq 88(%rdx), %r8 - movq 96(%rsi), %rcx - movq %r8, 88(%rdi) - adcq 96(%rdx), %rcx - movq 104(%rsi), %r8 - movq %rcx, 96(%rdi) - adcq 104(%rdx), %r8 - movq 112(%rsi), %rcx - movq %r8, 104(%rdi) - adcq 112(%rdx), %rcx - movq 120(%rsi), %r8 - movq %rcx, 112(%rdi) - adcq 120(%rdx), %r8 - movq 128(%rsi), %rcx - movq %r8, 120(%rdi) - adcq 128(%rdx), %rcx - movq 136(%rsi), %r8 - movq %rcx, 128(%rdi) - adcq 136(%rdx), %r8 - movq 144(%rsi), %rcx - movq %r8, 136(%rdi) - adcq 144(%rdx), %rcx - movq 152(%rsi), %r8 - movq %rcx, 144(%rdi) - adcq 152(%rdx), %r8 - movq 160(%rsi), %rcx - movq %r8, 152(%rdi) - adcq 160(%rdx), %rcx - movq 168(%rsi), %r8 - movq %rcx, 160(%rdi) - adcq 168(%rdx), %r8 - movq 176(%rsi), %rcx - movq %r8, 168(%rdi) - adcq 176(%rdx), %rcx - movq 184(%rsi), %r8 - movq %rcx, 176(%rdi) - adcq 184(%rdx), %r8 - movq 192(%rsi), %rcx - movq %r8, 184(%rdi) - adcq 192(%rdx), %rcx - movq 200(%rsi), %r8 - movq %rcx, 192(%rdi) - adcq 200(%rdx), %r8 - movq 208(%rsi), %rcx - movq %r8, 200(%rdi) - adcq 208(%rdx), %rcx - movq 216(%rsi), %r8 - movq %rcx, 208(%rdi) - adcq 216(%rdx), %r8 - movq 224(%rsi), %rcx - movq %r8, 216(%rdi) - adcq 224(%rdx), %rcx - movq 232(%rsi), %r8 - movq %rcx, 224(%rdi) - adcq 232(%rdx), %r8 - movq 240(%rsi), %rcx - movq %r8, 232(%rdi) - adcq 240(%rdx), %rcx - movq 248(%rsi), %r8 - movq %rcx, 240(%rdi) - adcq 248(%rdx), %r8 - movq %r8, 248(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_add_32,.-sp_2048_add_32 -#endif /* __APPLE__ */ -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_2048_mul_32 -.type sp_2048_mul_32,@function -.align 16 -sp_2048_mul_32: -#else -.section __TEXT,__text -.globl _sp_2048_mul_32 -.p2align 4 -_sp_2048_mul_32: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x328, %rsp - movq %rdi, 768(%rsp) - movq %rsi, 776(%rsp) - movq %rdx, 784(%rsp) - leaq 512(%rsp), %r10 - leaq 128(%rsi), %r12 - # Add - movq (%rsi), %rax - xorq %r13, %r13 - addq (%r12), %rax - movq 8(%rsi), %rcx - movq %rax, (%r10) - adcq 8(%r12), %rcx - movq 16(%rsi), %r8 - movq %rcx, 8(%r10) - adcq 16(%r12), %r8 - movq 24(%rsi), %rax - movq %r8, 16(%r10) - adcq 24(%r12), %rax - movq 32(%rsi), %rcx - movq %rax, 24(%r10) - adcq 32(%r12), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%r10) - adcq 40(%r12), %r8 - movq 48(%rsi), %rax - movq %r8, 40(%r10) - adcq 48(%r12), %rax - movq 56(%rsi), %rcx - movq %rax, 48(%r10) - adcq 56(%r12), %rcx - movq 64(%rsi), %r8 - movq %rcx, 56(%r10) - adcq 64(%r12), %r8 - movq 72(%rsi), %rax - movq %r8, 64(%r10) - adcq 72(%r12), %rax - movq 80(%rsi), %rcx - movq %rax, 72(%r10) - adcq 80(%r12), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%r10) - adcq 88(%r12), %r8 - movq 96(%rsi), %rax - movq %r8, 88(%r10) - adcq 96(%r12), %rax - movq 104(%rsi), %rcx - movq %rax, 96(%r10) - adcq 104(%r12), %rcx - movq 112(%rsi), %r8 - movq %rcx, 104(%r10) - adcq 112(%r12), %r8 - movq 120(%rsi), %rax - movq %r8, 112(%r10) - adcq 120(%r12), %rax - movq %rax, 120(%r10) - adcq $0x00, %r13 - movq %r13, 792(%rsp) - leaq 640(%rsp), %r11 - leaq 128(%rdx), %r12 - # Add - movq (%rdx), %rax - xorq %r14, %r14 - addq (%r12), %rax - movq 8(%rdx), %rcx - movq %rax, (%r11) - adcq 8(%r12), %rcx - movq 16(%rdx), %r8 - movq %rcx, 8(%r11) - adcq 16(%r12), %r8 - movq 24(%rdx), %rax - movq %r8, 16(%r11) - adcq 24(%r12), %rax - movq 32(%rdx), %rcx - movq %rax, 24(%r11) - adcq 32(%r12), %rcx - movq 40(%rdx), %r8 - movq %rcx, 32(%r11) - adcq 40(%r12), %r8 - movq 48(%rdx), %rax - movq %r8, 40(%r11) - adcq 48(%r12), %rax - movq 56(%rdx), %rcx - movq %rax, 48(%r11) - adcq 56(%r12), %rcx - movq 64(%rdx), %r8 - movq %rcx, 56(%r11) - adcq 64(%r12), %r8 - movq 72(%rdx), %rax - movq %r8, 64(%r11) - adcq 72(%r12), %rax - movq 80(%rdx), %rcx - movq %rax, 72(%r11) - adcq 80(%r12), %rcx - movq 88(%rdx), %r8 - movq %rcx, 80(%r11) - adcq 88(%r12), %r8 - movq 96(%rdx), %rax - movq %r8, 88(%r11) - adcq 96(%r12), %rax - movq 104(%rdx), %rcx - movq %rax, 96(%r11) - adcq 104(%r12), %rcx - movq 112(%rdx), %r8 - movq %rcx, 104(%r11) - adcq 112(%r12), %r8 - movq 120(%rdx), %rax - movq %r8, 112(%r11) - adcq 120(%r12), %rax - movq %rax, 120(%r11) - adcq $0x00, %r14 - movq %r14, 800(%rsp) - movq %r11, %rdx - movq %r10, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_2048_mul_16@plt -#else - callq _sp_2048_mul_16 -#endif /* __APPLE__ */ - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - leaq 256(%rsp), %rdi - addq $0x80, %rdx - addq $0x80, %rsi -#ifndef __APPLE__ - callq sp_2048_mul_16@plt -#else - callq _sp_2048_mul_16 -#endif /* __APPLE__ */ - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - movq 768(%rsp), %rdi -#ifndef __APPLE__ - callq sp_2048_mul_16@plt -#else - callq _sp_2048_mul_16 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - movq 768(%rsp), %rdi -#endif /* _WIN64 */ - movq 792(%rsp), %r13 - movq 800(%rsp), %r14 - movq 768(%rsp), %r15 - movq %r13, %r9 - leaq 512(%rsp), %r10 - leaq 640(%rsp), %r11 - andq %r14, %r9 - negq %r13 - negq %r14 - addq $0x100, %r15 - movq (%r10), %rax - movq (%r11), %rcx - andq 
%r14, %rax - andq %r13, %rcx - movq %rax, (%r10) - movq %rcx, (%r11) - movq 8(%r10), %rax - movq 8(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 8(%r10) - movq %rcx, 8(%r11) - movq 16(%r10), %rax - movq 16(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 16(%r10) - movq %rcx, 16(%r11) - movq 24(%r10), %rax - movq 24(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 24(%r10) - movq %rcx, 24(%r11) - movq 32(%r10), %rax - movq 32(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 32(%r10) - movq %rcx, 32(%r11) - movq 40(%r10), %rax - movq 40(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 40(%r10) - movq %rcx, 40(%r11) - movq 48(%r10), %rax - movq 48(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 48(%r10) - movq %rcx, 48(%r11) - movq 56(%r10), %rax - movq 56(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 56(%r10) - movq %rcx, 56(%r11) - movq 64(%r10), %rax - movq 64(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 64(%r10) - movq %rcx, 64(%r11) - movq 72(%r10), %rax - movq 72(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 72(%r10) - movq %rcx, 72(%r11) - movq 80(%r10), %rax - movq 80(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 80(%r10) - movq %rcx, 80(%r11) - movq 88(%r10), %rax - movq 88(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 88(%r10) - movq %rcx, 88(%r11) - movq 96(%r10), %rax - movq 96(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 96(%r10) - movq %rcx, 96(%r11) - movq 104(%r10), %rax - movq 104(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 104(%r10) - movq %rcx, 104(%r11) - movq 112(%r10), %rax - movq 112(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 112(%r10) - movq %rcx, 112(%r11) - movq 120(%r10), %rax - movq 120(%r11), %rcx - andq %r14, %rax - andq %r13, %rcx - movq %rax, 120(%r10) - movq %rcx, 120(%r11) - movq (%r10), %rax - addq (%r11), %rax - movq 8(%r10), %rcx - movq %rax, (%r15) - adcq 8(%r11), %rcx - movq 16(%r10), %r8 - movq %rcx, 8(%r15) - adcq 16(%r11), %r8 - movq 24(%r10), %rax - movq %r8, 16(%r15) - adcq 24(%r11), %rax - movq 32(%r10), %rcx - movq %rax, 24(%r15) - adcq 32(%r11), %rcx - movq 40(%r10), %r8 - movq %rcx, 32(%r15) - adcq 40(%r11), %r8 - movq 48(%r10), %rax - movq %r8, 40(%r15) - adcq 48(%r11), %rax - movq 56(%r10), %rcx - movq %rax, 48(%r15) - adcq 56(%r11), %rcx - movq 64(%r10), %r8 - movq %rcx, 56(%r15) - adcq 64(%r11), %r8 - movq 72(%r10), %rax - movq %r8, 64(%r15) - adcq 72(%r11), %rax - movq 80(%r10), %rcx - movq %rax, 72(%r15) - adcq 80(%r11), %rcx - movq 88(%r10), %r8 - movq %rcx, 80(%r15) - adcq 88(%r11), %r8 - movq 96(%r10), %rax - movq %r8, 88(%r15) - adcq 96(%r11), %rax - movq 104(%r10), %rcx - movq %rax, 96(%r15) - adcq 104(%r11), %rcx - movq 112(%r10), %r8 - movq %rcx, 104(%r15) - adcq 112(%r11), %r8 - movq 120(%r10), %rax - movq %r8, 112(%r15) - adcq 120(%r11), %rax - movq %rax, 120(%r15) - adcq $0x00, %r9 - leaq 256(%rsp), %r11 - movq %rsp, %r10 - movq (%r10), %rax - subq (%r11), %rax - movq 8(%r10), %rcx - movq %rax, (%r10) - sbbq 8(%r11), %rcx - movq 16(%r10), %r8 - movq %rcx, 8(%r10) - sbbq 16(%r11), %r8 - movq 24(%r10), %rax - movq %r8, 16(%r10) - sbbq 24(%r11), %rax - movq 32(%r10), %rcx - movq %rax, 24(%r10) - sbbq 32(%r11), %rcx - movq 40(%r10), %r8 - movq %rcx, 32(%r10) - sbbq 40(%r11), %r8 - movq 48(%r10), %rax - movq %r8, 40(%r10) - sbbq 48(%r11), %rax - movq 56(%r10), %rcx - movq %rax, 48(%r10) - sbbq 56(%r11), %rcx - movq 64(%r10), %r8 - 
movq %rcx, 56(%r10) - sbbq 64(%r11), %r8 - movq 72(%r10), %rax - movq %r8, 64(%r10) - sbbq 72(%r11), %rax - movq 80(%r10), %rcx - movq %rax, 72(%r10) - sbbq 80(%r11), %rcx - movq 88(%r10), %r8 - movq %rcx, 80(%r10) - sbbq 88(%r11), %r8 - movq 96(%r10), %rax - movq %r8, 88(%r10) - sbbq 96(%r11), %rax - movq 104(%r10), %rcx - movq %rax, 96(%r10) - sbbq 104(%r11), %rcx - movq 112(%r10), %r8 - movq %rcx, 104(%r10) - sbbq 112(%r11), %r8 - movq 120(%r10), %rax - movq %r8, 112(%r10) - sbbq 120(%r11), %rax - movq 128(%r10), %rcx - movq %rax, 120(%r10) - sbbq 128(%r11), %rcx - movq 136(%r10), %r8 - movq %rcx, 128(%r10) - sbbq 136(%r11), %r8 - movq 144(%r10), %rax - movq %r8, 136(%r10) - sbbq 144(%r11), %rax - movq 152(%r10), %rcx - movq %rax, 144(%r10) - sbbq 152(%r11), %rcx - movq 160(%r10), %r8 - movq %rcx, 152(%r10) - sbbq 160(%r11), %r8 - movq 168(%r10), %rax - movq %r8, 160(%r10) - sbbq 168(%r11), %rax - movq 176(%r10), %rcx - movq %rax, 168(%r10) - sbbq 176(%r11), %rcx - movq 184(%r10), %r8 - movq %rcx, 176(%r10) - sbbq 184(%r11), %r8 - movq 192(%r10), %rax - movq %r8, 184(%r10) - sbbq 192(%r11), %rax - movq 200(%r10), %rcx - movq %rax, 192(%r10) - sbbq 200(%r11), %rcx - movq 208(%r10), %r8 - movq %rcx, 200(%r10) - sbbq 208(%r11), %r8 - movq 216(%r10), %rax - movq %r8, 208(%r10) - sbbq 216(%r11), %rax - movq 224(%r10), %rcx - movq %rax, 216(%r10) - sbbq 224(%r11), %rcx - movq 232(%r10), %r8 - movq %rcx, 224(%r10) - sbbq 232(%r11), %r8 - movq 240(%r10), %rax - movq %r8, 232(%r10) - sbbq 240(%r11), %rax - movq 248(%r10), %rcx - movq %rax, 240(%r10) - sbbq 248(%r11), %rcx - movq %rcx, 248(%r10) - sbbq $0x00, %r9 - movq (%r10), %rax - subq (%rdi), %rax - movq 8(%r10), %rcx - movq %rax, (%r10) - sbbq 8(%rdi), %rcx - movq 16(%r10), %r8 - movq %rcx, 8(%r10) - sbbq 16(%rdi), %r8 - movq 24(%r10), %rax - movq %r8, 16(%r10) - sbbq 24(%rdi), %rax - movq 32(%r10), %rcx - movq %rax, 24(%r10) - sbbq 32(%rdi), %rcx - movq 40(%r10), %r8 - movq %rcx, 32(%r10) - sbbq 40(%rdi), %r8 - movq 48(%r10), %rax - movq %r8, 40(%r10) - sbbq 48(%rdi), %rax - movq 56(%r10), %rcx - movq %rax, 48(%r10) - sbbq 56(%rdi), %rcx - movq 64(%r10), %r8 - movq %rcx, 56(%r10) - sbbq 64(%rdi), %r8 - movq 72(%r10), %rax - movq %r8, 64(%r10) - sbbq 72(%rdi), %rax - movq 80(%r10), %rcx - movq %rax, 72(%r10) - sbbq 80(%rdi), %rcx - movq 88(%r10), %r8 - movq %rcx, 80(%r10) - sbbq 88(%rdi), %r8 - movq 96(%r10), %rax - movq %r8, 88(%r10) - sbbq 96(%rdi), %rax - movq 104(%r10), %rcx - movq %rax, 96(%r10) - sbbq 104(%rdi), %rcx - movq 112(%r10), %r8 - movq %rcx, 104(%r10) - sbbq 112(%rdi), %r8 - movq 120(%r10), %rax - movq %r8, 112(%r10) - sbbq 120(%rdi), %rax - movq 128(%r10), %rcx - movq %rax, 120(%r10) - sbbq 128(%rdi), %rcx - movq 136(%r10), %r8 - movq %rcx, 128(%r10) - sbbq 136(%rdi), %r8 - movq 144(%r10), %rax - movq %r8, 136(%r10) - sbbq 144(%rdi), %rax - movq 152(%r10), %rcx - movq %rax, 144(%r10) - sbbq 152(%rdi), %rcx - movq 160(%r10), %r8 - movq %rcx, 152(%r10) - sbbq 160(%rdi), %r8 - movq 168(%r10), %rax - movq %r8, 160(%r10) - sbbq 168(%rdi), %rax - movq 176(%r10), %rcx - movq %rax, 168(%r10) - sbbq 176(%rdi), %rcx - movq 184(%r10), %r8 - movq %rcx, 176(%r10) - sbbq 184(%rdi), %r8 - movq 192(%r10), %rax - movq %r8, 184(%r10) - sbbq 192(%rdi), %rax - movq 200(%r10), %rcx - movq %rax, 192(%r10) - sbbq 200(%rdi), %rcx - movq 208(%r10), %r8 - movq %rcx, 200(%r10) - sbbq 208(%rdi), %r8 - movq 216(%r10), %rax - movq %r8, 208(%r10) - sbbq 216(%rdi), %rax - movq 224(%r10), %rcx - movq %rax, 216(%r10) - sbbq 224(%rdi), %rcx - movq 232(%r10), 
%r8 - movq %rcx, 224(%r10) - sbbq 232(%rdi), %r8 - movq 240(%r10), %rax - movq %r8, 232(%r10) - sbbq 240(%rdi), %rax - movq 248(%r10), %rcx - movq %rax, 240(%r10) - sbbq 248(%rdi), %rcx - movq %rcx, 248(%r10) - sbbq $0x00, %r9 - subq $0x80, %r15 - # Add - movq (%r15), %rax - addq (%r10), %rax - movq 8(%r15), %rcx - movq %rax, (%r15) - adcq 8(%r10), %rcx - movq 16(%r15), %r8 - movq %rcx, 8(%r15) - adcq 16(%r10), %r8 - movq 24(%r15), %rax - movq %r8, 16(%r15) - adcq 24(%r10), %rax - movq 32(%r15), %rcx - movq %rax, 24(%r15) - adcq 32(%r10), %rcx - movq 40(%r15), %r8 - movq %rcx, 32(%r15) - adcq 40(%r10), %r8 - movq 48(%r15), %rax - movq %r8, 40(%r15) - adcq 48(%r10), %rax - movq 56(%r15), %rcx - movq %rax, 48(%r15) - adcq 56(%r10), %rcx - movq 64(%r15), %r8 - movq %rcx, 56(%r15) - adcq 64(%r10), %r8 - movq 72(%r15), %rax - movq %r8, 64(%r15) - adcq 72(%r10), %rax - movq 80(%r15), %rcx - movq %rax, 72(%r15) - adcq 80(%r10), %rcx - movq 88(%r15), %r8 - movq %rcx, 80(%r15) - adcq 88(%r10), %r8 - movq 96(%r15), %rax - movq %r8, 88(%r15) - adcq 96(%r10), %rax - movq 104(%r15), %rcx - movq %rax, 96(%r15) - adcq 104(%r10), %rcx - movq 112(%r15), %r8 - movq %rcx, 104(%r15) - adcq 112(%r10), %r8 - movq 120(%r15), %rax - movq %r8, 112(%r15) - adcq 120(%r10), %rax - movq 128(%r15), %rcx - movq %rax, 120(%r15) - adcq 128(%r10), %rcx - movq 136(%r15), %r8 - movq %rcx, 128(%r15) - adcq 136(%r10), %r8 - movq 144(%r15), %rax - movq %r8, 136(%r15) - adcq 144(%r10), %rax - movq 152(%r15), %rcx - movq %rax, 144(%r15) - adcq 152(%r10), %rcx - movq 160(%r15), %r8 - movq %rcx, 152(%r15) - adcq 160(%r10), %r8 - movq 168(%r15), %rax - movq %r8, 160(%r15) - adcq 168(%r10), %rax - movq 176(%r15), %rcx - movq %rax, 168(%r15) - adcq 176(%r10), %rcx - movq 184(%r15), %r8 - movq %rcx, 176(%r15) - adcq 184(%r10), %r8 - movq 192(%r15), %rax - movq %r8, 184(%r15) - adcq 192(%r10), %rax - movq 200(%r15), %rcx - movq %rax, 192(%r15) - adcq 200(%r10), %rcx - movq 208(%r15), %r8 - movq %rcx, 200(%r15) - adcq 208(%r10), %r8 - movq 216(%r15), %rax - movq %r8, 208(%r15) - adcq 216(%r10), %rax - movq 224(%r15), %rcx - movq %rax, 216(%r15) - adcq 224(%r10), %rcx - movq 232(%r15), %r8 - movq %rcx, 224(%r15) - adcq 232(%r10), %r8 - movq 240(%r15), %rax - movq %r8, 232(%r15) - adcq 240(%r10), %rax - movq 248(%r15), %rcx - movq %rax, 240(%r15) - adcq 248(%r10), %rcx - movq %rcx, 248(%r15) - adcq $0x00, %r9 - movq %r9, 384(%rdi) - addq $0x80, %r15 - # Add - movq (%r15), %rax - xorq %r9, %r9 - addq (%r11), %rax - movq 8(%r15), %rcx - movq %rax, (%r15) - adcq 8(%r11), %rcx - movq 16(%r15), %r8 - movq %rcx, 8(%r15) - adcq 16(%r11), %r8 - movq 24(%r15), %rax - movq %r8, 16(%r15) - adcq 24(%r11), %rax - movq 32(%r15), %rcx - movq %rax, 24(%r15) - adcq 32(%r11), %rcx - movq 40(%r15), %r8 - movq %rcx, 32(%r15) - adcq 40(%r11), %r8 - movq 48(%r15), %rax - movq %r8, 40(%r15) - adcq 48(%r11), %rax - movq 56(%r15), %rcx - movq %rax, 48(%r15) - adcq 56(%r11), %rcx - movq 64(%r15), %r8 - movq %rcx, 56(%r15) - adcq 64(%r11), %r8 - movq 72(%r15), %rax - movq %r8, 64(%r15) - adcq 72(%r11), %rax - movq 80(%r15), %rcx - movq %rax, 72(%r15) - adcq 80(%r11), %rcx - movq 88(%r15), %r8 - movq %rcx, 80(%r15) - adcq 88(%r11), %r8 - movq 96(%r15), %rax - movq %r8, 88(%r15) - adcq 96(%r11), %rax - movq 104(%r15), %rcx - movq %rax, 96(%r15) - adcq 104(%r11), %rcx - movq 112(%r15), %r8 - movq %rcx, 104(%r15) - adcq 112(%r11), %r8 - movq 120(%r15), %rax - movq %r8, 112(%r15) - adcq 120(%r11), %rax - movq 128(%r15), %rcx - movq %rax, 120(%r15) - adcq 128(%r11), %rcx 
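The "# Add to zero" blocks (in both the added and the removed versions of these routines) only have to ripple a single carry through the untouched high words of z2, hence the runs of adcq $0x00. A minimal C equivalent (hypothetical helper name, sketch only):

/* Copy the high words into r while propagating a single incoming carry. */
#include <stdint.h>

static void add_carry_n(uint64_t* r, const uint64_t* hi, uint64_t carry, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        uint64_t w = hi[i] + carry;
        carry = (w < carry);   /* carry survives only while words wrap to 0 */
        r[i] = w;
    }
}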
- movq %rcx, 128(%r15) - adcq $0x00, %r9 - # Add to zero - movq 136(%r11), %rax - adcq $0x00, %rax - movq 144(%r11), %rcx - movq %rax, 136(%r15) - adcq $0x00, %rcx - movq 152(%r11), %r8 - movq %rcx, 144(%r15) - adcq $0x00, %r8 - movq 160(%r11), %rax - movq %r8, 152(%r15) - adcq $0x00, %rax - movq 168(%r11), %rcx - movq %rax, 160(%r15) - adcq $0x00, %rcx - movq 176(%r11), %r8 - movq %rcx, 168(%r15) - adcq $0x00, %r8 - movq 184(%r11), %rax - movq %r8, 176(%r15) - adcq $0x00, %rax - movq 192(%r11), %rcx - movq %rax, 184(%r15) - adcq $0x00, %rcx - movq 200(%r11), %r8 - movq %rcx, 192(%r15) - adcq $0x00, %r8 - movq 208(%r11), %rax - movq %r8, 200(%r15) - adcq $0x00, %rax - movq 216(%r11), %rcx - movq %rax, 208(%r15) - adcq $0x00, %rcx - movq 224(%r11), %r8 - movq %rcx, 216(%r15) - adcq $0x00, %r8 - movq 232(%r11), %rax - movq %r8, 224(%r15) - adcq $0x00, %rax - movq 240(%r11), %rcx - movq %rax, 232(%r15) - adcq $0x00, %rcx - movq 248(%r11), %r8 - movq %rcx, 240(%r15) - adcq $0x00, %r8 - movq %r8, 248(%r15) - addq $0x328, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_2048_mul_32,.-sp_2048_mul_32 -#endif /* __APPLE__ */ /* Add a to a into r. (r = a + a) * * r A single precision integer. @@ -7611,686 +8291,6 @@ _sp_2048_sqr_32: .size sp_2048_sqr_32,.-sp_2048_sqr_32 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_mul_avx2_32 -.type sp_2048_mul_avx2_32,@function -.align 16 -sp_2048_mul_avx2_32: -#else -.section __TEXT,__text -.globl _sp_2048_mul_avx2_32 -.p2align 4 -_sp_2048_mul_avx2_32: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x328, %rsp - movq %rdi, 768(%rsp) - movq %rsi, 776(%rsp) - movq %rdx, 784(%rsp) - leaq 512(%rsp), %r10 - leaq 128(%rsi), %r12 - # Add - movq (%rsi), %rax - xorq %r13, %r13 - addq (%r12), %rax - movq 8(%rsi), %rcx - movq %rax, (%r10) - adcq 8(%r12), %rcx - movq 16(%rsi), %r8 - movq %rcx, 8(%r10) - adcq 16(%r12), %r8 - movq 24(%rsi), %rax - movq %r8, 16(%r10) - adcq 24(%r12), %rax - movq 32(%rsi), %rcx - movq %rax, 24(%r10) - adcq 32(%r12), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%r10) - adcq 40(%r12), %r8 - movq 48(%rsi), %rax - movq %r8, 40(%r10) - adcq 48(%r12), %rax - movq 56(%rsi), %rcx - movq %rax, 48(%r10) - adcq 56(%r12), %rcx - movq 64(%rsi), %r8 - movq %rcx, 56(%r10) - adcq 64(%r12), %r8 - movq 72(%rsi), %rax - movq %r8, 64(%r10) - adcq 72(%r12), %rax - movq 80(%rsi), %rcx - movq %rax, 72(%r10) - adcq 80(%r12), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%r10) - adcq 88(%r12), %r8 - movq 96(%rsi), %rax - movq %r8, 88(%r10) - adcq 96(%r12), %rax - movq 104(%rsi), %rcx - movq %rax, 96(%r10) - adcq 104(%r12), %rcx - movq 112(%rsi), %r8 - movq %rcx, 104(%r10) - adcq 112(%r12), %r8 - movq 120(%rsi), %rax - movq %r8, 112(%r10) - adcq 120(%r12), %rax - movq %rax, 120(%r10) - adcq $0x00, %r13 - movq %r13, 792(%rsp) - leaq 640(%rsp), %r11 - leaq 128(%rdx), %r12 - # Add - movq (%rdx), %rax - xorq %r14, %r14 - addq (%r12), %rax - movq 8(%rdx), %rcx - movq %rax, (%r11) - adcq 8(%r12), %rcx - movq 16(%rdx), %r8 - movq %rcx, 8(%r11) - adcq 16(%r12), %r8 - movq 24(%rdx), %rax - movq %r8, 16(%r11) - adcq 24(%r12), %rax - movq 32(%rdx), %rcx - movq %rax, 24(%r11) - adcq 32(%r12), %rcx - movq 40(%rdx), %r8 - movq %rcx, 32(%r11) - adcq 40(%r12), %r8 - movq 48(%rdx), %rax - movq %r8, 40(%r11) - adcq 48(%r12), 
%rax - movq 56(%rdx), %rcx - movq %rax, 48(%r11) - adcq 56(%r12), %rcx - movq 64(%rdx), %r8 - movq %rcx, 56(%r11) - adcq 64(%r12), %r8 - movq 72(%rdx), %rax - movq %r8, 64(%r11) - adcq 72(%r12), %rax - movq 80(%rdx), %rcx - movq %rax, 72(%r11) - adcq 80(%r12), %rcx - movq 88(%rdx), %r8 - movq %rcx, 80(%r11) - adcq 88(%r12), %r8 - movq 96(%rdx), %rax - movq %r8, 88(%r11) - adcq 96(%r12), %rax - movq 104(%rdx), %rcx - movq %rax, 96(%r11) - adcq 104(%r12), %rcx - movq 112(%rdx), %r8 - movq %rcx, 104(%r11) - adcq 112(%r12), %r8 - movq 120(%rdx), %rax - movq %r8, 112(%r11) - adcq 120(%r12), %rax - movq %rax, 120(%r11) - adcq $0x00, %r14 - movq %r14, 800(%rsp) - movq %r11, %rdx - movq %r10, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_2048_mul_avx2_16@plt -#else - callq _sp_2048_mul_avx2_16 -#endif /* __APPLE__ */ - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - leaq 256(%rsp), %rdi - addq $0x80, %rdx - addq $0x80, %rsi -#ifndef __APPLE__ - callq sp_2048_mul_avx2_16@plt -#else - callq _sp_2048_mul_avx2_16 -#endif /* __APPLE__ */ - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - movq 768(%rsp), %rdi -#ifndef __APPLE__ - callq sp_2048_mul_avx2_16@plt -#else - callq _sp_2048_mul_avx2_16 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 784(%rsp), %rdx - movq 776(%rsp), %rsi - movq 768(%rsp), %rdi -#endif /* _WIN64 */ - movq 792(%rsp), %r13 - movq 800(%rsp), %r14 - movq 768(%rsp), %r15 - movq %r13, %r9 - leaq 512(%rsp), %r10 - leaq 640(%rsp), %r11 - andq %r14, %r9 - negq %r13 - negq %r14 - addq $0x100, %r15 - movq (%r10), %rax - movq (%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, %rcx - addq %rcx, %rax - movq 8(%r10), %rcx - movq 8(%r11), %r8 - pextq %r14, %rcx, %rcx - pextq %r13, %r8, %r8 - movq %rax, (%r15) - adcq %r8, %rcx - movq 16(%r10), %r8 - movq 16(%r11), %rax - pextq %r14, %r8, %r8 - pextq %r13, %rax, %rax - movq %rcx, 8(%r15) - adcq %rax, %r8 - movq 24(%r10), %rax - movq 24(%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, %rcx - movq %r8, 16(%r15) - adcq %rcx, %rax - movq 32(%r10), %rcx - movq 32(%r11), %r8 - pextq %r14, %rcx, %rcx - pextq %r13, %r8, %r8 - movq %rax, 24(%r15) - adcq %r8, %rcx - movq 40(%r10), %r8 - movq 40(%r11), %rax - pextq %r14, %r8, %r8 - pextq %r13, %rax, %rax - movq %rcx, 32(%r15) - adcq %rax, %r8 - movq 48(%r10), %rax - movq 48(%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, %rcx - movq %r8, 40(%r15) - adcq %rcx, %rax - movq 56(%r10), %rcx - movq 56(%r11), %r8 - pextq %r14, %rcx, %rcx - pextq %r13, %r8, %r8 - movq %rax, 48(%r15) - adcq %r8, %rcx - movq 64(%r10), %r8 - movq 64(%r11), %rax - pextq %r14, %r8, %r8 - pextq %r13, %rax, %rax - movq %rcx, 56(%r15) - adcq %rax, %r8 - movq 72(%r10), %rax - movq 72(%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, %rcx - movq %r8, 64(%r15) - adcq %rcx, %rax - movq 80(%r10), %rcx - movq 80(%r11), %r8 - pextq %r14, %rcx, %rcx - pextq %r13, %r8, %r8 - movq %rax, 72(%r15) - adcq %r8, %rcx - movq 88(%r10), %r8 - movq 88(%r11), %rax - pextq %r14, %r8, %r8 - pextq %r13, %rax, %rax - movq %rcx, 80(%r15) - adcq %rax, %r8 - movq 96(%r10), %rax - movq 96(%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, %rcx - movq %r8, 88(%r15) - adcq %rcx, %rax - movq 104(%r10), %rcx - movq 104(%r11), %r8 - pextq %r14, %rcx, %rcx - pextq %r13, %r8, %r8 - movq %rax, 96(%r15) - adcq %r8, %rcx - movq 112(%r10), %r8 - movq 112(%r11), %rax - pextq %r14, %r8, %r8 - pextq %r13, %rax, %rax - movq %rcx, 104(%r15) - adcq %rax, %r8 - movq 120(%r10), %rax - movq 120(%r11), %rcx - pextq %r14, %rax, %rax - pextq %r13, %rcx, 
%rcx - movq %r8, 112(%r15) - adcq %rcx, %rax - movq %rax, 120(%r15) - adcq $0x00, %r9 - leaq 256(%rsp), %r11 - movq %rsp, %r10 - movq (%r10), %rax - subq (%r11), %rax - movq 8(%r10), %rcx - movq %rax, (%r10) - sbbq 8(%r11), %rcx - movq 16(%r10), %r8 - movq %rcx, 8(%r10) - sbbq 16(%r11), %r8 - movq 24(%r10), %rax - movq %r8, 16(%r10) - sbbq 24(%r11), %rax - movq 32(%r10), %rcx - movq %rax, 24(%r10) - sbbq 32(%r11), %rcx - movq 40(%r10), %r8 - movq %rcx, 32(%r10) - sbbq 40(%r11), %r8 - movq 48(%r10), %rax - movq %r8, 40(%r10) - sbbq 48(%r11), %rax - movq 56(%r10), %rcx - movq %rax, 48(%r10) - sbbq 56(%r11), %rcx - movq 64(%r10), %r8 - movq %rcx, 56(%r10) - sbbq 64(%r11), %r8 - movq 72(%r10), %rax - movq %r8, 64(%r10) - sbbq 72(%r11), %rax - movq 80(%r10), %rcx - movq %rax, 72(%r10) - sbbq 80(%r11), %rcx - movq 88(%r10), %r8 - movq %rcx, 80(%r10) - sbbq 88(%r11), %r8 - movq 96(%r10), %rax - movq %r8, 88(%r10) - sbbq 96(%r11), %rax - movq 104(%r10), %rcx - movq %rax, 96(%r10) - sbbq 104(%r11), %rcx - movq 112(%r10), %r8 - movq %rcx, 104(%r10) - sbbq 112(%r11), %r8 - movq 120(%r10), %rax - movq %r8, 112(%r10) - sbbq 120(%r11), %rax - movq 128(%r10), %rcx - movq %rax, 120(%r10) - sbbq 128(%r11), %rcx - movq 136(%r10), %r8 - movq %rcx, 128(%r10) - sbbq 136(%r11), %r8 - movq 144(%r10), %rax - movq %r8, 136(%r10) - sbbq 144(%r11), %rax - movq 152(%r10), %rcx - movq %rax, 144(%r10) - sbbq 152(%r11), %rcx - movq 160(%r10), %r8 - movq %rcx, 152(%r10) - sbbq 160(%r11), %r8 - movq 168(%r10), %rax - movq %r8, 160(%r10) - sbbq 168(%r11), %rax - movq 176(%r10), %rcx - movq %rax, 168(%r10) - sbbq 176(%r11), %rcx - movq 184(%r10), %r8 - movq %rcx, 176(%r10) - sbbq 184(%r11), %r8 - movq 192(%r10), %rax - movq %r8, 184(%r10) - sbbq 192(%r11), %rax - movq 200(%r10), %rcx - movq %rax, 192(%r10) - sbbq 200(%r11), %rcx - movq 208(%r10), %r8 - movq %rcx, 200(%r10) - sbbq 208(%r11), %r8 - movq 216(%r10), %rax - movq %r8, 208(%r10) - sbbq 216(%r11), %rax - movq 224(%r10), %rcx - movq %rax, 216(%r10) - sbbq 224(%r11), %rcx - movq 232(%r10), %r8 - movq %rcx, 224(%r10) - sbbq 232(%r11), %r8 - movq 240(%r10), %rax - movq %r8, 232(%r10) - sbbq 240(%r11), %rax - movq 248(%r10), %rcx - movq %rax, 240(%r10) - sbbq 248(%r11), %rcx - movq %rcx, 248(%r10) - sbbq $0x00, %r9 - movq (%r10), %rax - subq (%rdi), %rax - movq 8(%r10), %rcx - movq %rax, (%r10) - sbbq 8(%rdi), %rcx - movq 16(%r10), %r8 - movq %rcx, 8(%r10) - sbbq 16(%rdi), %r8 - movq 24(%r10), %rax - movq %r8, 16(%r10) - sbbq 24(%rdi), %rax - movq 32(%r10), %rcx - movq %rax, 24(%r10) - sbbq 32(%rdi), %rcx - movq 40(%r10), %r8 - movq %rcx, 32(%r10) - sbbq 40(%rdi), %r8 - movq 48(%r10), %rax - movq %r8, 40(%r10) - sbbq 48(%rdi), %rax - movq 56(%r10), %rcx - movq %rax, 48(%r10) - sbbq 56(%rdi), %rcx - movq 64(%r10), %r8 - movq %rcx, 56(%r10) - sbbq 64(%rdi), %r8 - movq 72(%r10), %rax - movq %r8, 64(%r10) - sbbq 72(%rdi), %rax - movq 80(%r10), %rcx - movq %rax, 72(%r10) - sbbq 80(%rdi), %rcx - movq 88(%r10), %r8 - movq %rcx, 80(%r10) - sbbq 88(%rdi), %r8 - movq 96(%r10), %rax - movq %r8, 88(%r10) - sbbq 96(%rdi), %rax - movq 104(%r10), %rcx - movq %rax, 96(%r10) - sbbq 104(%rdi), %rcx - movq 112(%r10), %r8 - movq %rcx, 104(%r10) - sbbq 112(%rdi), %r8 - movq 120(%r10), %rax - movq %r8, 112(%r10) - sbbq 120(%rdi), %rax - movq 128(%r10), %rcx - movq %rax, 120(%r10) - sbbq 128(%rdi), %rcx - movq 136(%r10), %r8 - movq %rcx, 128(%r10) - sbbq 136(%rdi), %r8 - movq 144(%r10), %rax - movq %r8, 136(%r10) - sbbq 144(%rdi), %rax - movq 152(%r10), %rcx - movq %rax, 144(%r10) - sbbq 
152(%rdi), %rcx - movq 160(%r10), %r8 - movq %rcx, 152(%r10) - sbbq 160(%rdi), %r8 - movq 168(%r10), %rax - movq %r8, 160(%r10) - sbbq 168(%rdi), %rax - movq 176(%r10), %rcx - movq %rax, 168(%r10) - sbbq 176(%rdi), %rcx - movq 184(%r10), %r8 - movq %rcx, 176(%r10) - sbbq 184(%rdi), %r8 - movq 192(%r10), %rax - movq %r8, 184(%r10) - sbbq 192(%rdi), %rax - movq 200(%r10), %rcx - movq %rax, 192(%r10) - sbbq 200(%rdi), %rcx - movq 208(%r10), %r8 - movq %rcx, 200(%r10) - sbbq 208(%rdi), %r8 - movq 216(%r10), %rax - movq %r8, 208(%r10) - sbbq 216(%rdi), %rax - movq 224(%r10), %rcx - movq %rax, 216(%r10) - sbbq 224(%rdi), %rcx - movq 232(%r10), %r8 - movq %rcx, 224(%r10) - sbbq 232(%rdi), %r8 - movq 240(%r10), %rax - movq %r8, 232(%r10) - sbbq 240(%rdi), %rax - movq 248(%r10), %rcx - movq %rax, 240(%r10) - sbbq 248(%rdi), %rcx - movq %rcx, 248(%r10) - sbbq $0x00, %r9 - subq $0x80, %r15 - # Add - movq (%r15), %rax - addq (%r10), %rax - movq 8(%r15), %rcx - movq %rax, (%r15) - adcq 8(%r10), %rcx - movq 16(%r15), %r8 - movq %rcx, 8(%r15) - adcq 16(%r10), %r8 - movq 24(%r15), %rax - movq %r8, 16(%r15) - adcq 24(%r10), %rax - movq 32(%r15), %rcx - movq %rax, 24(%r15) - adcq 32(%r10), %rcx - movq 40(%r15), %r8 - movq %rcx, 32(%r15) - adcq 40(%r10), %r8 - movq 48(%r15), %rax - movq %r8, 40(%r15) - adcq 48(%r10), %rax - movq 56(%r15), %rcx - movq %rax, 48(%r15) - adcq 56(%r10), %rcx - movq 64(%r15), %r8 - movq %rcx, 56(%r15) - adcq 64(%r10), %r8 - movq 72(%r15), %rax - movq %r8, 64(%r15) - adcq 72(%r10), %rax - movq 80(%r15), %rcx - movq %rax, 72(%r15) - adcq 80(%r10), %rcx - movq 88(%r15), %r8 - movq %rcx, 80(%r15) - adcq 88(%r10), %r8 - movq 96(%r15), %rax - movq %r8, 88(%r15) - adcq 96(%r10), %rax - movq 104(%r15), %rcx - movq %rax, 96(%r15) - adcq 104(%r10), %rcx - movq 112(%r15), %r8 - movq %rcx, 104(%r15) - adcq 112(%r10), %r8 - movq 120(%r15), %rax - movq %r8, 112(%r15) - adcq 120(%r10), %rax - movq 128(%r15), %rcx - movq %rax, 120(%r15) - adcq 128(%r10), %rcx - movq 136(%r15), %r8 - movq %rcx, 128(%r15) - adcq 136(%r10), %r8 - movq 144(%r15), %rax - movq %r8, 136(%r15) - adcq 144(%r10), %rax - movq 152(%r15), %rcx - movq %rax, 144(%r15) - adcq 152(%r10), %rcx - movq 160(%r15), %r8 - movq %rcx, 152(%r15) - adcq 160(%r10), %r8 - movq 168(%r15), %rax - movq %r8, 160(%r15) - adcq 168(%r10), %rax - movq 176(%r15), %rcx - movq %rax, 168(%r15) - adcq 176(%r10), %rcx - movq 184(%r15), %r8 - movq %rcx, 176(%r15) - adcq 184(%r10), %r8 - movq 192(%r15), %rax - movq %r8, 184(%r15) - adcq 192(%r10), %rax - movq 200(%r15), %rcx - movq %rax, 192(%r15) - adcq 200(%r10), %rcx - movq 208(%r15), %r8 - movq %rcx, 200(%r15) - adcq 208(%r10), %r8 - movq 216(%r15), %rax - movq %r8, 208(%r15) - adcq 216(%r10), %rax - movq 224(%r15), %rcx - movq %rax, 216(%r15) - adcq 224(%r10), %rcx - movq 232(%r15), %r8 - movq %rcx, 224(%r15) - adcq 232(%r10), %r8 - movq 240(%r15), %rax - movq %r8, 232(%r15) - adcq 240(%r10), %rax - movq 248(%r15), %rcx - movq %rax, 240(%r15) - adcq 248(%r10), %rcx - movq %rcx, 248(%r15) - adcq $0x00, %r9 - movq %r9, 384(%rdi) - addq $0x80, %r15 - # Add - movq (%r15), %rax - xorq %r9, %r9 - addq (%r11), %rax - movq 8(%r15), %rcx - movq %rax, (%r15) - adcq 8(%r11), %rcx - movq 16(%r15), %r8 - movq %rcx, 8(%r15) - adcq 16(%r11), %r8 - movq 24(%r15), %rax - movq %r8, 16(%r15) - adcq 24(%r11), %rax - movq 32(%r15), %rcx - movq %rax, 24(%r15) - adcq 32(%r11), %rcx - movq 40(%r15), %r8 - movq %rcx, 32(%r15) - adcq 40(%r11), %r8 - movq 48(%r15), %rax - movq %r8, 40(%r15) - adcq 48(%r11), %rax - movq 56(%r15), 
%rcx - movq %rax, 48(%r15) - adcq 56(%r11), %rcx - movq 64(%r15), %r8 - movq %rcx, 56(%r15) - adcq 64(%r11), %r8 - movq 72(%r15), %rax - movq %r8, 64(%r15) - adcq 72(%r11), %rax - movq 80(%r15), %rcx - movq %rax, 72(%r15) - adcq 80(%r11), %rcx - movq 88(%r15), %r8 - movq %rcx, 80(%r15) - adcq 88(%r11), %r8 - movq 96(%r15), %rax - movq %r8, 88(%r15) - adcq 96(%r11), %rax - movq 104(%r15), %rcx - movq %rax, 96(%r15) - adcq 104(%r11), %rcx - movq 112(%r15), %r8 - movq %rcx, 104(%r15) - adcq 112(%r11), %r8 - movq 120(%r15), %rax - movq %r8, 112(%r15) - adcq 120(%r11), %rax - movq 128(%r15), %rcx - movq %rax, 120(%r15) - adcq 128(%r11), %rcx - movq %rcx, 128(%r15) - adcq $0x00, %r9 - # Add to zero - movq 136(%r11), %rax - adcq $0x00, %rax - movq 144(%r11), %rcx - movq %rax, 136(%r15) - adcq $0x00, %rcx - movq 152(%r11), %r8 - movq %rcx, 144(%r15) - adcq $0x00, %r8 - movq 160(%r11), %rax - movq %r8, 152(%r15) - adcq $0x00, %rax - movq 168(%r11), %rcx - movq %rax, 160(%r15) - adcq $0x00, %rcx - movq 176(%r11), %r8 - movq %rcx, 168(%r15) - adcq $0x00, %r8 - movq 184(%r11), %rax - movq %r8, 176(%r15) - adcq $0x00, %rax - movq 192(%r11), %rcx - movq %rax, 184(%r15) - adcq $0x00, %rcx - movq 200(%r11), %r8 - movq %rcx, 192(%r15) - adcq $0x00, %r8 - movq 208(%r11), %rax - movq %r8, 200(%r15) - adcq $0x00, %rax - movq 216(%r11), %rcx - movq %rax, 208(%r15) - adcq $0x00, %rcx - movq 224(%r11), %r8 - movq %rcx, 216(%r15) - adcq $0x00, %r8 - movq 232(%r11), %rax - movq %r8, 224(%r15) - adcq $0x00, %rax - movq 240(%r11), %rcx - movq %rax, 232(%r15) - adcq $0x00, %rcx - movq 248(%r11), %r8 - movq %rcx, 240(%r15) - adcq $0x00, %r8 - movq %r8, 248(%r15) - addq $0x328, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_2048_mul_avx2_32,.-sp_2048_mul_avx2_32 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ -#ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -14149,680 +14149,6 @@ _sp_3072_mul_12: #ifndef __APPLE__ .size sp_3072_mul_12,.-sp_3072_mul_12 #endif /* __APPLE__ */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_sqr_12 -.type sp_3072_sqr_12,@function -.align 16 -sp_3072_sqr_12: -#else -.section __TEXT,__text -.globl _sp_3072_sqr_12 -.p2align 4 -_sp_3072_sqr_12: -#endif /* __APPLE__ */ - pushq %r12 - subq $0x60, %rsp - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - xorq %r9, %r9 - movq %rax, (%rsp) - movq %rdx, %r8 - # A[0] * A[1] - movq 8(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 8(%rsp) - # A[0] * A[2] - movq 16(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - movq %r9, 16(%rsp) - # A[0] * A[3] - movq 24(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - # A[1] * A[2] - movq 16(%rsi), %rax - mulq 8(%rsi) - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - movq %rcx, 24(%rsp) - # A[0] * A[4] - movq 32(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - # A[1] * A[3] - movq 24(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 32(%rsp) - # A[0] * A[5] - movq 40(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[4] - movq 32(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[3] - movq 24(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 40(%rsp) - # A[0] * A[6] - movq 48(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[5] - movq 40(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[4] - movq 32(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 48(%rsp) - # A[0] * A[7] - movq 56(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[6] - movq 48(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[5] - movq 40(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[4] - movq 32(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 56(%rsp) - # A[0] * A[8] - movq 64(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[7] - movq 56(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[6] - movq 48(%rsi), %rax - mulq 
16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[5] - movq 40(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[4] - movq 32(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 64(%rsp) - # A[0] * A[9] - movq 72(%rsi), %rax - mulq (%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[8] - movq 64(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[7] - movq 56(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[6] - movq 48(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[5] - movq 40(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 72(%rsp) - # A[0] * A[10] - movq 80(%rsi), %rax - mulq (%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[9] - movq 72(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[8] - movq 64(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[7] - movq 56(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[6] - movq 48(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[5] - movq 40(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 80(%rsp) - # A[0] * A[11] - movq 88(%rsi), %rax - mulq (%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[1] * A[10] - movq 80(%rsi), %rax - mulq 8(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * A[9] - movq 72(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[8] - movq 64(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[7] - movq 56(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[6] - movq 48(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 88(%rsp) - # A[1] * A[11] - movq 88(%rsi), %rax - mulq 8(%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[2] * A[10] - movq 80(%rsi), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[3] * A[9] - movq 72(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[8] - movq 64(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[7] - movq 56(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[6] - movq 48(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 96(%rdi) - # A[2] * A[11] - movq 88(%rsi), %rax - mulq 16(%rsi) - xorq %rcx, %rcx - xorq 
%r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[3] * A[10] - movq 80(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[4] * A[9] - movq 72(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[8] - movq 64(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[7] - movq 56(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 104(%rdi) - # A[3] * A[11] - movq 88(%rsi), %rax - mulq 24(%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[4] * A[10] - movq 80(%rsi), %rax - mulq 32(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[5] * A[9] - movq 72(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[8] - movq 64(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[7] - movq 56(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 112(%rdi) - # A[4] * A[11] - movq 88(%rsi), %rax - mulq 32(%rsi) - xorq %r9, %r9 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[5] * A[10] - movq 80(%rsi), %rax - mulq 40(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[6] * A[9] - movq 72(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[8] - movq 64(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - movq %rcx, 120(%rdi) - # A[5] * A[11] - movq 88(%rsi), %rax - mulq 40(%rsi) - xorq %rcx, %rcx - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[6] * A[10] - movq 80(%rsi), %rax - mulq 48(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[7] * A[9] - movq 72(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[8] - movq 64(%rsi), %rax - mulq %rax - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r8 - adcq %r11, %r9 - adcq %r12, %rcx - movq %r8, 128(%rdi) - # A[6] * A[11] - movq 88(%rsi), %rax - mulq 48(%rsi) - xorq %r8, %r8 - xorq %r12, %r12 - movq %rax, %r10 - movq %rdx, %r11 - # A[7] * A[10] - movq 80(%rsi), %rax - mulq 56(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[8] * A[9] - movq 72(%rsi), %rax - mulq 64(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - addq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - addq %r10, %r9 - adcq %r11, %rcx - adcq %r12, %r8 - movq %r9, 136(%rdi) - # A[7] * A[11] - movq 88(%rsi), %rax - mulq 56(%rsi) - xorq %r9, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - # A[8] * A[10] - movq 80(%rsi), %rax - mulq 64(%rsi) - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - # A[9] * A[9] - movq 72(%rsi), %rax - mulq %rax - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - movq %rcx, 144(%rdi) - # A[8] * A[11] - movq 88(%rsi), %rax - mulq 64(%rsi) - xorq %rcx, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 
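The sp_3072_sqr_12 routine being removed here uses the standard squaring shortcut: each off-diagonal product A[i] * A[j] (i < j) is formed once and doubled, and the diagonal squares A[i] * A[i] are added on top, roughly halving the number of multiplies compared with a general multiply. A compact C sketch of that pattern follows; the function name and the 32-bit digit type are illustrative only and do not correspond to wolfSSL's generated 64-bit code.

#include <stdint.h>
#include <string.h>

/* r[2n] = a[n] * a[n]: cross products once, doubled, plus diagonals. */
static void sqr_schoolbook(uint32_t *r, const uint32_t *a, int n)
{
    memset(r, 0, (size_t)(2 * n) * sizeof(uint32_t));

    /* Accumulate each cross product a[i]*a[j], i < j, exactly once. */
    for (int i = 0; i < n; i++) {
        uint64_t c = 0;
        for (int j = i + 1; j < n; j++) {
            uint64_t t = (uint64_t)a[i] * a[j] + r[i + j] + c;
            r[i + j] = (uint32_t)t;
            c = t >> 32;
        }
        r[i + n] = (uint32_t)c;
    }

    /* Double the accumulated cross products (r *= 2); the doubled sum
     * still fits in 2*n digits because it is at most a*a. */
    uint32_t msb = 0;
    for (int i = 0; i < 2 * n; i++) {
        uint32_t t = r[i];
        r[i] = (uint32_t)((t << 1) | msb);
        msb = t >> 31;
    }

    /* Add the diagonal squares a[i]*a[i] at digit position 2*i. */
    uint64_t c = 0;
    for (int i = 0; i < n; i++) {
        uint64_t p = (uint64_t)a[i] * a[i];
        uint64_t t = (uint64_t)r[2 * i] + (uint32_t)p + c;
        r[2 * i] = (uint32_t)t;
        t = (uint64_t)r[2 * i + 1] + (p >> 32) + (t >> 32);
        r[2 * i + 1] = (uint32_t)t;
        c = t >> 32;
    }
}

The generated assembly interleaves the doubling with the per-column accumulation (the addq/adcq triples on the r10/r11/r12 registers) rather than doing it in a separate pass, but the arithmetic is the same.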
- adcq $0x00, %rcx - # A[9] * A[10] - movq 80(%rsi), %rax - mulq 72(%rsi) - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - addq %rax, %r8 - adcq %rdx, %r9 - adcq $0x00, %rcx - movq %r8, 152(%rdi) - # A[9] * A[11] - movq 88(%rsi), %rax - mulq 72(%rsi) - xorq %r8, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - # A[10] * A[10] - movq 80(%rsi), %rax - mulq %rax - addq %rax, %r9 - adcq %rdx, %rcx - adcq $0x00, %r8 - movq %r9, 160(%rdi) - # A[10] * A[11] - movq 88(%rsi), %rax - mulq 80(%rsi) - xorq %r9, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - addq %rax, %rcx - adcq %rdx, %r8 - adcq $0x00, %r9 - movq %rcx, 168(%rdi) - # A[11] * A[11] - movq 88(%rsi), %rax - mulq %rax - addq %rax, %r8 - adcq %rdx, %r9 - movq %r8, 176(%rdi) - movq %r9, 184(%rdi) - movq (%rsp), %rax - movq 8(%rsp), %rdx - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - movq %rax, (%rdi) - movq %rdx, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 32(%rsp), %rax - movq 40(%rsp), %rdx - movq 48(%rsp), %r10 - movq 56(%rsp), %r11 - movq %rax, 32(%rdi) - movq %rdx, 40(%rdi) - movq %r10, 48(%rdi) - movq %r11, 56(%rdi) - movq 64(%rsp), %rax - movq 72(%rsp), %rdx - movq 80(%rsp), %r10 - movq 88(%rsp), %r11 - movq %rax, 64(%rdi) - movq %rdx, 72(%rdi) - movq %r10, 80(%rdi) - movq %r11, 88(%rdi) - addq $0x60, %rsp - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_3072_sqr_12,.-sp_3072_sqr_12 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Multiply a and b into r. (r = a * b) * @@ -15800,654 +15126,6 @@ L_end_3072_mul_avx2_12: .size sp_3072_mul_avx2_12,.-sp_3072_mul_avx2_12 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -#ifdef HAVE_INTEL_AVX2 -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_sqr_avx2_12 -.type sp_3072_sqr_avx2_12,@function -.align 16 -sp_3072_sqr_avx2_12: -#else -.section __TEXT,__text -.globl _sp_3072_sqr_avx2_12 -.p2align 4 -_sp_3072_sqr_avx2_12: -#endif /* __APPLE__ */ - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - subq $0x60, %rsp - cmpq %rdi, %rsi - movq %rsp, %rbp - cmovne %rdi, %rbp - addq $0x60, %rdi - xorq %r10, %r10 - # Diagonal 1 - # Zero into %r9 - # A[1] x A[0] - movq (%rsi), %rdx - mulxq 8(%rsi), %r8, %r9 - movq %r8, 8(%rbp) - # Zero into %r8 - # A[2] x A[0] - mulxq 16(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 16(%rbp) - # Zero into %r9 - # A[3] x A[0] - mulxq 24(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 24(%rbp) - # Zero into %r8 - # A[4] x A[0] - mulxq 32(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 32(%rbp) - # Zero into %r9 - # A[5] x A[0] - mulxq 40(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 40(%rbp) - # No load %r12 - %r8 - # A[6] x A[0] - mulxq 48(%rsi), %rax, %r12 - adcxq %rax, %r9 - adoxq %r10, %r12 - movq %r9, 48(%rbp) - # No load %r13 - %r9 - # A[7] x A[0] - mulxq 56(%rsi), %rax, %r13 - adcxq %rax, %r12 - adoxq %r10, %r13 - # No store %r12 - %r8 - # No load %r14 - %r8 - # A[8] x A[0] - mulxq 64(%rsi), %rax, %r14 - adcxq %rax, %r13 - adoxq %r10, %r14 - # No store %r13 - %r9 - # No load %r15 - %r9 - # A[9] x A[0] - mulxq 72(%rsi), %rax, %r15 - adcxq %rax, %r14 - adoxq %r10, %r15 - # No store %r14 - %r8 - # No load %rbx - %r8 - # A[10] x A[0] - mulxq 80(%rsi), %rax, %rbx - adcxq %rax, %r15 - adoxq %r10, %rbx - # No store %r15 - %r9 - # Zero into %r9 - # A[11] x A[0] - mulxq 88(%rsi), %rax, %r9 - adcxq %rax, %rbx - adoxq %r10, %r9 - # No store %rbx - %r8 - # Carry - adcxq %r10, %r9 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r9, (%rdi) - # Diagonal 2 - movq 24(%rbp), %r9 - movq 32(%rbp), %r8 - # A[2] x A[1] - movq 8(%rsi), %rdx - mulxq 16(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 24(%rbp) - movq 40(%rbp), %r9 - # A[3] x A[1] - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 32(%rbp) - movq 48(%rbp), %r8 - # A[4] x A[1] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 40(%rbp) - # No load %r12 - %r9 - # A[5] x A[1] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r12 - movq %r8, 48(%rbp) - # No load %r13 - %r8 - # A[6] x A[1] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # No store %r12 - %r9 - # No load %r14 - %r9 - # A[7] x A[1] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - # No store %r13 - %r8 - # No load %r15 - %r8 - # A[8] x A[1] - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r9 - # No load %rbx - %r9 - # A[9] x A[1] - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r8 - movq (%rdi), %r8 - # A[10] x A[1] - mulxq 80(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r8 - # No store %rbx - %r9 - # Zero into %r9 - # A[11] x A[1] - mulxq 88(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, (%rdi) - # Zero into %r8 - # A[11] x A[2] - movq 16(%rsi), %rdx - mulxq 88(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 8(%rdi) - # Carry - adcxq %r11, %r8 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r8, 16(%rdi) - # Diagonal 3 - movq 40(%rbp), %r8 - movq 48(%rbp), %r9 - # A[3] x A[2] - mulxq 
24(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 40(%rbp) - # No load %r12 - %r8 - # A[4] x A[2] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r12 - movq %r9, 48(%rbp) - # No load %r13 - %r9 - # A[5] x A[2] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # No store %r12 - %r8 - # No load %r14 - %r8 - # A[6] x A[2] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - # No store %r13 - %r9 - # No load %r15 - %r9 - # A[7] x A[2] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r8 - # No load %rbx - %r8 - # A[8] x A[2] - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r9 - movq (%rdi), %r9 - # A[9] x A[2] - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r9 - # No store %rbx - %r8 - movq 8(%rdi), %r8 - # A[10] x A[2] - mulxq 80(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, (%rdi) - movq 16(%rdi), %r9 - # A[10] x A[3] - movq 24(%rsi), %rdx - mulxq 80(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 8(%rdi) - # Zero into %r8 - # A[10] x A[4] - movq 32(%rsi), %rdx - mulxq 80(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 16(%rdi) - # Zero into %r9 - # A[10] x A[5] - movq 40(%rsi), %rdx - mulxq 80(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 24(%rdi) - # Carry - adcxq %r11, %r9 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r9, 32(%rdi) - # Diagonal 4 - # No load %r13 - %r8 - # A[4] x A[3] - movq 24(%rsi), %rdx - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # No store %r12 - %r9 - # No load %r14 - %r9 - # A[5] x A[3] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - # No store %r13 - %r8 - # No load %r15 - %r8 - # A[6] x A[3] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r9 - # No load %rbx - %r9 - # A[7] x A[3] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r8 - movq (%rdi), %r8 - # A[8] x A[3] - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r8 - # No store %rbx - %r9 - movq 8(%rdi), %r9 - # A[9] x A[3] - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, (%rdi) - movq 16(%rdi), %r8 - # A[9] x A[4] - movq 32(%rsi), %rdx - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 8(%rdi) - movq 24(%rdi), %r9 - # A[9] x A[5] - movq 40(%rsi), %rdx - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 16(%rdi) - movq 32(%rdi), %r8 - # A[9] x A[6] - movq 48(%rsi), %rdx - mulxq 72(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 24(%rdi) - # Zero into %r9 - # A[9] x A[7] - movq 56(%rsi), %rdx - mulxq 72(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 32(%rdi) - # Zero into %r8 - # A[9] x A[8] - movq 64(%rsi), %rdx - mulxq 72(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 40(%rdi) - # Carry - adcxq %r11, %r8 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r8, 48(%rdi) - # Diagonal 5 - # No load %r15 - %r9 - # A[5] x A[4] - movq 32(%rsi), %rdx - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r8 - # No load %rbx - %r8 - # A[6] x A[4] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r9 - movq (%rdi), %r9 - # A[7] x A[4] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r9 - # No store %rbx - %r8 - 
movq 8(%rdi), %r8 - # A[8] x A[4] - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, (%rdi) - movq 16(%rdi), %r9 - # A[8] x A[5] - movq 40(%rsi), %rdx - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 8(%rdi) - movq 24(%rdi), %r8 - # A[8] x A[6] - movq 48(%rsi), %rdx - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 16(%rdi) - movq 32(%rdi), %r9 - # A[8] x A[7] - movq 56(%rsi), %rdx - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 24(%rdi) - movq 40(%rdi), %r8 - # A[10] x A[6] - movq 48(%rsi), %rdx - mulxq 80(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 32(%rdi) - movq 48(%rdi), %r9 - # A[10] x A[7] - movq 56(%rsi), %rdx - mulxq 80(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 40(%rdi) - # Zero into %r8 - # A[10] x A[8] - movq 64(%rsi), %rdx - mulxq 80(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 48(%rdi) - # Zero into %r9 - # A[10] x A[9] - movq 72(%rsi), %rdx - mulxq 80(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 56(%rdi) - # Carry - adcxq %r11, %r9 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r9, 64(%rdi) - # Diagonal 6 - movq (%rdi), %r8 - # A[6] x A[5] - movq 40(%rsi), %rdx - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r8 - # No store %rbx - %r9 - movq 8(%rdi), %r9 - # A[7] x A[5] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, (%rdi) - movq 16(%rdi), %r8 - # A[7] x A[6] - movq 48(%rsi), %rdx - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 8(%rdi) - movq 24(%rdi), %r9 - # A[11] x A[3] - movq 24(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 16(%rdi) - movq 32(%rdi), %r8 - # A[11] x A[4] - movq 32(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 24(%rdi) - movq 40(%rdi), %r9 - # A[11] x A[5] - movq 40(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 32(%rdi) - movq 48(%rdi), %r8 - # A[11] x A[6] - movq 48(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 40(%rdi) - movq 56(%rdi), %r9 - # A[11] x A[7] - movq 56(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 48(%rdi) - movq 64(%rdi), %r8 - # A[11] x A[8] - movq 64(%rsi), %rdx - mulxq 88(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 56(%rdi) - # Zero into %r9 - # A[11] x A[9] - movq 72(%rsi), %rdx - mulxq 88(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 64(%rdi) - # Zero into %r8 - # A[11] x A[10] - movq 80(%rsi), %rdx - mulxq 88(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 72(%rdi) - # Carry - adcxq %r11, %r8 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r8, 80(%rdi) - movq %r11, 88(%rdi) - # Double and Add in A[i] x A[i] - movq 8(%rbp), %r9 - # A[0] x A[0] - movq (%rsi), %rdx - mulxq %rdx, %rax, %rcx - movq %rax, (%rbp) - adoxq %r9, %r9 - adcxq %rcx, %r9 - movq %r9, 8(%rbp) - movq 16(%rbp), %r8 - movq 24(%rbp), %r9 - # A[1] x A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 16(%rbp) - movq %r9, 24(%rbp) - movq 32(%rbp), %r8 - movq 40(%rbp), %r9 - # A[2] x A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 32(%rbp) - 
movq %r9, 40(%rbp) - movq 48(%rbp), %r8 - # A[3] x A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r12, %r12 - adcxq %rax, %r8 - adcxq %rcx, %r12 - movq %r8, 48(%rbp) - # A[4] x A[4] - movq 32(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r13, %r13 - adoxq %r14, %r14 - adcxq %rax, %r13 - adcxq %rcx, %r14 - # A[5] x A[5] - movq 40(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r15, %r15 - adoxq %rbx, %rbx - adcxq %rax, %r15 - adcxq %rcx, %rbx - movq (%rdi), %r8 - movq 8(%rdi), %r9 - # A[6] x A[6] - movq 48(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq 16(%rdi), %r8 - movq 24(%rdi), %r9 - # A[7] x A[7] - movq 56(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 16(%rdi) - movq %r9, 24(%rdi) - movq 32(%rdi), %r8 - movq 40(%rdi), %r9 - # A[8] x A[8] - movq 64(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 32(%rdi) - movq %r9, 40(%rdi) - movq 48(%rdi), %r8 - movq 56(%rdi), %r9 - # A[9] x A[9] - movq 72(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 48(%rdi) - movq %r9, 56(%rdi) - movq 64(%rdi), %r8 - movq 72(%rdi), %r9 - # A[10] x A[10] - movq 80(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 64(%rdi) - movq %r9, 72(%rdi) - movq 80(%rdi), %r8 - movq 88(%rdi), %r9 - # A[11] x A[11] - movq 88(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 80(%rdi) - movq %r9, 88(%rdi) - movq %r12, -40(%rdi) - movq %r13, -32(%rdi) - movq %r14, -24(%rdi) - movq %r15, -16(%rdi) - movq %rbx, -8(%rdi) - subq $0x60, %rdi - cmpq %rdi, %rsi - jne L_end_3072_sqr_avx2_12 - vmovdqu (%rbp), %xmm0 - vmovups %xmm0, (%rdi) - vmovdqu 16(%rbp), %xmm0 - vmovups %xmm0, 16(%rdi) - vmovdqu 32(%rbp), %xmm0 - vmovups %xmm0, 32(%rdi) - movq 48(%rbp), %rax - movq %rax, 48(%rdi) -L_end_3072_sqr_avx2_12: - addq $0x60, %rsp - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - repz retq -#ifndef __APPLE__ -.size sp_3072_sqr_avx2_12,.-sp_3072_sqr_avx2_12 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -17269,535 +15947,6 @@ _sp_3072_mul_24: #ifndef __APPLE__ .size sp_3072_mul_24,.-sp_3072_mul_24 #endif /* __APPLE__ */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_dbl_12 -.type sp_3072_dbl_12,@function -.align 16 -sp_3072_dbl_12: -#else -.section __TEXT,__text -.globl _sp_3072_dbl_12 -.p2align 4 -_sp_3072_dbl_12: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq %rcx, 88(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_3072_dbl_12,.-sp_3072_dbl_12 -#endif /* __APPLE__ */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_3072_sqr_24 -.type sp_3072_sqr_24,@function -.align 16 -sp_3072_sqr_24: -#else -.section __TEXT,__text -.globl _sp_3072_sqr_24 -.p2align 4 -_sp_3072_sqr_24: -#endif /* __APPLE__ */ - subq $0x1f8, %rsp - movq %rdi, 480(%rsp) - movq %rsi, 488(%rsp) - leaq 384(%rsp), %r8 - leaq 96(%rsi), %r9 - # Add - movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx - movq 8(%rsi), %rax - movq %rdx, (%r8) - adcq 8(%r9), %rax - movq 16(%rsi), %rdx - movq %rax, 8(%r8) - adcq 16(%r9), %rdx - movq 24(%rsi), %rax - movq %rdx, 16(%r8) - adcq 24(%r9), %rax - movq 32(%rsi), %rdx - movq %rax, 24(%r8) - adcq 32(%r9), %rdx - movq 40(%rsi), %rax - movq %rdx, 32(%r8) - adcq 40(%r9), %rax - movq 48(%rsi), %rdx - movq %rax, 40(%r8) - adcq 48(%r9), %rdx - movq 56(%rsi), %rax - movq %rdx, 48(%r8) - adcq 56(%r9), %rax - movq 64(%rsi), %rdx - movq %rax, 56(%r8) - adcq 64(%r9), %rdx - movq 72(%rsi), %rax - movq %rdx, 64(%r8) - adcq 72(%r9), %rax - movq 80(%rsi), %rdx - movq %rax, 72(%r8) - adcq 80(%r9), %rdx - movq 88(%rsi), %rax - movq %rdx, 80(%r8) - adcq 88(%r9), %rax - movq %rax, 88(%r8) - adcq $0x00, %rcx - movq %rcx, 496(%rsp) - movq %r8, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_12@plt -#else - callq _sp_3072_sqr_12 -#endif /* __APPLE__ */ - movq 488(%rsp), %rsi - leaq 192(%rsp), %rdi - addq $0x60, %rsi -#ifndef __APPLE__ - callq sp_3072_sqr_12@plt -#else - callq _sp_3072_sqr_12 -#endif /* __APPLE__ */ - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_12@plt -#else - callq _sp_3072_sqr_12 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi -#endif /* _WIN64 */ - movq 496(%rsp), %r10 - movq %rdi, %r9 - leaq 384(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0xc0, %r9 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, (%r9) - movq %rax, 8(%r9) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 16(%r9) - movq %rax, 24(%r9) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 32(%r9) - movq %rax, 40(%r9) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 48(%r9) - movq %rax, 56(%r9) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq 
%r10, %rax - movq %rdx, 64(%r9) - movq %rax, 72(%r9) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 80(%r9) - movq %rax, 88(%r9) - movq (%r9), %rdx - addq %rdx, %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq %rax, %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq %rdx, %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq %rax, %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq %rdx, %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq %rax, %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq %rdx, %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq %rax, %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq %rdx, %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq %rax, %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq %rdx, %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq %rax, %rax - movq %rax, 88(%r9) - adcq $0x00, %rcx - leaq 192(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq %rax, 184(%r8) - sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - 
movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq %rax, 184(%r8) - sbbq $0x00, %rcx - subq $0x60, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - movq %rcx, 288(%rdi) - # Add in place - movq 96(%r9), %rdx - addq (%rsi), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 8(%rsi), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 16(%rsi), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 24(%rsi), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 32(%rsi), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 40(%rsi), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 48(%rsi), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 56(%rsi), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 64(%rsi), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 72(%rsi), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 80(%rsi), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 88(%rsi), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 96(%rsi), %rdx - movq %rdx, 192(%r9) - # Add to zero - movq 104(%rsi), %rdx - adcq $0x00, %rdx - movq 112(%rsi), %rax - movq %rdx, 200(%r9) - adcq $0x00, %rax - movq 120(%rsi), %rdx - movq %rax, 208(%r9) - adcq $0x00, %rdx - movq 128(%rsi), %rax - movq %rdx, 216(%r9) - adcq $0x00, %rax - movq 136(%rsi), %rdx - movq %rax, 224(%r9) - adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 232(%r9) - adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 240(%r9) - adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 248(%r9) - adcq $0x00, %rax - movq 
168(%rsi), %rdx - movq %rax, 256(%r9) - adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 264(%r9) - adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 272(%r9) - adcq $0x00, %rdx - movq %rdx, 280(%r9) - addq $0x1f8, %rsp - repz retq -#ifndef __APPLE__ -.size sp_3072_sqr_24,.-sp_3072_sqr_24 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Multiply a and b into r. (r = a * b) * @@ -18332,454 +16481,6 @@ _sp_3072_mul_avx2_24: .size sp_3072_mul_avx2_24,.-sp_3072_mul_avx2_24 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -#ifdef HAVE_INTEL_AVX2 -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_3072_sqr_avx2_24 -.type sp_3072_sqr_avx2_24,@function -.align 16 -sp_3072_sqr_avx2_24: -#else -.section __TEXT,__text -.globl _sp_3072_sqr_avx2_24 -.p2align 4 -_sp_3072_sqr_avx2_24: -#endif /* __APPLE__ */ - subq $0x1f8, %rsp - movq %rdi, 480(%rsp) - movq %rsi, 488(%rsp) - leaq 384(%rsp), %r8 - leaq 96(%rsi), %r9 - # Add - movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx - movq 8(%rsi), %rax - movq %rdx, (%r8) - adcq 8(%r9), %rax - movq 16(%rsi), %rdx - movq %rax, 8(%r8) - adcq 16(%r9), %rdx - movq 24(%rsi), %rax - movq %rdx, 16(%r8) - adcq 24(%r9), %rax - movq 32(%rsi), %rdx - movq %rax, 24(%r8) - adcq 32(%r9), %rdx - movq 40(%rsi), %rax - movq %rdx, 32(%r8) - adcq 40(%r9), %rax - movq 48(%rsi), %rdx - movq %rax, 40(%r8) - adcq 48(%r9), %rdx - movq 56(%rsi), %rax - movq %rdx, 48(%r8) - adcq 56(%r9), %rax - movq 64(%rsi), %rdx - movq %rax, 56(%r8) - adcq 64(%r9), %rdx - movq 72(%rsi), %rax - movq %rdx, 64(%r8) - adcq 72(%r9), %rax - movq 80(%rsi), %rdx - movq %rax, 72(%r8) - adcq 80(%r9), %rdx - movq 88(%rsi), %rax - movq %rdx, 80(%r8) - adcq 88(%r9), %rax - movq %rax, 88(%r8) - adcq $0x00, %rcx - movq %rcx, 496(%rsp) - movq %r8, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_avx2_12@plt -#else - callq _sp_3072_sqr_avx2_12 -#endif /* __APPLE__ */ - movq 488(%rsp), %rsi - leaq 192(%rsp), %rdi - addq $0x60, %rsi -#ifndef __APPLE__ - callq sp_3072_sqr_avx2_12@plt -#else - callq _sp_3072_sqr_avx2_12 -#endif /* __APPLE__ */ - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_avx2_12@plt -#else - callq _sp_3072_sqr_avx2_12 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi -#endif /* _WIN64 */ - movq 496(%rsp), %r10 - movq %rdi, %r9 - leaq 384(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0xc0, %r9 - movq (%r8), %rdx - pextq %r10, %rdx, %rdx - addq %rdx, %rdx - movq 8(%r8), %rax - movq %rdx, (%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 16(%r8), %rdx - movq %rax, 8(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 32(%r8), %rdx - movq %rax, 24(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 48(%r8), %rdx - movq %rax, 40(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 64(%r8), %rdx - movq %rax, 56(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 80(%r8), %rdx - movq %rax, 72(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq %rax, 88(%r9) - adcq 
$0x00, %rcx - leaq 192(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq %rax, 184(%r8) - sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq %rax, 184(%r8) - sbbq $0x00, %rcx - subq $0x60, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - 
movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - movq %rcx, 288(%rdi) - # Add in place - movq 96(%r9), %rdx - addq (%rsi), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 8(%rsi), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 16(%rsi), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 24(%rsi), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 32(%rsi), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 40(%rsi), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 48(%rsi), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 56(%rsi), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 64(%rsi), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 72(%rsi), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 80(%rsi), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 88(%rsi), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 96(%rsi), %rdx - movq %rdx, 192(%r9) - # Add to zero - movq 104(%rsi), %rdx - adcq $0x00, %rdx - movq 112(%rsi), %rax - movq %rdx, 200(%r9) - adcq $0x00, %rax - movq 120(%rsi), %rdx - movq %rax, 208(%r9) - adcq $0x00, %rdx - movq 128(%rsi), %rax - movq %rdx, 216(%r9) - adcq $0x00, %rax - movq 136(%rsi), %rdx - movq %rax, 224(%r9) - adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 232(%r9) - adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 240(%r9) - adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 248(%r9) - adcq $0x00, %rax - movq 168(%rsi), %rdx - movq %rax, 256(%r9) - adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 264(%r9) - adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 272(%r9) - adcq $0x00, %rdx - movq %rdx, 280(%r9) - addq $0x1f8, %rsp - repz retq -#ifndef __APPLE__ -.size sp_3072_sqr_avx2_24,.-sp_3072_sqr_avx2_24 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -20152,967 +17853,6 @@ _sp_3072_mul_48: #ifndef __APPLE__ .size sp_3072_mul_48,.-sp_3072_mul_48 #endif /* __APPLE__ */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_dbl_24 -.type sp_3072_dbl_24,@function -.align 16 -sp_3072_dbl_24: -#else -.section __TEXT,__text -.globl _sp_3072_dbl_24 -.p2align 4 -_sp_3072_dbl_24: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq 96(%rsi), %rdx - movq %rcx, 88(%rdi) - adcq %rdx, %rdx - movq 104(%rsi), %rcx - movq %rdx, 96(%rdi) - adcq %rcx, %rcx - movq 112(%rsi), %rdx - movq %rcx, 104(%rdi) - adcq %rdx, %rdx - movq 120(%rsi), %rcx - movq %rdx, 112(%rdi) - adcq %rcx, %rcx - movq 128(%rsi), %rdx - movq %rcx, 120(%rdi) - adcq %rdx, %rdx - movq 136(%rsi), %rcx - movq %rdx, 128(%rdi) - adcq %rcx, %rcx - movq 144(%rsi), %rdx - movq %rcx, 136(%rdi) - adcq %rdx, %rdx - movq 152(%rsi), %rcx - movq %rdx, 144(%rdi) - adcq %rcx, %rcx - movq 160(%rsi), %rdx - movq %rcx, 152(%rdi) - adcq %rdx, %rdx - movq 168(%rsi), %rcx - movq %rdx, 160(%rdi) - adcq %rcx, %rcx - movq 176(%rsi), %rdx - movq %rcx, 168(%rdi) - adcq %rdx, %rdx - movq 184(%rsi), %rcx - movq %rdx, 176(%rdi) - adcq %rcx, %rcx - movq %rcx, 184(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_3072_dbl_24,.-sp_3072_dbl_24 -#endif /* __APPLE__ */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_sqr_48 -.type sp_3072_sqr_48,@function -.align 16 -sp_3072_sqr_48: -#else -.section __TEXT,__text -.globl _sp_3072_sqr_48 -.p2align 4 -_sp_3072_sqr_48: -#endif /* __APPLE__ */ - subq $0x3d8, %rsp - movq %rdi, 960(%rsp) - movq %rsi, 968(%rsp) - leaq 768(%rsp), %r8 - leaq 192(%rsi), %r9 - # Add - movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx - movq 8(%rsi), %rax - movq %rdx, (%r8) - adcq 8(%r9), %rax - movq 16(%rsi), %rdx - movq %rax, 8(%r8) - adcq 16(%r9), %rdx - movq 24(%rsi), %rax - movq %rdx, 16(%r8) - adcq 24(%r9), %rax - movq 32(%rsi), %rdx - movq %rax, 24(%r8) - adcq 32(%r9), %rdx - movq 40(%rsi), %rax - movq %rdx, 32(%r8) - adcq 40(%r9), %rax - movq 48(%rsi), %rdx - movq %rax, 40(%r8) - adcq 48(%r9), %rdx - movq 56(%rsi), %rax - movq %rdx, 48(%r8) - adcq 56(%r9), %rax - movq 64(%rsi), %rdx - movq %rax, 56(%r8) - adcq 64(%r9), %rdx - movq 72(%rsi), %rax - movq %rdx, 64(%r8) - adcq 72(%r9), %rax - movq 80(%rsi), %rdx - movq %rax, 72(%r8) - adcq 80(%r9), %rdx - movq 88(%rsi), %rax - movq %rdx, 80(%r8) - adcq 88(%r9), %rax - movq 96(%rsi), %rdx - movq %rax, 88(%r8) - adcq 96(%r9), %rdx - movq 104(%rsi), %rax - movq %rdx, 96(%r8) - adcq 104(%r9), %rax - movq 112(%rsi), %rdx - movq %rax, 104(%r8) - adcq 112(%r9), %rdx - movq 120(%rsi), %rax - movq %rdx, 112(%r8) - adcq 120(%r9), %rax - movq 128(%rsi), %rdx - movq %rax, 120(%r8) - adcq 128(%r9), %rdx - movq 136(%rsi), %rax - movq %rdx, 128(%r8) - adcq 136(%r9), %rax - movq 144(%rsi), %rdx - movq %rax, 136(%r8) - adcq 144(%r9), %rdx - movq 152(%rsi), %rax - movq %rdx, 144(%r8) - adcq 152(%r9), %rax - movq 160(%rsi), %rdx - movq %rax, 152(%r8) - adcq 160(%r9), %rdx - movq 168(%rsi), %rax - movq %rdx, 160(%r8) - adcq 168(%r9), %rax - movq 176(%rsi), %rdx - movq %rax, 168(%r8) - adcq 176(%r9), %rdx - movq 184(%rsi), %rax - movq %rdx, 176(%r8) - adcq 184(%r9), %rax - movq %rax, 184(%r8) - adcq $0x00, %rcx - movq %rcx, 976(%rsp) - movq %r8, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_24@plt -#else - callq _sp_3072_sqr_24 -#endif /* __APPLE__ */ - movq 968(%rsp), %rsi - leaq 384(%rsp), %rdi - addq $0xc0, %rsi -#ifndef __APPLE__ - callq sp_3072_sqr_24@plt -#else - callq _sp_3072_sqr_24 -#endif /* __APPLE__ */ - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi -#ifndef __APPLE__ - callq sp_3072_sqr_24@plt -#else - callq _sp_3072_sqr_24 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi -#endif /* _WIN64 */ - movq 976(%rsp), %r10 - movq %rdi, %r9 - leaq 768(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0x180, %r9 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, (%r9) - movq %rax, 8(%r9) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 16(%r9) - movq %rax, 24(%r9) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 32(%r9) - movq %rax, 40(%r9) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 48(%r9) - movq %rax, 56(%r9) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 64(%r9) - movq %rax, 72(%r9) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 80(%r9) - movq %rax, 88(%r9) - movq 96(%r8), %rdx - movq 104(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 96(%r9) - movq %rax, 104(%r9) - movq 112(%r8), %rdx - movq 120(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 112(%r9) - movq %rax, 
120(%r9) - movq 128(%r8), %rdx - movq 136(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 128(%r9) - movq %rax, 136(%r9) - movq 144(%r8), %rdx - movq 152(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 144(%r9) - movq %rax, 152(%r9) - movq 160(%r8), %rdx - movq 168(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 160(%r9) - movq %rax, 168(%r9) - movq 176(%r8), %rdx - movq 184(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 176(%r9) - movq %rax, 184(%r9) - movq (%r9), %rdx - addq %rdx, %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq %rax, %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq %rdx, %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq %rax, %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq %rdx, %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq %rax, %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq %rdx, %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq %rax, %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq %rdx, %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq %rax, %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq %rdx, %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq %rax, %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq %rdx, %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq %rax, %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq %rdx, %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq %rax, %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq %rdx, %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq %rax, %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq %rdx, %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq %rax, %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq %rdx, %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq %rax, %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq %rdx, %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq %rax, %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - leaq 384(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), 
%rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq %rax, 376(%r8) - sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - 
movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq %rax, 376(%r8) - sbbq $0x00, %rcx - subq $0xc0, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 192(%r8), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 200(%r8), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 208(%r8), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 216(%r8), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 224(%r8), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 232(%r8), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 240(%r8), %rdx - movq 248(%r9), %rax - movq %rdx, 240(%r9) - adcq 248(%r8), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 256(%r8), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 264(%r8), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 
272(%r8), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 280(%r8), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 288(%r8), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 296(%r8), %rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 304(%r8), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 312(%r8), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 320(%r8), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 328(%r8), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 336(%r8), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 344(%r8), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 352(%r8), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 360(%r8), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 368(%r8), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 376(%r8), %rax - movq %rax, 376(%r9) - adcq $0x00, %rcx - movq %rcx, 576(%rdi) - # Add in place - movq 192(%r9), %rdx - addq (%rsi), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 8(%rsi), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 16(%rsi), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 24(%rsi), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 32(%rsi), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 40(%rsi), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 48(%rsi), %rdx - movq 248(%r9), %rax - movq %rdx, 240(%r9) - adcq 56(%rsi), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 64(%rsi), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 72(%rsi), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 80(%rsi), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 88(%rsi), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 96(%rsi), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 104(%rsi), %rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 112(%rsi), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 120(%rsi), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 128(%rsi), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 136(%rsi), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 144(%rsi), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 152(%rsi), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 160(%rsi), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 168(%rsi), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 176(%rsi), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 184(%rsi), %rax - movq 384(%r9), %rdx - movq %rax, 376(%r9) - adcq 192(%rsi), %rdx - movq %rdx, 384(%r9) - # Add to zero - movq 200(%rsi), %rdx - adcq $0x00, %rdx - movq 208(%rsi), %rax - movq %rdx, 392(%r9) - adcq $0x00, %rax - movq 216(%rsi), %rdx - movq %rax, 400(%r9) - adcq $0x00, %rdx - movq 224(%rsi), %rax - movq %rdx, 408(%r9) - adcq $0x00, %rax - movq 232(%rsi), %rdx - movq %rax, 416(%r9) - adcq $0x00, %rdx - movq 240(%rsi), %rax - movq %rdx, 424(%r9) - adcq $0x00, %rax - movq 248(%rsi), %rdx - movq %rax, 432(%r9) - adcq $0x00, %rdx - movq 256(%rsi), %rax - movq %rdx, 440(%r9) - adcq $0x00, %rax - movq 264(%rsi), %rdx - movq %rax, 448(%r9) - adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 456(%r9) - adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 464(%r9) - adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 472(%r9) - adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 480(%r9) - adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 488(%r9) - adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 496(%r9) - adcq 
$0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 504(%r9) - adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 512(%r9) - adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 520(%r9) - adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 528(%r9) - adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 536(%r9) - adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 544(%r9) - adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 552(%r9) - adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 560(%r9) - adcq $0x00, %rdx - movq %rdx, 568(%r9) - addq $0x3d8, %rsp - repz retq -#ifndef __APPLE__ -.size sp_3072_sqr_48,.-sp_3072_sqr_48 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Multiply a and b into r. (r = a * b) * @@ -22079,6 +18819,3266 @@ _sp_3072_mul_avx2_48: .size sp_3072_mul_avx2_48,.-sp_3072_mul_avx2_48 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_3072_sqr_12 +.type sp_3072_sqr_12,@function +.align 16 +sp_3072_sqr_12: +#else +.section __TEXT,__text +.globl _sp_3072_sqr_12 +.p2align 4 +_sp_3072_sqr_12: +#endif /* __APPLE__ */ + pushq %r12 + subq $0x60, %rsp + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + xorq %r9, %r9 + movq %rax, (%rsp) + movq %rdx, %r8 + # A[0] * A[1] + movq 8(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 8(%rsp) + # A[0] * A[2] + movq 16(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + movq %r9, 16(%rsp) + # A[0] * A[3] + movq 24(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + # A[1] * A[2] + movq 16(%rsi), %rax + mulq 8(%rsi) + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + movq %rcx, 24(%rsp) + # A[0] * A[4] + movq 32(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[1] * A[3] + movq 24(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 32(%rsp) + # A[0] * A[5] + movq 40(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[4] + movq 32(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[3] + movq 24(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 40(%rsp) + # A[0] * A[6] + movq 48(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[5] + movq 40(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[4] + movq 32(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq 
%r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 48(%rsp) + # A[0] * A[7] + movq 56(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[6] + movq 48(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[5] + movq 40(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[4] + movq 32(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 56(%rsp) + # A[0] * A[8] + movq 64(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[7] + movq 56(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[6] + movq 48(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[5] + movq 40(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[4] + movq 32(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 64(%rsp) + # A[0] * A[9] + movq 72(%rsi), %rax + mulq (%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[8] + movq 64(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[7] + movq 56(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[6] + movq 48(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[5] + movq 40(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 72(%rsp) + # A[0] * A[10] + movq 80(%rsi), %rax + mulq (%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[9] + movq 72(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[8] + movq 64(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[7] + movq 56(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[6] + movq 48(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[5] + movq 40(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 80(%rsp) + # A[0] * A[11] + movq 88(%rsi), %rax + mulq (%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[1] * A[10] + movq 80(%rsi), %rax + mulq 8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * A[9] + movq 72(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[8] + movq 64(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[7] + movq 56(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[6] + movq 48(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, 
%r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 88(%rsp) + # A[1] * A[11] + movq 88(%rsi), %rax + mulq 8(%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[2] * A[10] + movq 80(%rsi), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[3] * A[9] + movq 72(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[8] + movq 64(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[7] + movq 56(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[6] + movq 48(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 96(%rdi) + # A[2] * A[11] + movq 88(%rsi), %rax + mulq 16(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[3] * A[10] + movq 80(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[4] * A[9] + movq 72(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[8] + movq 64(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[7] + movq 56(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 104(%rdi) + # A[3] * A[11] + movq 88(%rsi), %rax + mulq 24(%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[4] * A[10] + movq 80(%rsi), %rax + mulq 32(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[5] * A[9] + movq 72(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[8] + movq 64(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[7] + movq 56(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 112(%rdi) + # A[4] * A[11] + movq 88(%rsi), %rax + mulq 32(%rsi) + xorq %r9, %r9 + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[5] * A[10] + movq 80(%rsi), %rax + mulq 40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[6] * A[9] + movq 72(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[8] + movq 64(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + movq %rcx, 120(%rdi) + # A[5] * A[11] + movq 88(%rsi), %rax + mulq 40(%rsi) + xorq %rcx, %rcx + xorq %r12, %r12 + movq %rax, %r10 + movq %rdx, %r11 + # A[6] * A[10] + movq 80(%rsi), %rax + mulq 48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[7] * A[9] + movq 72(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[8] + movq 64(%rsi), %rax + mulq %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r8 + adcq %r11, %r9 + adcq %r12, %rcx + movq %r8, 128(%rdi) + # A[6] * A[11] + movq 88(%rsi), %rax + mulq 48(%rsi) + xorq %r8, %r8 + xorq %r12, %r12 + movq %rax, %r10 
+ movq %rdx, %r11 + # A[7] * A[10] + movq 80(%rsi), %rax + mulq 56(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[8] * A[9] + movq 72(%rsi), %rax + mulq 64(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + addq %r10, %r9 + adcq %r11, %rcx + adcq %r12, %r8 + movq %r9, 136(%rdi) + # A[7] * A[11] + movq 88(%rsi), %rax + mulq 56(%rsi) + xorq %r9, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + # A[8] * A[10] + movq 80(%rsi), %rax + mulq 64(%rsi) + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + # A[9] * A[9] + movq 72(%rsi), %rax + mulq %rax + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + movq %rcx, 144(%rdi) + # A[8] * A[11] + movq 88(%rsi), %rax + mulq 64(%rsi) + xorq %rcx, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + # A[9] * A[10] + movq 80(%rsi), %rax + mulq 72(%rsi) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rcx + movq %r8, 152(%rdi) + # A[9] * A[11] + movq 88(%rsi), %rax + mulq 72(%rsi) + xorq %r8, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + # A[10] * A[10] + movq 80(%rsi), %rax + mulq %rax + addq %rax, %r9 + adcq %rdx, %rcx + adcq $0x00, %r8 + movq %r9, 160(%rdi) + # A[10] * A[11] + movq 88(%rsi), %rax + mulq 80(%rsi) + xorq %r9, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + addq %rax, %rcx + adcq %rdx, %r8 + adcq $0x00, %r9 + movq %rcx, 168(%rdi) + # A[11] * A[11] + movq 88(%rsi), %rax + mulq %rax + addq %rax, %r8 + adcq %rdx, %r9 + movq %r8, 176(%rdi) + movq %r9, 184(%rdi) + movq (%rsp), %rax + movq 8(%rsp), %rdx + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 + movq %rax, (%rdi) + movq %rdx, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq 32(%rsp), %rax + movq 40(%rsp), %rdx + movq 48(%rsp), %r10 + movq 56(%rsp), %r11 + movq %rax, 32(%rdi) + movq %rdx, 40(%rdi) + movq %r10, 48(%rdi) + movq %r11, 56(%rdi) + movq 64(%rsp), %rax + movq 72(%rsp), %rdx + movq 80(%rsp), %r10 + movq 88(%rsp), %r11 + movq %rax, 64(%rdi) + movq %rdx, 72(%rdi) + movq %r10, 80(%rdi) + movq %r11, 88(%rdi) + addq $0x60, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_3072_sqr_12,.-sp_3072_sqr_12 +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +#ifndef __APPLE__ +.text +.globl sp_3072_sqr_avx2_12 +.type sp_3072_sqr_avx2_12,@function +.align 16 +sp_3072_sqr_avx2_12: +#else +.section __TEXT,__text +.globl _sp_3072_sqr_avx2_12 +.p2align 4 +_sp_3072_sqr_avx2_12: +#endif /* __APPLE__ */ + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + subq $0x60, %rsp + cmpq %rdi, %rsi + movq %rsp, %rbp + cmovne %rdi, %rbp + addq $0x60, %rdi + xorq %r10, %r10 + # Diagonal 1 + # Zero into %r9 + # A[1] x A[0] + movq (%rsi), %rdx + mulxq 8(%rsi), %r8, %r9 + movq %r8, 8(%rbp) + # Zero into %r8 + # A[2] x A[0] + mulxq 16(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 16(%rbp) + # Zero into %r9 + # A[3] x A[0] + mulxq 24(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 24(%rbp) + # Zero into %r8 + # A[4] x A[0] + mulxq 32(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 32(%rbp) + # Zero into %r9 + # A[5] x A[0] + mulxq 40(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 40(%rbp) + # No load %r12 - %r8 + # A[6] x A[0] + mulxq 48(%rsi), %rax, %r12 + adcxq %rax, %r9 + adoxq %r10, %r12 + movq %r9, 48(%rbp) + # No load %r13 - %r9 + # A[7] x A[0] + mulxq 56(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %r10, %r13 + # No store %r12 - %r8 + # No load %r14 - %r8 + # A[8] x A[0] + mulxq 64(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %r10, %r14 + # No store %r13 - %r9 + # No load %r15 - %r9 + # A[9] x A[0] + mulxq 72(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %r10, %r15 + # No store %r14 - %r8 + # No load %rbx - %r8 + # A[10] x A[0] + mulxq 80(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %r10, %rbx + # No store %r15 - %r9 + # Zero into %r9 + # A[11] x A[0] + mulxq 88(%rsi), %rax, %r9 + adcxq %rax, %rbx + adoxq %r10, %r9 + # No store %rbx - %r8 + # Carry + adcxq %r10, %r9 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r9, (%rdi) + # Diagonal 2 + movq 24(%rbp), %r9 + movq 32(%rbp), %r8 + # A[2] x A[1] + movq 8(%rsi), %rdx + mulxq 16(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 24(%rbp) + movq 40(%rbp), %r9 + # A[3] x A[1] + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 32(%rbp) + movq 48(%rbp), %r8 + # A[4] x A[1] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 40(%rbp) + # No load %r12 - %r9 + # A[5] x A[1] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r12 + movq %r8, 48(%rbp) + # No load %r13 - %r8 + # A[6] x A[1] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # No store %r12 - %r9 + # No load %r14 - %r9 + # A[7] x A[1] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + # No store %r13 - %r8 + # No load %r15 - %r8 + # A[8] x A[1] + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r9 + # No load %rbx - %r9 + # A[9] x A[1] + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r8 + movq (%rdi), %r8 + # A[10] x A[1] + mulxq 80(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r8 + # No store %rbx - %r9 + # Zero into %r9 + # A[11] x A[1] + mulxq 88(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, (%rdi) + # Zero into %r8 + # A[11] x A[2] + movq 16(%rsi), %rdx + mulxq 88(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 8(%rdi) + # Carry + adcxq %r11, %r8 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r8, 16(%rdi) + # Diagonal 3 + movq 40(%rbp), %r8 + movq 48(%rbp), %r9 + # A[3] x A[2] + mulxq 
24(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 40(%rbp) + # No load %r12 - %r8 + # A[4] x A[2] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r12 + movq %r9, 48(%rbp) + # No load %r13 - %r9 + # A[5] x A[2] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # No store %r12 - %r8 + # No load %r14 - %r8 + # A[6] x A[2] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + # No store %r13 - %r9 + # No load %r15 - %r9 + # A[7] x A[2] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r8 + # No load %rbx - %r8 + # A[8] x A[2] + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r9 + movq (%rdi), %r9 + # A[9] x A[2] + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r9 + # No store %rbx - %r8 + movq 8(%rdi), %r8 + # A[10] x A[2] + mulxq 80(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, (%rdi) + movq 16(%rdi), %r9 + # A[10] x A[3] + movq 24(%rsi), %rdx + mulxq 80(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 8(%rdi) + # Zero into %r8 + # A[10] x A[4] + movq 32(%rsi), %rdx + mulxq 80(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 16(%rdi) + # Zero into %r9 + # A[10] x A[5] + movq 40(%rsi), %rdx + mulxq 80(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 24(%rdi) + # Carry + adcxq %r11, %r9 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r9, 32(%rdi) + # Diagonal 4 + # No load %r13 - %r8 + # A[4] x A[3] + movq 24(%rsi), %rdx + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # No store %r12 - %r9 + # No load %r14 - %r9 + # A[5] x A[3] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + # No store %r13 - %r8 + # No load %r15 - %r8 + # A[6] x A[3] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r9 + # No load %rbx - %r9 + # A[7] x A[3] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r8 + movq (%rdi), %r8 + # A[8] x A[3] + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r8 + # No store %rbx - %r9 + movq 8(%rdi), %r9 + # A[9] x A[3] + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, (%rdi) + movq 16(%rdi), %r8 + # A[9] x A[4] + movq 32(%rsi), %rdx + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 8(%rdi) + movq 24(%rdi), %r9 + # A[9] x A[5] + movq 40(%rsi), %rdx + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 16(%rdi) + movq 32(%rdi), %r8 + # A[9] x A[6] + movq 48(%rsi), %rdx + mulxq 72(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 24(%rdi) + # Zero into %r9 + # A[9] x A[7] + movq 56(%rsi), %rdx + mulxq 72(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 32(%rdi) + # Zero into %r8 + # A[9] x A[8] + movq 64(%rsi), %rdx + mulxq 72(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 40(%rdi) + # Carry + adcxq %r11, %r8 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r8, 48(%rdi) + # Diagonal 5 + # No load %r15 - %r9 + # A[5] x A[4] + movq 32(%rsi), %rdx + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r8 + # No load %rbx - %r8 + # A[6] x A[4] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r9 + movq (%rdi), %r9 + # A[7] x A[4] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r9 + # No store %rbx - %r8 + 
movq 8(%rdi), %r8 + # A[8] x A[4] + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, (%rdi) + movq 16(%rdi), %r9 + # A[8] x A[5] + movq 40(%rsi), %rdx + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 8(%rdi) + movq 24(%rdi), %r8 + # A[8] x A[6] + movq 48(%rsi), %rdx + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 16(%rdi) + movq 32(%rdi), %r9 + # A[8] x A[7] + movq 56(%rsi), %rdx + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 24(%rdi) + movq 40(%rdi), %r8 + # A[10] x A[6] + movq 48(%rsi), %rdx + mulxq 80(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 32(%rdi) + movq 48(%rdi), %r9 + # A[10] x A[7] + movq 56(%rsi), %rdx + mulxq 80(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 40(%rdi) + # Zero into %r8 + # A[10] x A[8] + movq 64(%rsi), %rdx + mulxq 80(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 48(%rdi) + # Zero into %r9 + # A[10] x A[9] + movq 72(%rsi), %rdx + mulxq 80(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 56(%rdi) + # Carry + adcxq %r11, %r9 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r9, 64(%rdi) + # Diagonal 6 + movq (%rdi), %r8 + # A[6] x A[5] + movq 40(%rsi), %rdx + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r8 + # No store %rbx - %r9 + movq 8(%rdi), %r9 + # A[7] x A[5] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, (%rdi) + movq 16(%rdi), %r8 + # A[7] x A[6] + movq 48(%rsi), %rdx + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 8(%rdi) + movq 24(%rdi), %r9 + # A[11] x A[3] + movq 24(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 16(%rdi) + movq 32(%rdi), %r8 + # A[11] x A[4] + movq 32(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 24(%rdi) + movq 40(%rdi), %r9 + # A[11] x A[5] + movq 40(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 32(%rdi) + movq 48(%rdi), %r8 + # A[11] x A[6] + movq 48(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 40(%rdi) + movq 56(%rdi), %r9 + # A[11] x A[7] + movq 56(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 48(%rdi) + movq 64(%rdi), %r8 + # A[11] x A[8] + movq 64(%rsi), %rdx + mulxq 88(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 56(%rdi) + # Zero into %r9 + # A[11] x A[9] + movq 72(%rsi), %rdx + mulxq 88(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 64(%rdi) + # Zero into %r8 + # A[11] x A[10] + movq 80(%rsi), %rdx + mulxq 88(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 72(%rdi) + # Carry + adcxq %r11, %r8 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r8, 80(%rdi) + movq %r11, 88(%rdi) + # Double and Add in A[i] x A[i] + movq 8(%rbp), %r9 + # A[0] x A[0] + movq (%rsi), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (%rbp) + adoxq %r9, %r9 + adcxq %rcx, %r9 + movq %r9, 8(%rbp) + movq 16(%rbp), %r8 + movq 24(%rbp), %r9 + # A[1] x A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 16(%rbp) + movq %r9, 24(%rbp) + movq 32(%rbp), %r8 + movq 40(%rbp), %r9 + # A[2] x A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 32(%rbp) + 
movq %r9, 40(%rbp) + movq 48(%rbp), %r8 + # A[3] x A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r12, %r12 + adcxq %rax, %r8 + adcxq %rcx, %r12 + movq %r8, 48(%rbp) + # A[4] x A[4] + movq 32(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r13, %r13 + adoxq %r14, %r14 + adcxq %rax, %r13 + adcxq %rcx, %r14 + # A[5] x A[5] + movq 40(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r15, %r15 + adoxq %rbx, %rbx + adcxq %rax, %r15 + adcxq %rcx, %rbx + movq (%rdi), %r8 + movq 8(%rdi), %r9 + # A[6] x A[6] + movq 48(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq 16(%rdi), %r8 + movq 24(%rdi), %r9 + # A[7] x A[7] + movq 56(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + movq 32(%rdi), %r8 + movq 40(%rdi), %r9 + # A[8] x A[8] + movq 64(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 32(%rdi) + movq %r9, 40(%rdi) + movq 48(%rdi), %r8 + movq 56(%rdi), %r9 + # A[9] x A[9] + movq 72(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 48(%rdi) + movq %r9, 56(%rdi) + movq 64(%rdi), %r8 + movq 72(%rdi), %r9 + # A[10] x A[10] + movq 80(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 64(%rdi) + movq %r9, 72(%rdi) + movq 80(%rdi), %r8 + movq 88(%rdi), %r9 + # A[11] x A[11] + movq 88(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 80(%rdi) + movq %r9, 88(%rdi) + movq %r12, -40(%rdi) + movq %r13, -32(%rdi) + movq %r14, -24(%rdi) + movq %r15, -16(%rdi) + movq %rbx, -8(%rdi) + subq $0x60, %rdi + cmpq %rdi, %rsi + jne L_end_3072_sqr_avx2_12 + vmovdqu (%rbp), %xmm0 + vmovups %xmm0, (%rdi) + vmovdqu 16(%rbp), %xmm0 + vmovups %xmm0, 16(%rdi) + vmovdqu 32(%rbp), %xmm0 + vmovups %xmm0, 32(%rdi) + movq 48(%rbp), %rax + movq %rax, 48(%rdi) +L_end_3072_sqr_avx2_12: + addq $0x60, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + repz retq +#ifndef __APPLE__ +.size sp_3072_sqr_avx2_12,.-sp_3072_sqr_avx2_12 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ +/* Add a to a into r. (r = a + a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +#ifndef __APPLE__ +.text +.globl sp_3072_dbl_12 +.type sp_3072_dbl_12,@function +.align 16 +sp_3072_dbl_12: +#else +.section __TEXT,__text +.globl _sp_3072_dbl_12 +.p2align 4 +_sp_3072_dbl_12: +#endif /* __APPLE__ */ + movq (%rsi), %rdx + xorq %rax, %rax + addq %rdx, %rdx + movq 8(%rsi), %rcx + movq %rdx, (%rdi) + adcq %rcx, %rcx + movq 16(%rsi), %rdx + movq %rcx, 8(%rdi) + adcq %rdx, %rdx + movq 24(%rsi), %rcx + movq %rdx, 16(%rdi) + adcq %rcx, %rcx + movq 32(%rsi), %rdx + movq %rcx, 24(%rdi) + adcq %rdx, %rdx + movq 40(%rsi), %rcx + movq %rdx, 32(%rdi) + adcq %rcx, %rcx + movq 48(%rsi), %rdx + movq %rcx, 40(%rdi) + adcq %rdx, %rdx + movq 56(%rsi), %rcx + movq %rdx, 48(%rdi) + adcq %rcx, %rcx + movq 64(%rsi), %rdx + movq %rcx, 56(%rdi) + adcq %rdx, %rdx + movq 72(%rsi), %rcx + movq %rdx, 64(%rdi) + adcq %rcx, %rcx + movq 80(%rsi), %rdx + movq %rcx, 72(%rdi) + adcq %rdx, %rdx + movq 88(%rsi), %rcx + movq %rdx, 80(%rdi) + adcq %rcx, %rcx + movq %rcx, 88(%rdi) + adcq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_3072_dbl_12,.-sp_3072_dbl_12 +#endif /* __APPLE__ */ +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_3072_sqr_24 +.type sp_3072_sqr_24,@function +.align 16 +sp_3072_sqr_24: +#else +.section __TEXT,__text +.globl _sp_3072_sqr_24 +.p2align 4 +_sp_3072_sqr_24: +#endif /* __APPLE__ */ + subq $0x1f8, %rsp + movq %rdi, 480(%rsp) + movq %rsi, 488(%rsp) + leaq 384(%rsp), %r8 + leaq 96(%rsi), %r9 + # Add + movq (%rsi), %rdx + xorq %rcx, %rcx + addq (%r9), %rdx + movq 8(%rsi), %rax + movq %rdx, (%r8) + adcq 8(%r9), %rax + movq 16(%rsi), %rdx + movq %rax, 8(%r8) + adcq 16(%r9), %rdx + movq 24(%rsi), %rax + movq %rdx, 16(%r8) + adcq 24(%r9), %rax + movq 32(%rsi), %rdx + movq %rax, 24(%r8) + adcq 32(%r9), %rdx + movq 40(%rsi), %rax + movq %rdx, 32(%r8) + adcq 40(%r9), %rax + movq 48(%rsi), %rdx + movq %rax, 40(%r8) + adcq 48(%r9), %rdx + movq 56(%rsi), %rax + movq %rdx, 48(%r8) + adcq 56(%r9), %rax + movq 64(%rsi), %rdx + movq %rax, 56(%r8) + adcq 64(%r9), %rdx + movq 72(%rsi), %rax + movq %rdx, 64(%r8) + adcq 72(%r9), %rax + movq 80(%rsi), %rdx + movq %rax, 72(%r8) + adcq 80(%r9), %rdx + movq 88(%rsi), %rax + movq %rdx, 80(%r8) + adcq 88(%r9), %rax + movq %rax, 88(%r8) + adcq $0x00, %rcx + movq %rcx, 496(%rsp) + movq %r8, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_12@plt +#else + callq _sp_3072_sqr_12 +#endif /* __APPLE__ */ + movq 488(%rsp), %rsi + leaq 192(%rsp), %rdi + addq $0x60, %rsi +#ifndef __APPLE__ + callq sp_3072_sqr_12@plt +#else + callq _sp_3072_sqr_12 +#endif /* __APPLE__ */ + movq 488(%rsp), %rsi + movq 480(%rsp), %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_12@plt +#else + callq _sp_3072_sqr_12 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 488(%rsp), %rsi + movq 480(%rsp), %rdi +#endif /* _WIN64 */ + movq 496(%rsp), %r10 + movq %rdi, %r9 + leaq 384(%rsp), %r8 + movq %r10, %rcx + negq %r10 + addq $0xc0, %r9 + movq (%r8), %rdx + movq 8(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, (%r9) + movq %rax, 8(%r9) + movq 16(%r8), %rdx + movq 24(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 16(%r9) + movq %rax, 24(%r9) + movq 32(%r8), %rdx + movq 40(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 32(%r9) + movq %rax, 40(%r9) + movq 48(%r8), %rdx + movq 56(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 48(%r9) + movq %rax, 56(%r9) + movq 64(%r8), %rdx + movq 72(%r8), %rax + andq %r10, %rdx + andq 
%r10, %rax + movq %rdx, 64(%r9) + movq %rax, 72(%r9) + movq 80(%r8), %rdx + movq 88(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 80(%r9) + movq %rax, 88(%r9) + movq (%r9), %rdx + addq %rdx, %rdx + movq 8(%r9), %rax + movq %rdx, (%r9) + adcq %rax, %rax + movq 16(%r9), %rdx + movq %rax, 8(%r9) + adcq %rdx, %rdx + movq 24(%r9), %rax + movq %rdx, 16(%r9) + adcq %rax, %rax + movq 32(%r9), %rdx + movq %rax, 24(%r9) + adcq %rdx, %rdx + movq 40(%r9), %rax + movq %rdx, 32(%r9) + adcq %rax, %rax + movq 48(%r9), %rdx + movq %rax, 40(%r9) + adcq %rdx, %rdx + movq 56(%r9), %rax + movq %rdx, 48(%r9) + adcq %rax, %rax + movq 64(%r9), %rdx + movq %rax, 56(%r9) + adcq %rdx, %rdx + movq 72(%r9), %rax + movq %rdx, 64(%r9) + adcq %rax, %rax + movq 80(%r9), %rdx + movq %rax, 72(%r9) + adcq %rdx, %rdx + movq 88(%r9), %rax + movq %rdx, 80(%r9) + adcq %rax, %rax + movq %rax, 88(%r9) + adcq $0x00, %rcx + leaq 192(%rsp), %rsi + movq %rsp, %r8 + movq (%r8), %rdx + subq (%rsi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rsi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rsi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rsi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rsi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rsi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rsi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rsi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rsi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rsi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rsi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rsi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rsi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rsi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rsi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rsi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rsi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rsi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rsi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rsi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rsi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rsi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rsi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rsi), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + movq (%r8), %rdx + subq (%rdi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rdi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rdi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rdi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rdi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rdi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rdi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rdi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rdi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rdi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rdi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rdi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rdi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rdi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rdi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rdi), %rax + movq 128(%r8), %rdx + 
movq %rax, 120(%r8) + sbbq 128(%rdi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rdi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rdi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rdi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rdi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rdi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rdi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rdi), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + subq $0x60, %r9 + # Add in place + movq (%r9), %rdx + addq (%r8), %rdx + movq 8(%r9), %rax + movq %rdx, (%r9) + adcq 8(%r8), %rax + movq 16(%r9), %rdx + movq %rax, 8(%r9) + adcq 16(%r8), %rdx + movq 24(%r9), %rax + movq %rdx, 16(%r9) + adcq 24(%r8), %rax + movq 32(%r9), %rdx + movq %rax, 24(%r9) + adcq 32(%r8), %rdx + movq 40(%r9), %rax + movq %rdx, 32(%r9) + adcq 40(%r8), %rax + movq 48(%r9), %rdx + movq %rax, 40(%r9) + adcq 48(%r8), %rdx + movq 56(%r9), %rax + movq %rdx, 48(%r9) + adcq 56(%r8), %rax + movq 64(%r9), %rdx + movq %rax, 56(%r9) + adcq 64(%r8), %rdx + movq 72(%r9), %rax + movq %rdx, 64(%r9) + adcq 72(%r8), %rax + movq 80(%r9), %rdx + movq %rax, 72(%r9) + adcq 80(%r8), %rdx + movq 88(%r9), %rax + movq %rdx, 80(%r9) + adcq 88(%r8), %rax + movq 96(%r9), %rdx + movq %rax, 88(%r9) + adcq 96(%r8), %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq 104(%r8), %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq 112(%r8), %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq 120(%r8), %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq 128(%r8), %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq 136(%r8), %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq 144(%r8), %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq 152(%r8), %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq 160(%r8), %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq 168(%r8), %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq 176(%r8), %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq 184(%r8), %rax + movq %rax, 184(%r9) + adcq $0x00, %rcx + movq %rcx, 288(%rdi) + # Add in place + movq 96(%r9), %rdx + addq (%rsi), %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq 8(%rsi), %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq 16(%rsi), %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq 24(%rsi), %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq 32(%rsi), %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq 40(%rsi), %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq 48(%rsi), %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq 56(%rsi), %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq 64(%rsi), %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq 72(%rsi), %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq 80(%rsi), %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq 88(%rsi), %rax + movq 192(%r9), %rdx + movq %rax, 184(%r9) + adcq 96(%rsi), %rdx + movq %rdx, 192(%r9) + # Add to zero + movq 104(%rsi), %rdx + adcq $0x00, %rdx + movq 112(%rsi), %rax + movq %rdx, 200(%r9) + adcq $0x00, %rax + movq 120(%rsi), %rdx + movq %rax, 208(%r9) + adcq $0x00, %rdx + movq 128(%rsi), %rax + movq %rdx, 216(%r9) + adcq $0x00, %rax + movq 136(%rsi), %rdx + movq %rax, 224(%r9) + adcq $0x00, %rdx + movq 144(%rsi), %rax + movq %rdx, 232(%r9) + adcq $0x00, %rax + movq 152(%rsi), %rdx + movq %rax, 240(%r9) + adcq $0x00, %rdx + movq 160(%rsi), %rax + movq %rdx, 248(%r9) + adcq $0x00, %rax + movq 
168(%rsi), %rdx + movq %rax, 256(%r9) + adcq $0x00, %rdx + movq 176(%rsi), %rax + movq %rdx, 264(%r9) + adcq $0x00, %rax + movq 184(%rsi), %rdx + movq %rax, 272(%r9) + adcq $0x00, %rdx + movq %rdx, 280(%r9) + addq $0x1f8, %rsp + repz retq +#ifndef __APPLE__ +.size sp_3072_sqr_24,.-sp_3072_sqr_24 +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_3072_sqr_avx2_24 +.type sp_3072_sqr_avx2_24,@function +.align 16 +sp_3072_sqr_avx2_24: +#else +.section __TEXT,__text +.globl _sp_3072_sqr_avx2_24 +.p2align 4 +_sp_3072_sqr_avx2_24: +#endif /* __APPLE__ */ + subq $0x1f8, %rsp + movq %rdi, 480(%rsp) + movq %rsi, 488(%rsp) + leaq 384(%rsp), %r8 + leaq 96(%rsi), %r9 + # Add + movq (%rsi), %rdx + xorq %rcx, %rcx + addq (%r9), %rdx + movq 8(%rsi), %rax + movq %rdx, (%r8) + adcq 8(%r9), %rax + movq 16(%rsi), %rdx + movq %rax, 8(%r8) + adcq 16(%r9), %rdx + movq 24(%rsi), %rax + movq %rdx, 16(%r8) + adcq 24(%r9), %rax + movq 32(%rsi), %rdx + movq %rax, 24(%r8) + adcq 32(%r9), %rdx + movq 40(%rsi), %rax + movq %rdx, 32(%r8) + adcq 40(%r9), %rax + movq 48(%rsi), %rdx + movq %rax, 40(%r8) + adcq 48(%r9), %rdx + movq 56(%rsi), %rax + movq %rdx, 48(%r8) + adcq 56(%r9), %rax + movq 64(%rsi), %rdx + movq %rax, 56(%r8) + adcq 64(%r9), %rdx + movq 72(%rsi), %rax + movq %rdx, 64(%r8) + adcq 72(%r9), %rax + movq 80(%rsi), %rdx + movq %rax, 72(%r8) + adcq 80(%r9), %rdx + movq 88(%rsi), %rax + movq %rdx, 80(%r8) + adcq 88(%r9), %rax + movq %rax, 88(%r8) + adcq $0x00, %rcx + movq %rcx, 496(%rsp) + movq %r8, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_avx2_12@plt +#else + callq _sp_3072_sqr_avx2_12 +#endif /* __APPLE__ */ + movq 488(%rsp), %rsi + leaq 192(%rsp), %rdi + addq $0x60, %rsi +#ifndef __APPLE__ + callq sp_3072_sqr_avx2_12@plt +#else + callq _sp_3072_sqr_avx2_12 +#endif /* __APPLE__ */ + movq 488(%rsp), %rsi + movq 480(%rsp), %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_avx2_12@plt +#else + callq _sp_3072_sqr_avx2_12 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 488(%rsp), %rsi + movq 480(%rsp), %rdi +#endif /* _WIN64 */ + movq 496(%rsp), %r10 + movq %rdi, %r9 + leaq 384(%rsp), %r8 + movq %r10, %rcx + negq %r10 + addq $0xc0, %r9 + movq (%r8), %rdx + pextq %r10, %rdx, %rdx + addq %rdx, %rdx + movq 8(%r8), %rax + movq %rdx, (%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq 16(%r8), %rdx + movq %rax, 8(%r9) + pextq %r10, %rdx, %rdx + adcq %rdx, %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq 32(%r8), %rdx + movq %rax, 24(%r9) + pextq %r10, %rdx, %rdx + adcq %rdx, %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq 48(%r8), %rdx + movq %rax, 40(%r9) + pextq %r10, %rdx, %rdx + adcq %rdx, %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq 64(%r8), %rdx + movq %rax, 56(%r9) + pextq %r10, %rdx, %rdx + adcq %rdx, %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq 80(%r8), %rdx + movq %rax, 72(%r9) + pextq %r10, %rdx, %rdx + adcq %rdx, %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r9) + pextq %r10, %rax, %rax + adcq %rax, %rax + movq %rax, 88(%r9) + adcq $0x00, %rcx + leaq 192(%rsp), %rsi + movq %rsp, %r8 + movq (%r8), %rdx + subq (%rsi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rsi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rsi), %rdx + 
movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rsi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rsi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rsi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rsi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rsi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rsi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rsi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rsi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rsi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rsi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rsi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rsi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rsi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rsi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rsi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rsi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rsi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rsi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rsi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rsi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rsi), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + movq (%r8), %rdx + subq (%rdi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rdi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rdi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rdi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rdi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rdi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rdi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rdi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rdi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rdi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rdi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rdi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rdi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rdi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rdi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rdi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rdi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rdi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rdi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rdi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rdi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rdi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rdi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rdi), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + subq $0x60, %r9 + # Add in place + movq (%r9), %rdx + addq (%r8), %rdx + movq 8(%r9), %rax + movq %rdx, (%r9) + adcq 8(%r8), %rax + movq 16(%r9), %rdx + movq %rax, 8(%r9) + adcq 16(%r8), %rdx + movq 24(%r9), %rax + movq %rdx, 16(%r9) + adcq 24(%r8), %rax + movq 32(%r9), %rdx + movq %rax, 24(%r9) + adcq 32(%r8), %rdx + movq 40(%r9), %rax + movq %rdx, 32(%r9) + adcq 40(%r8), %rax + movq 48(%r9), %rdx + movq %rax, 40(%r9) + adcq 48(%r8), %rdx + movq 56(%r9), %rax + movq %rdx, 48(%r9) + adcq 56(%r8), %rax + movq 64(%r9), %rdx + movq %rax, 56(%r9) + adcq 
64(%r8), %rdx + movq 72(%r9), %rax + movq %rdx, 64(%r9) + adcq 72(%r8), %rax + movq 80(%r9), %rdx + movq %rax, 72(%r9) + adcq 80(%r8), %rdx + movq 88(%r9), %rax + movq %rdx, 80(%r9) + adcq 88(%r8), %rax + movq 96(%r9), %rdx + movq %rax, 88(%r9) + adcq 96(%r8), %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq 104(%r8), %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq 112(%r8), %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq 120(%r8), %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq 128(%r8), %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq 136(%r8), %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq 144(%r8), %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq 152(%r8), %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq 160(%r8), %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq 168(%r8), %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq 176(%r8), %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq 184(%r8), %rax + movq %rax, 184(%r9) + adcq $0x00, %rcx + movq %rcx, 288(%rdi) + # Add in place + movq 96(%r9), %rdx + addq (%rsi), %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq 8(%rsi), %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq 16(%rsi), %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq 24(%rsi), %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq 32(%rsi), %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq 40(%rsi), %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq 48(%rsi), %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq 56(%rsi), %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq 64(%rsi), %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq 72(%rsi), %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq 80(%rsi), %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq 88(%rsi), %rax + movq 192(%r9), %rdx + movq %rax, 184(%r9) + adcq 96(%rsi), %rdx + movq %rdx, 192(%r9) + # Add to zero + movq 104(%rsi), %rdx + adcq $0x00, %rdx + movq 112(%rsi), %rax + movq %rdx, 200(%r9) + adcq $0x00, %rax + movq 120(%rsi), %rdx + movq %rax, 208(%r9) + adcq $0x00, %rdx + movq 128(%rsi), %rax + movq %rdx, 216(%r9) + adcq $0x00, %rax + movq 136(%rsi), %rdx + movq %rax, 224(%r9) + adcq $0x00, %rdx + movq 144(%rsi), %rax + movq %rdx, 232(%r9) + adcq $0x00, %rax + movq 152(%rsi), %rdx + movq %rax, 240(%r9) + adcq $0x00, %rdx + movq 160(%rsi), %rax + movq %rdx, 248(%r9) + adcq $0x00, %rax + movq 168(%rsi), %rdx + movq %rax, 256(%r9) + adcq $0x00, %rdx + movq 176(%rsi), %rax + movq %rdx, 264(%r9) + adcq $0x00, %rax + movq 184(%rsi), %rdx + movq %rax, 272(%r9) + adcq $0x00, %rdx + movq %rdx, 280(%r9) + addq $0x1f8, %rsp + repz retq +#ifndef __APPLE__ +.size sp_3072_sqr_avx2_24,.-sp_3072_sqr_avx2_24 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ +/* Add a to a into r. (r = a + a) + * + * r A single precision integer. + * a A single precision integer. 
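Note (not part of the patch): sp_3072_sqr_24 and sp_3072_sqr_avx2_24 above appear to follow a one-level Karatsuba split, with the 24-word input treated as two 12-word halves, the halves added (carry saved), three 12-word squares computed via sp_3072_sqr_12 / sp_3072_sqr_avx2_12, and the pieces recombined with the subtract/shifted-add chains seen here. As a toy illustration only, the same identity on a 64-bit value split into 32-bit halves is sketched below; sqr64_karatsuba is a made-up name, the example assumes a gcc/clang-style unsigned __int128, and the real routines handle the half-sum carry with a masked fix-up rather than a wider temporary.

#include <assert.h>
#include <stdint.h>

/* Toy model of the split: a = a1*B + a0 with B = 2^32 here (2^(64*12) in the
 * generated code), and
 *   a^2 = a1^2*B^2 + ((a0 + a1)^2 - a0^2 - a1^2)*B + a0^2
 * so only three half-size squares are needed. */
static unsigned __int128 sqr64_karatsuba(uint64_t a)
{
    uint64_t a0 = (uint32_t)a;                          /* low half  */
    uint64_t a1 = a >> 32;                              /* high half */
    unsigned __int128 z0 = (unsigned __int128)a0 * a0;  /* low square   */
    unsigned __int128 z2 = (unsigned __int128)a1 * a1;  /* high square  */
    unsigned __int128 s  = (unsigned __int128)a0 + a1;  /* half-sum     */
    unsigned __int128 z1 = s * s - z0 - z2;             /* middle = 2*a0*a1 */
    return (z2 << 64) + (z1 << 32) + z0;
}

int main(void)
{
    uint64_t a = 0x123456789abcdef0ULL;
    assert(sqr64_karatsuba(a) == (unsigned __int128)a * a);
    return 0;
}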
+ */ +#ifndef __APPLE__ +.text +.globl sp_3072_dbl_24 +.type sp_3072_dbl_24,@function +.align 16 +sp_3072_dbl_24: +#else +.section __TEXT,__text +.globl _sp_3072_dbl_24 +.p2align 4 +_sp_3072_dbl_24: +#endif /* __APPLE__ */ + movq (%rsi), %rdx + xorq %rax, %rax + addq %rdx, %rdx + movq 8(%rsi), %rcx + movq %rdx, (%rdi) + adcq %rcx, %rcx + movq 16(%rsi), %rdx + movq %rcx, 8(%rdi) + adcq %rdx, %rdx + movq 24(%rsi), %rcx + movq %rdx, 16(%rdi) + adcq %rcx, %rcx + movq 32(%rsi), %rdx + movq %rcx, 24(%rdi) + adcq %rdx, %rdx + movq 40(%rsi), %rcx + movq %rdx, 32(%rdi) + adcq %rcx, %rcx + movq 48(%rsi), %rdx + movq %rcx, 40(%rdi) + adcq %rdx, %rdx + movq 56(%rsi), %rcx + movq %rdx, 48(%rdi) + adcq %rcx, %rcx + movq 64(%rsi), %rdx + movq %rcx, 56(%rdi) + adcq %rdx, %rdx + movq 72(%rsi), %rcx + movq %rdx, 64(%rdi) + adcq %rcx, %rcx + movq 80(%rsi), %rdx + movq %rcx, 72(%rdi) + adcq %rdx, %rdx + movq 88(%rsi), %rcx + movq %rdx, 80(%rdi) + adcq %rcx, %rcx + movq 96(%rsi), %rdx + movq %rcx, 88(%rdi) + adcq %rdx, %rdx + movq 104(%rsi), %rcx + movq %rdx, 96(%rdi) + adcq %rcx, %rcx + movq 112(%rsi), %rdx + movq %rcx, 104(%rdi) + adcq %rdx, %rdx + movq 120(%rsi), %rcx + movq %rdx, 112(%rdi) + adcq %rcx, %rcx + movq 128(%rsi), %rdx + movq %rcx, 120(%rdi) + adcq %rdx, %rdx + movq 136(%rsi), %rcx + movq %rdx, 128(%rdi) + adcq %rcx, %rcx + movq 144(%rsi), %rdx + movq %rcx, 136(%rdi) + adcq %rdx, %rdx + movq 152(%rsi), %rcx + movq %rdx, 144(%rdi) + adcq %rcx, %rcx + movq 160(%rsi), %rdx + movq %rcx, 152(%rdi) + adcq %rdx, %rdx + movq 168(%rsi), %rcx + movq %rdx, 160(%rdi) + adcq %rcx, %rcx + movq 176(%rsi), %rdx + movq %rcx, 168(%rdi) + adcq %rdx, %rdx + movq 184(%rsi), %rcx + movq %rdx, 176(%rdi) + adcq %rcx, %rcx + movq %rcx, 184(%rdi) + adcq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_3072_dbl_24,.-sp_3072_dbl_24 +#endif /* __APPLE__ */ +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
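Note (not part of the patch): sp_3072_dbl_24 above, like sp_2048_dbl_32 further down, is a plain ripple-carry doubling; each word is added to itself through an adcq chain and the final carry is returned. A portable sketch with illustrative names (word, dbl_n) follows.

#include <stdint.h>

/* Sketch of an n-word doubling with carry out, mirroring the adcq chain in
 * sp_3072_dbl_24; r may alias a. */
typedef uint64_t word;

static word dbl_n(word* r, const word* a, int n)
{
    word carry = 0;
    for (int i = 0; i < n; i++) {
        word t = a[i];
        r[i]  = (t << 1) | carry;   /* doubled word plus incoming carry   */
        carry = t >> 63;            /* bit shifted out becomes next carry */
    }
    return carry;                   /* the final "adcq $0x00, %rax"       */
}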
+ */ +#ifndef __APPLE__ +.text +.globl sp_3072_sqr_48 +.type sp_3072_sqr_48,@function +.align 16 +sp_3072_sqr_48: +#else +.section __TEXT,__text +.globl _sp_3072_sqr_48 +.p2align 4 +_sp_3072_sqr_48: +#endif /* __APPLE__ */ + subq $0x3d8, %rsp + movq %rdi, 960(%rsp) + movq %rsi, 968(%rsp) + leaq 768(%rsp), %r8 + leaq 192(%rsi), %r9 + # Add + movq (%rsi), %rdx + xorq %rcx, %rcx + addq (%r9), %rdx + movq 8(%rsi), %rax + movq %rdx, (%r8) + adcq 8(%r9), %rax + movq 16(%rsi), %rdx + movq %rax, 8(%r8) + adcq 16(%r9), %rdx + movq 24(%rsi), %rax + movq %rdx, 16(%r8) + adcq 24(%r9), %rax + movq 32(%rsi), %rdx + movq %rax, 24(%r8) + adcq 32(%r9), %rdx + movq 40(%rsi), %rax + movq %rdx, 32(%r8) + adcq 40(%r9), %rax + movq 48(%rsi), %rdx + movq %rax, 40(%r8) + adcq 48(%r9), %rdx + movq 56(%rsi), %rax + movq %rdx, 48(%r8) + adcq 56(%r9), %rax + movq 64(%rsi), %rdx + movq %rax, 56(%r8) + adcq 64(%r9), %rdx + movq 72(%rsi), %rax + movq %rdx, 64(%r8) + adcq 72(%r9), %rax + movq 80(%rsi), %rdx + movq %rax, 72(%r8) + adcq 80(%r9), %rdx + movq 88(%rsi), %rax + movq %rdx, 80(%r8) + adcq 88(%r9), %rax + movq 96(%rsi), %rdx + movq %rax, 88(%r8) + adcq 96(%r9), %rdx + movq 104(%rsi), %rax + movq %rdx, 96(%r8) + adcq 104(%r9), %rax + movq 112(%rsi), %rdx + movq %rax, 104(%r8) + adcq 112(%r9), %rdx + movq 120(%rsi), %rax + movq %rdx, 112(%r8) + adcq 120(%r9), %rax + movq 128(%rsi), %rdx + movq %rax, 120(%r8) + adcq 128(%r9), %rdx + movq 136(%rsi), %rax + movq %rdx, 128(%r8) + adcq 136(%r9), %rax + movq 144(%rsi), %rdx + movq %rax, 136(%r8) + adcq 144(%r9), %rdx + movq 152(%rsi), %rax + movq %rdx, 144(%r8) + adcq 152(%r9), %rax + movq 160(%rsi), %rdx + movq %rax, 152(%r8) + adcq 160(%r9), %rdx + movq 168(%rsi), %rax + movq %rdx, 160(%r8) + adcq 168(%r9), %rax + movq 176(%rsi), %rdx + movq %rax, 168(%r8) + adcq 176(%r9), %rdx + movq 184(%rsi), %rax + movq %rdx, 176(%r8) + adcq 184(%r9), %rax + movq %rax, 184(%r8) + adcq $0x00, %rcx + movq %rcx, 976(%rsp) + movq %r8, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_24@plt +#else + callq _sp_3072_sqr_24 +#endif /* __APPLE__ */ + movq 968(%rsp), %rsi + leaq 384(%rsp), %rdi + addq $0xc0, %rsi +#ifndef __APPLE__ + callq sp_3072_sqr_24@plt +#else + callq _sp_3072_sqr_24 +#endif /* __APPLE__ */ + movq 968(%rsp), %rsi + movq 960(%rsp), %rdi +#ifndef __APPLE__ + callq sp_3072_sqr_24@plt +#else + callq _sp_3072_sqr_24 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 968(%rsp), %rsi + movq 960(%rsp), %rdi +#endif /* _WIN64 */ + movq 976(%rsp), %r10 + movq %rdi, %r9 + leaq 768(%rsp), %r8 + movq %r10, %rcx + negq %r10 + addq $0x180, %r9 + movq (%r8), %rdx + movq 8(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, (%r9) + movq %rax, 8(%r9) + movq 16(%r8), %rdx + movq 24(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 16(%r9) + movq %rax, 24(%r9) + movq 32(%r8), %rdx + movq 40(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 32(%r9) + movq %rax, 40(%r9) + movq 48(%r8), %rdx + movq 56(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 48(%r9) + movq %rax, 56(%r9) + movq 64(%r8), %rdx + movq 72(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 64(%r9) + movq %rax, 72(%r9) + movq 80(%r8), %rdx + movq 88(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 80(%r9) + movq %rax, 88(%r9) + movq 96(%r8), %rdx + movq 104(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 96(%r9) + movq %rax, 104(%r9) + movq 112(%r8), %rdx + movq 120(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 112(%r9) + movq %rax, 
120(%r9) + movq 128(%r8), %rdx + movq 136(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 128(%r9) + movq %rax, 136(%r9) + movq 144(%r8), %rdx + movq 152(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 144(%r9) + movq %rax, 152(%r9) + movq 160(%r8), %rdx + movq 168(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 160(%r9) + movq %rax, 168(%r9) + movq 176(%r8), %rdx + movq 184(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 176(%r9) + movq %rax, 184(%r9) + movq (%r9), %rdx + addq %rdx, %rdx + movq 8(%r9), %rax + movq %rdx, (%r9) + adcq %rax, %rax + movq 16(%r9), %rdx + movq %rax, 8(%r9) + adcq %rdx, %rdx + movq 24(%r9), %rax + movq %rdx, 16(%r9) + adcq %rax, %rax + movq 32(%r9), %rdx + movq %rax, 24(%r9) + adcq %rdx, %rdx + movq 40(%r9), %rax + movq %rdx, 32(%r9) + adcq %rax, %rax + movq 48(%r9), %rdx + movq %rax, 40(%r9) + adcq %rdx, %rdx + movq 56(%r9), %rax + movq %rdx, 48(%r9) + adcq %rax, %rax + movq 64(%r9), %rdx + movq %rax, 56(%r9) + adcq %rdx, %rdx + movq 72(%r9), %rax + movq %rdx, 64(%r9) + adcq %rax, %rax + movq 80(%r9), %rdx + movq %rax, 72(%r9) + adcq %rdx, %rdx + movq 88(%r9), %rax + movq %rdx, 80(%r9) + adcq %rax, %rax + movq 96(%r9), %rdx + movq %rax, 88(%r9) + adcq %rdx, %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq %rax, %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq %rdx, %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq %rax, %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq %rdx, %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq %rax, %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq %rdx, %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq %rax, %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq %rdx, %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq %rax, %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq %rdx, %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq %rax, %rax + movq %rax, 184(%r9) + adcq $0x00, %rcx + leaq 384(%rsp), %rsi + movq %rsp, %r8 + movq (%r8), %rdx + subq (%rsi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rsi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rsi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rsi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rsi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rsi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rsi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rsi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rsi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rsi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rsi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rsi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rsi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rsi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rsi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rsi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rsi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rsi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rsi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rsi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rsi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rsi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rsi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rsi), 
%rax + movq 192(%r8), %rdx + movq %rax, 184(%r8) + sbbq 192(%rsi), %rdx + movq 200(%r8), %rax + movq %rdx, 192(%r8) + sbbq 200(%rsi), %rax + movq 208(%r8), %rdx + movq %rax, 200(%r8) + sbbq 208(%rsi), %rdx + movq 216(%r8), %rax + movq %rdx, 208(%r8) + sbbq 216(%rsi), %rax + movq 224(%r8), %rdx + movq %rax, 216(%r8) + sbbq 224(%rsi), %rdx + movq 232(%r8), %rax + movq %rdx, 224(%r8) + sbbq 232(%rsi), %rax + movq 240(%r8), %rdx + movq %rax, 232(%r8) + sbbq 240(%rsi), %rdx + movq 248(%r8), %rax + movq %rdx, 240(%r8) + sbbq 248(%rsi), %rax + movq 256(%r8), %rdx + movq %rax, 248(%r8) + sbbq 256(%rsi), %rdx + movq 264(%r8), %rax + movq %rdx, 256(%r8) + sbbq 264(%rsi), %rax + movq 272(%r8), %rdx + movq %rax, 264(%r8) + sbbq 272(%rsi), %rdx + movq 280(%r8), %rax + movq %rdx, 272(%r8) + sbbq 280(%rsi), %rax + movq 288(%r8), %rdx + movq %rax, 280(%r8) + sbbq 288(%rsi), %rdx + movq 296(%r8), %rax + movq %rdx, 288(%r8) + sbbq 296(%rsi), %rax + movq 304(%r8), %rdx + movq %rax, 296(%r8) + sbbq 304(%rsi), %rdx + movq 312(%r8), %rax + movq %rdx, 304(%r8) + sbbq 312(%rsi), %rax + movq 320(%r8), %rdx + movq %rax, 312(%r8) + sbbq 320(%rsi), %rdx + movq 328(%r8), %rax + movq %rdx, 320(%r8) + sbbq 328(%rsi), %rax + movq 336(%r8), %rdx + movq %rax, 328(%r8) + sbbq 336(%rsi), %rdx + movq 344(%r8), %rax + movq %rdx, 336(%r8) + sbbq 344(%rsi), %rax + movq 352(%r8), %rdx + movq %rax, 344(%r8) + sbbq 352(%rsi), %rdx + movq 360(%r8), %rax + movq %rdx, 352(%r8) + sbbq 360(%rsi), %rax + movq 368(%r8), %rdx + movq %rax, 360(%r8) + sbbq 368(%rsi), %rdx + movq 376(%r8), %rax + movq %rdx, 368(%r8) + sbbq 376(%rsi), %rax + movq %rax, 376(%r8) + sbbq $0x00, %rcx + movq (%r8), %rdx + subq (%rdi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rdi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rdi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rdi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rdi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rdi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rdi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rdi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rdi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rdi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rdi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rdi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rdi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rdi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rdi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rdi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rdi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rdi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rdi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rdi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rdi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rdi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rdi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rdi), %rax + movq 192(%r8), %rdx + movq %rax, 184(%r8) + sbbq 192(%rdi), %rdx + movq 200(%r8), %rax + movq %rdx, 192(%r8) + sbbq 200(%rdi), %rax + movq 208(%r8), %rdx + movq %rax, 200(%r8) + sbbq 208(%rdi), %rdx + movq 216(%r8), %rax + movq %rdx, 208(%r8) + sbbq 216(%rdi), %rax + movq 224(%r8), %rdx + movq %rax, 216(%r8) + sbbq 224(%rdi), %rdx + movq 232(%r8), %rax + 
movq %rdx, 224(%r8) + sbbq 232(%rdi), %rax + movq 240(%r8), %rdx + movq %rax, 232(%r8) + sbbq 240(%rdi), %rdx + movq 248(%r8), %rax + movq %rdx, 240(%r8) + sbbq 248(%rdi), %rax + movq 256(%r8), %rdx + movq %rax, 248(%r8) + sbbq 256(%rdi), %rdx + movq 264(%r8), %rax + movq %rdx, 256(%r8) + sbbq 264(%rdi), %rax + movq 272(%r8), %rdx + movq %rax, 264(%r8) + sbbq 272(%rdi), %rdx + movq 280(%r8), %rax + movq %rdx, 272(%r8) + sbbq 280(%rdi), %rax + movq 288(%r8), %rdx + movq %rax, 280(%r8) + sbbq 288(%rdi), %rdx + movq 296(%r8), %rax + movq %rdx, 288(%r8) + sbbq 296(%rdi), %rax + movq 304(%r8), %rdx + movq %rax, 296(%r8) + sbbq 304(%rdi), %rdx + movq 312(%r8), %rax + movq %rdx, 304(%r8) + sbbq 312(%rdi), %rax + movq 320(%r8), %rdx + movq %rax, 312(%r8) + sbbq 320(%rdi), %rdx + movq 328(%r8), %rax + movq %rdx, 320(%r8) + sbbq 328(%rdi), %rax + movq 336(%r8), %rdx + movq %rax, 328(%r8) + sbbq 336(%rdi), %rdx + movq 344(%r8), %rax + movq %rdx, 336(%r8) + sbbq 344(%rdi), %rax + movq 352(%r8), %rdx + movq %rax, 344(%r8) + sbbq 352(%rdi), %rdx + movq 360(%r8), %rax + movq %rdx, 352(%r8) + sbbq 360(%rdi), %rax + movq 368(%r8), %rdx + movq %rax, 360(%r8) + sbbq 368(%rdi), %rdx + movq 376(%r8), %rax + movq %rdx, 368(%r8) + sbbq 376(%rdi), %rax + movq %rax, 376(%r8) + sbbq $0x00, %rcx + subq $0xc0, %r9 + # Add in place + movq (%r9), %rdx + addq (%r8), %rdx + movq 8(%r9), %rax + movq %rdx, (%r9) + adcq 8(%r8), %rax + movq 16(%r9), %rdx + movq %rax, 8(%r9) + adcq 16(%r8), %rdx + movq 24(%r9), %rax + movq %rdx, 16(%r9) + adcq 24(%r8), %rax + movq 32(%r9), %rdx + movq %rax, 24(%r9) + adcq 32(%r8), %rdx + movq 40(%r9), %rax + movq %rdx, 32(%r9) + adcq 40(%r8), %rax + movq 48(%r9), %rdx + movq %rax, 40(%r9) + adcq 48(%r8), %rdx + movq 56(%r9), %rax + movq %rdx, 48(%r9) + adcq 56(%r8), %rax + movq 64(%r9), %rdx + movq %rax, 56(%r9) + adcq 64(%r8), %rdx + movq 72(%r9), %rax + movq %rdx, 64(%r9) + adcq 72(%r8), %rax + movq 80(%r9), %rdx + movq %rax, 72(%r9) + adcq 80(%r8), %rdx + movq 88(%r9), %rax + movq %rdx, 80(%r9) + adcq 88(%r8), %rax + movq 96(%r9), %rdx + movq %rax, 88(%r9) + adcq 96(%r8), %rdx + movq 104(%r9), %rax + movq %rdx, 96(%r9) + adcq 104(%r8), %rax + movq 112(%r9), %rdx + movq %rax, 104(%r9) + adcq 112(%r8), %rdx + movq 120(%r9), %rax + movq %rdx, 112(%r9) + adcq 120(%r8), %rax + movq 128(%r9), %rdx + movq %rax, 120(%r9) + adcq 128(%r8), %rdx + movq 136(%r9), %rax + movq %rdx, 128(%r9) + adcq 136(%r8), %rax + movq 144(%r9), %rdx + movq %rax, 136(%r9) + adcq 144(%r8), %rdx + movq 152(%r9), %rax + movq %rdx, 144(%r9) + adcq 152(%r8), %rax + movq 160(%r9), %rdx + movq %rax, 152(%r9) + adcq 160(%r8), %rdx + movq 168(%r9), %rax + movq %rdx, 160(%r9) + adcq 168(%r8), %rax + movq 176(%r9), %rdx + movq %rax, 168(%r9) + adcq 176(%r8), %rdx + movq 184(%r9), %rax + movq %rdx, 176(%r9) + adcq 184(%r8), %rax + movq 192(%r9), %rdx + movq %rax, 184(%r9) + adcq 192(%r8), %rdx + movq 200(%r9), %rax + movq %rdx, 192(%r9) + adcq 200(%r8), %rax + movq 208(%r9), %rdx + movq %rax, 200(%r9) + adcq 208(%r8), %rdx + movq 216(%r9), %rax + movq %rdx, 208(%r9) + adcq 216(%r8), %rax + movq 224(%r9), %rdx + movq %rax, 216(%r9) + adcq 224(%r8), %rdx + movq 232(%r9), %rax + movq %rdx, 224(%r9) + adcq 232(%r8), %rax + movq 240(%r9), %rdx + movq %rax, 232(%r9) + adcq 240(%r8), %rdx + movq 248(%r9), %rax + movq %rdx, 240(%r9) + adcq 248(%r8), %rax + movq 256(%r9), %rdx + movq %rax, 248(%r9) + adcq 256(%r8), %rdx + movq 264(%r9), %rax + movq %rdx, 256(%r9) + adcq 264(%r8), %rax + movq 272(%r9), %rdx + movq %rax, 264(%r9) + adcq 
272(%r8), %rdx + movq 280(%r9), %rax + movq %rdx, 272(%r9) + adcq 280(%r8), %rax + movq 288(%r9), %rdx + movq %rax, 280(%r9) + adcq 288(%r8), %rdx + movq 296(%r9), %rax + movq %rdx, 288(%r9) + adcq 296(%r8), %rax + movq 304(%r9), %rdx + movq %rax, 296(%r9) + adcq 304(%r8), %rdx + movq 312(%r9), %rax + movq %rdx, 304(%r9) + adcq 312(%r8), %rax + movq 320(%r9), %rdx + movq %rax, 312(%r9) + adcq 320(%r8), %rdx + movq 328(%r9), %rax + movq %rdx, 320(%r9) + adcq 328(%r8), %rax + movq 336(%r9), %rdx + movq %rax, 328(%r9) + adcq 336(%r8), %rdx + movq 344(%r9), %rax + movq %rdx, 336(%r9) + adcq 344(%r8), %rax + movq 352(%r9), %rdx + movq %rax, 344(%r9) + adcq 352(%r8), %rdx + movq 360(%r9), %rax + movq %rdx, 352(%r9) + adcq 360(%r8), %rax + movq 368(%r9), %rdx + movq %rax, 360(%r9) + adcq 368(%r8), %rdx + movq 376(%r9), %rax + movq %rdx, 368(%r9) + adcq 376(%r8), %rax + movq %rax, 376(%r9) + adcq $0x00, %rcx + movq %rcx, 576(%rdi) + # Add in place + movq 192(%r9), %rdx + addq (%rsi), %rdx + movq 200(%r9), %rax + movq %rdx, 192(%r9) + adcq 8(%rsi), %rax + movq 208(%r9), %rdx + movq %rax, 200(%r9) + adcq 16(%rsi), %rdx + movq 216(%r9), %rax + movq %rdx, 208(%r9) + adcq 24(%rsi), %rax + movq 224(%r9), %rdx + movq %rax, 216(%r9) + adcq 32(%rsi), %rdx + movq 232(%r9), %rax + movq %rdx, 224(%r9) + adcq 40(%rsi), %rax + movq 240(%r9), %rdx + movq %rax, 232(%r9) + adcq 48(%rsi), %rdx + movq 248(%r9), %rax + movq %rdx, 240(%r9) + adcq 56(%rsi), %rax + movq 256(%r9), %rdx + movq %rax, 248(%r9) + adcq 64(%rsi), %rdx + movq 264(%r9), %rax + movq %rdx, 256(%r9) + adcq 72(%rsi), %rax + movq 272(%r9), %rdx + movq %rax, 264(%r9) + adcq 80(%rsi), %rdx + movq 280(%r9), %rax + movq %rdx, 272(%r9) + adcq 88(%rsi), %rax + movq 288(%r9), %rdx + movq %rax, 280(%r9) + adcq 96(%rsi), %rdx + movq 296(%r9), %rax + movq %rdx, 288(%r9) + adcq 104(%rsi), %rax + movq 304(%r9), %rdx + movq %rax, 296(%r9) + adcq 112(%rsi), %rdx + movq 312(%r9), %rax + movq %rdx, 304(%r9) + adcq 120(%rsi), %rax + movq 320(%r9), %rdx + movq %rax, 312(%r9) + adcq 128(%rsi), %rdx + movq 328(%r9), %rax + movq %rdx, 320(%r9) + adcq 136(%rsi), %rax + movq 336(%r9), %rdx + movq %rax, 328(%r9) + adcq 144(%rsi), %rdx + movq 344(%r9), %rax + movq %rdx, 336(%r9) + adcq 152(%rsi), %rax + movq 352(%r9), %rdx + movq %rax, 344(%r9) + adcq 160(%rsi), %rdx + movq 360(%r9), %rax + movq %rdx, 352(%r9) + adcq 168(%rsi), %rax + movq 368(%r9), %rdx + movq %rax, 360(%r9) + adcq 176(%rsi), %rdx + movq 376(%r9), %rax + movq %rdx, 368(%r9) + adcq 184(%rsi), %rax + movq 384(%r9), %rdx + movq %rax, 376(%r9) + adcq 192(%rsi), %rdx + movq %rdx, 384(%r9) + # Add to zero + movq 200(%rsi), %rdx + adcq $0x00, %rdx + movq 208(%rsi), %rax + movq %rdx, 392(%r9) + adcq $0x00, %rax + movq 216(%rsi), %rdx + movq %rax, 400(%r9) + adcq $0x00, %rdx + movq 224(%rsi), %rax + movq %rdx, 408(%r9) + adcq $0x00, %rax + movq 232(%rsi), %rdx + movq %rax, 416(%r9) + adcq $0x00, %rdx + movq 240(%rsi), %rax + movq %rdx, 424(%r9) + adcq $0x00, %rax + movq 248(%rsi), %rdx + movq %rax, 432(%r9) + adcq $0x00, %rdx + movq 256(%rsi), %rax + movq %rdx, 440(%r9) + adcq $0x00, %rax + movq 264(%rsi), %rdx + movq %rax, 448(%r9) + adcq $0x00, %rdx + movq 272(%rsi), %rax + movq %rdx, 456(%r9) + adcq $0x00, %rax + movq 280(%rsi), %rdx + movq %rax, 464(%r9) + adcq $0x00, %rdx + movq 288(%rsi), %rax + movq %rdx, 472(%r9) + adcq $0x00, %rax + movq 296(%rsi), %rdx + movq %rax, 480(%r9) + adcq $0x00, %rdx + movq 304(%rsi), %rax + movq %rdx, 488(%r9) + adcq $0x00, %rax + movq 312(%rsi), %rdx + movq %rax, 496(%r9) + adcq 
$0x00, %rdx + movq 320(%rsi), %rax + movq %rdx, 504(%r9) + adcq $0x00, %rax + movq 328(%rsi), %rdx + movq %rax, 512(%r9) + adcq $0x00, %rdx + movq 336(%rsi), %rax + movq %rdx, 520(%r9) + adcq $0x00, %rax + movq 344(%rsi), %rdx + movq %rax, 528(%r9) + adcq $0x00, %rdx + movq 352(%rsi), %rax + movq %rdx, 536(%r9) + adcq $0x00, %rax + movq 360(%rsi), %rdx + movq %rax, 544(%r9) + adcq $0x00, %rdx + movq 368(%rsi), %rax + movq %rdx, 552(%r9) + adcq $0x00, %rax + movq 376(%rsi), %rdx + movq %rax, 560(%r9) + adcq $0x00, %rdx + movq %rdx, 568(%r9) + addq $0x3d8, %rsp + repz retq +#ifndef __APPLE__ +.size sp_3072_sqr_48,.-sp_3072_sqr_48 +#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) * @@ -30433,1254 +30433,6 @@ _sp_4096_mul_64: #ifndef __APPLE__ .size sp_4096_mul_64,.-sp_4096_mul_64 #endif /* __APPLE__ */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_dbl_32 -.type sp_2048_dbl_32,@function -.align 16 -sp_2048_dbl_32: -#else -.section __TEXT,__text -.globl _sp_2048_dbl_32 -.p2align 4 -_sp_2048_dbl_32: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq 96(%rsi), %rdx - movq %rcx, 88(%rdi) - adcq %rdx, %rdx - movq 104(%rsi), %rcx - movq %rdx, 96(%rdi) - adcq %rcx, %rcx - movq 112(%rsi), %rdx - movq %rcx, 104(%rdi) - adcq %rdx, %rdx - movq 120(%rsi), %rcx - movq %rdx, 112(%rdi) - adcq %rcx, %rcx - movq 128(%rsi), %rdx - movq %rcx, 120(%rdi) - adcq %rdx, %rdx - movq 136(%rsi), %rcx - movq %rdx, 128(%rdi) - adcq %rcx, %rcx - movq 144(%rsi), %rdx - movq %rcx, 136(%rdi) - adcq %rdx, %rdx - movq 152(%rsi), %rcx - movq %rdx, 144(%rdi) - adcq %rcx, %rcx - movq 160(%rsi), %rdx - movq %rcx, 152(%rdi) - adcq %rdx, %rdx - movq 168(%rsi), %rcx - movq %rdx, 160(%rdi) - adcq %rcx, %rcx - movq 176(%rsi), %rdx - movq %rcx, 168(%rdi) - adcq %rdx, %rdx - movq 184(%rsi), %rcx - movq %rdx, 176(%rdi) - adcq %rcx, %rcx - movq 192(%rsi), %rdx - movq %rcx, 184(%rdi) - adcq %rdx, %rdx - movq 200(%rsi), %rcx - movq %rdx, 192(%rdi) - adcq %rcx, %rcx - movq 208(%rsi), %rdx - movq %rcx, 200(%rdi) - adcq %rdx, %rdx - movq 216(%rsi), %rcx - movq %rdx, 208(%rdi) - adcq %rcx, %rcx - movq 224(%rsi), %rdx - movq %rcx, 216(%rdi) - adcq %rdx, %rdx - movq 232(%rsi), %rcx - movq %rdx, 224(%rdi) - adcq %rcx, %rcx - movq 240(%rsi), %rdx - movq %rcx, 232(%rdi) - adcq %rdx, %rdx - movq 248(%rsi), %rcx - movq %rdx, 240(%rdi) - adcq %rcx, %rcx - movq %rcx, 248(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_dbl_32,.-sp_2048_dbl_32 -#endif /* __APPLE__ */ -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
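Note (not part of the patch): because the half-sum a0 + a1 can overflow by one bit, the squaring routines above keep that carry and apply the correction without branching: negq turns the 0/1 carry into an all-zero/all-one mask, each word of the half-sum is kept or zeroed under that mask (andq here, pextq in the BMI2/AVX2 variant), and the masked copy is then doubled and folded into the result. A small C sketch of just the masking step, with illustrative names:

#include <stdint.h>

/* Branch-free conditional keep: bit is the half-sum carry (0 or 1); the mask
 * is all-zeros or all-ones, as built by "negq %r10" in the assembly. */
static void masked_copy_n(uint64_t* r, const uint64_t* a, uint64_t bit, int n)
{
    uint64_t mask = (uint64_t)0 - bit;   /* 0 -> 0x00...0, 1 -> 0xff...f */
    for (int i = 0; i < n; i++)
        r[i] = a[i] & mask;              /* keep a[i] only when bit == 1 */
}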
- */ -#ifndef __APPLE__ -.text -.globl sp_4096_sqr_64 -.type sp_4096_sqr_64,@function -.align 16 -sp_4096_sqr_64: -#else -.section __TEXT,__text -.globl _sp_4096_sqr_64 -.p2align 4 -_sp_4096_sqr_64: -#endif /* __APPLE__ */ - subq $0x518, %rsp - movq %rdi, 1280(%rsp) - movq %rsi, 1288(%rsp) - leaq 1024(%rsp), %r8 - leaq 256(%rsi), %r9 - # Add - movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx - movq 8(%rsi), %rax - movq %rdx, (%r8) - adcq 8(%r9), %rax - movq 16(%rsi), %rdx - movq %rax, 8(%r8) - adcq 16(%r9), %rdx - movq 24(%rsi), %rax - movq %rdx, 16(%r8) - adcq 24(%r9), %rax - movq 32(%rsi), %rdx - movq %rax, 24(%r8) - adcq 32(%r9), %rdx - movq 40(%rsi), %rax - movq %rdx, 32(%r8) - adcq 40(%r9), %rax - movq 48(%rsi), %rdx - movq %rax, 40(%r8) - adcq 48(%r9), %rdx - movq 56(%rsi), %rax - movq %rdx, 48(%r8) - adcq 56(%r9), %rax - movq 64(%rsi), %rdx - movq %rax, 56(%r8) - adcq 64(%r9), %rdx - movq 72(%rsi), %rax - movq %rdx, 64(%r8) - adcq 72(%r9), %rax - movq 80(%rsi), %rdx - movq %rax, 72(%r8) - adcq 80(%r9), %rdx - movq 88(%rsi), %rax - movq %rdx, 80(%r8) - adcq 88(%r9), %rax - movq 96(%rsi), %rdx - movq %rax, 88(%r8) - adcq 96(%r9), %rdx - movq 104(%rsi), %rax - movq %rdx, 96(%r8) - adcq 104(%r9), %rax - movq 112(%rsi), %rdx - movq %rax, 104(%r8) - adcq 112(%r9), %rdx - movq 120(%rsi), %rax - movq %rdx, 112(%r8) - adcq 120(%r9), %rax - movq 128(%rsi), %rdx - movq %rax, 120(%r8) - adcq 128(%r9), %rdx - movq 136(%rsi), %rax - movq %rdx, 128(%r8) - adcq 136(%r9), %rax - movq 144(%rsi), %rdx - movq %rax, 136(%r8) - adcq 144(%r9), %rdx - movq 152(%rsi), %rax - movq %rdx, 144(%r8) - adcq 152(%r9), %rax - movq 160(%rsi), %rdx - movq %rax, 152(%r8) - adcq 160(%r9), %rdx - movq 168(%rsi), %rax - movq %rdx, 160(%r8) - adcq 168(%r9), %rax - movq 176(%rsi), %rdx - movq %rax, 168(%r8) - adcq 176(%r9), %rdx - movq 184(%rsi), %rax - movq %rdx, 176(%r8) - adcq 184(%r9), %rax - movq 192(%rsi), %rdx - movq %rax, 184(%r8) - adcq 192(%r9), %rdx - movq 200(%rsi), %rax - movq %rdx, 192(%r8) - adcq 200(%r9), %rax - movq 208(%rsi), %rdx - movq %rax, 200(%r8) - adcq 208(%r9), %rdx - movq 216(%rsi), %rax - movq %rdx, 208(%r8) - adcq 216(%r9), %rax - movq 224(%rsi), %rdx - movq %rax, 216(%r8) - adcq 224(%r9), %rdx - movq 232(%rsi), %rax - movq %rdx, 224(%r8) - adcq 232(%r9), %rax - movq 240(%rsi), %rdx - movq %rax, 232(%r8) - adcq 240(%r9), %rdx - movq 248(%rsi), %rax - movq %rdx, 240(%r8) - adcq 248(%r9), %rax - movq %rax, 248(%r8) - adcq $0x00, %rcx - movq %rcx, 1296(%rsp) - movq %r8, %rsi - movq %rsp, %rdi -#ifndef __APPLE__ - callq sp_2048_sqr_32@plt -#else - callq _sp_2048_sqr_32 -#endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - leaq 512(%rsp), %rdi - addq $0x100, %rsi -#ifndef __APPLE__ - callq sp_2048_sqr_32@plt -#else - callq _sp_2048_sqr_32 -#endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi -#ifndef __APPLE__ - callq sp_2048_sqr_32@plt -#else - callq _sp_2048_sqr_32 -#endif /* __APPLE__ */ -#ifdef _WIN64 - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi -#endif /* _WIN64 */ - movq 1296(%rsp), %r10 - leaq 1024(%rsp), %r8 - movq %r10, %rcx - negq %r10 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 512(%rdi) - movq %rax, 520(%rdi) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 528(%rdi) - movq %rax, 536(%rdi) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 544(%rdi) - movq %rax, 552(%rdi) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq 
%r10, %rax - movq %rdx, 560(%rdi) - movq %rax, 568(%rdi) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 576(%rdi) - movq %rax, 584(%rdi) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 592(%rdi) - movq %rax, 600(%rdi) - movq 96(%r8), %rdx - movq 104(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 608(%rdi) - movq %rax, 616(%rdi) - movq 112(%r8), %rdx - movq 120(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 624(%rdi) - movq %rax, 632(%rdi) - movq 128(%r8), %rdx - movq 136(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 640(%rdi) - movq %rax, 648(%rdi) - movq 144(%r8), %rdx - movq 152(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 656(%rdi) - movq %rax, 664(%rdi) - movq 160(%r8), %rdx - movq 168(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 672(%rdi) - movq %rax, 680(%rdi) - movq 176(%r8), %rdx - movq 184(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 688(%rdi) - movq %rax, 696(%rdi) - movq 192(%r8), %rdx - movq 200(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 704(%rdi) - movq %rax, 712(%rdi) - movq 208(%r8), %rdx - movq 216(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 720(%rdi) - movq %rax, 728(%rdi) - movq 224(%r8), %rdx - movq 232(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 736(%rdi) - movq %rax, 744(%rdi) - movq 240(%r8), %rdx - movq 248(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 752(%rdi) - movq %rax, 760(%rdi) - movq 512(%rdi), %rdx - addq %rdx, %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq %rax, %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq %rdx, %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq %rax, %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq %rdx, %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq %rax, %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq %rdx, %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq %rax, %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq %rdx, %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq %rax, %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq %rdx, %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq %rax, %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq %rdx, %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq %rax, %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq %rdx, %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq %rax, %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq %rdx, %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq %rax, %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq %rdx, %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq %rax, %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq %rdx, %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq %rax, %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq %rdx, %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq %rax, %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq %rdx, %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq %rax, %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq %rdx, %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq %rax, %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq %rdx, %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq %rax, %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq %rdx, %rdx - movq 760(%rdi), %rax - movq 
%rdx, 752(%rdi) - adcq %rax, %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - leaq 512(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rsi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rsi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rsi), %rdx - movq 408(%r8), %rax - movq %rdx, 400(%r8) - sbbq 408(%rsi), %rax - movq 416(%r8), %rdx - 
movq %rax, 408(%r8) - sbbq 416(%rsi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rsi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rsi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rsi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rsi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rsi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rsi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rsi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rsi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rsi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rsi), %rdx - movq 504(%r8), %rax - movq %rdx, 496(%r8) - sbbq 504(%rsi), %rax - movq %rax, 504(%r8) - sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 
328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rdi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rdi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rdi), %rdx - movq 408(%r8), %rax - movq %rdx, 400(%r8) - sbbq 408(%rdi), %rax - movq 416(%r8), %rdx - movq %rax, 408(%r8) - sbbq 416(%rdi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rdi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rdi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rdi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rdi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rdi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rdi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rdi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rdi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rdi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rdi), %rdx - movq 504(%r8), %rax - movq %rdx, 496(%r8) - sbbq 504(%rdi), %rax - movq %rax, 504(%r8) - sbbq $0x00, %rcx - # Add in place - movq 256(%rdi), %rdx - addq (%r8), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 8(%r8), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 16(%r8), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 24(%r8), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 32(%r8), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 40(%r8), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 48(%r8), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 56(%r8), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 64(%r8), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 72(%r8), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 80(%r8), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 88(%r8), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 96(%r8), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 104(%r8), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 112(%r8), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 120(%r8), %rax - movq 384(%rdi), %rdx - movq %rax, 376(%rdi) - adcq 128(%r8), %rdx - movq 392(%rdi), %rax - movq %rdx, 384(%rdi) - adcq 136(%r8), %rax - movq 400(%rdi), %rdx - movq %rax, 392(%rdi) - adcq 144(%r8), %rdx - movq 408(%rdi), %rax - movq %rdx, 400(%rdi) - adcq 152(%r8), %rax - movq 416(%rdi), %rdx - movq %rax, 408(%rdi) - adcq 160(%r8), %rdx - movq 424(%rdi), %rax - movq %rdx, 416(%rdi) - adcq 168(%r8), %rax - movq 432(%rdi), %rdx - movq %rax, 424(%rdi) - adcq 176(%r8), %rdx - movq 440(%rdi), %rax - movq %rdx, 432(%rdi) - adcq 184(%r8), %rax - movq 448(%rdi), %rdx - movq %rax, 440(%rdi) - adcq 192(%r8), %rdx - movq 456(%rdi), %rax - movq %rdx, 448(%rdi) - adcq 200(%r8), %rax - movq 464(%rdi), %rdx - movq %rax, 456(%rdi) - adcq 208(%r8), %rdx - movq 472(%rdi), %rax - movq %rdx, 464(%rdi) - adcq 216(%r8), %rax - movq 480(%rdi), %rdx - movq %rax, 472(%rdi) - adcq 224(%r8), %rdx - movq 488(%rdi), %rax - movq %rdx, 480(%rdi) - adcq 232(%r8), %rax - 
movq 496(%rdi), %rdx - movq %rax, 488(%rdi) - adcq 240(%r8), %rdx - movq 504(%rdi), %rax - movq %rdx, 496(%rdi) - adcq 248(%r8), %rax - movq 512(%rdi), %rdx - movq %rax, 504(%rdi) - adcq 256(%r8), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 264(%r8), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 272(%r8), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 280(%r8), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq 288(%r8), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 296(%r8), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 304(%r8), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 312(%r8), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 320(%r8), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 328(%r8), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 336(%r8), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 344(%r8), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 352(%r8), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 360(%r8), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 368(%r8), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 376(%r8), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 384(%r8), %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 392(%r8), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 400(%r8), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 408(%r8), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 416(%r8), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 424(%r8), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 432(%r8), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 440(%r8), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 448(%r8), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 456(%r8), %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 464(%r8), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 472(%r8), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 480(%r8), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 488(%r8), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 496(%r8), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 504(%r8), %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - movq %rcx, 768(%rdi) - # Add in place - movq 512(%rdi), %rdx - xorq %rcx, %rcx - addq (%rsi), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 8(%rsi), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 16(%rsi), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 24(%rsi), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq 32(%rsi), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 40(%rsi), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 48(%rsi), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 56(%rsi), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 64(%rsi), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 72(%rsi), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 80(%rsi), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 88(%rsi), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 96(%rsi), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 104(%rsi), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 112(%rsi), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 120(%rsi), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 128(%rsi), %rdx - movq 
648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 136(%rsi), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 144(%rsi), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 152(%rsi), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 160(%rsi), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 168(%rsi), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 176(%rsi), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 184(%rsi), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 192(%rsi), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 200(%rsi), %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 208(%rsi), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 216(%rsi), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 224(%rsi), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 232(%rsi), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 240(%rsi), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 248(%rsi), %rax - movq 768(%rdi), %rdx - movq %rax, 760(%rdi) - adcq 256(%rsi), %rdx - movq %rdx, 768(%rdi) - adcq $0x00, %rcx - # Add to zero - movq 264(%rsi), %rdx - adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 776(%rdi) - adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 784(%rdi) - adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 792(%rdi) - adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 800(%rdi) - adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 808(%rdi) - adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 816(%rdi) - adcq $0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 824(%rdi) - adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 832(%rdi) - adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 840(%rdi) - adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 848(%rdi) - adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 856(%rdi) - adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 864(%rdi) - adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 872(%rdi) - adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 880(%rdi) - adcq $0x00, %rdx - movq 384(%rsi), %rax - movq %rdx, 888(%rdi) - adcq $0x00, %rax - movq 392(%rsi), %rdx - movq %rax, 896(%rdi) - adcq $0x00, %rdx - movq 400(%rsi), %rax - movq %rdx, 904(%rdi) - adcq $0x00, %rax - movq 408(%rsi), %rdx - movq %rax, 912(%rdi) - adcq $0x00, %rdx - movq 416(%rsi), %rax - movq %rdx, 920(%rdi) - adcq $0x00, %rax - movq 424(%rsi), %rdx - movq %rax, 928(%rdi) - adcq $0x00, %rdx - movq 432(%rsi), %rax - movq %rdx, 936(%rdi) - adcq $0x00, %rax - movq 440(%rsi), %rdx - movq %rax, 944(%rdi) - adcq $0x00, %rdx - movq 448(%rsi), %rax - movq %rdx, 952(%rdi) - adcq $0x00, %rax - movq 456(%rsi), %rdx - movq %rax, 960(%rdi) - adcq $0x00, %rdx - movq 464(%rsi), %rax - movq %rdx, 968(%rdi) - adcq $0x00, %rax - movq 472(%rsi), %rdx - movq %rax, 976(%rdi) - adcq $0x00, %rdx - movq 480(%rsi), %rax - movq %rdx, 984(%rdi) - adcq $0x00, %rax - movq 488(%rsi), %rdx - movq %rax, 992(%rdi) - adcq $0x00, %rdx - movq 496(%rsi), %rax - movq %rdx, 1000(%rdi) - adcq $0x00, %rax - movq 504(%rsi), %rdx - movq %rax, 1008(%rdi) - adcq $0x00, %rdx - movq %rdx, 1016(%rdi) - addq $0x518, %rsp - repz retq -#ifndef __APPLE__ -.size sp_4096_sqr_64,.-sp_4096_sqr_64 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Multiply a and b into r. (r = a * b) * @@ -32937,6 +31689,1254 @@ _sp_4096_mul_avx2_64: .size sp_4096_mul_avx2_64,.-sp_4096_mul_avx2_64 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +/* Add a to a into r. 
(r = a + a) + * + * r A single precision integer. + * a A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_2048_dbl_32 +.type sp_2048_dbl_32,@function +.align 16 +sp_2048_dbl_32: +#else +.section __TEXT,__text +.globl _sp_2048_dbl_32 +.p2align 4 +_sp_2048_dbl_32: +#endif /* __APPLE__ */ + movq (%rsi), %rdx + xorq %rax, %rax + addq %rdx, %rdx + movq 8(%rsi), %rcx + movq %rdx, (%rdi) + adcq %rcx, %rcx + movq 16(%rsi), %rdx + movq %rcx, 8(%rdi) + adcq %rdx, %rdx + movq 24(%rsi), %rcx + movq %rdx, 16(%rdi) + adcq %rcx, %rcx + movq 32(%rsi), %rdx + movq %rcx, 24(%rdi) + adcq %rdx, %rdx + movq 40(%rsi), %rcx + movq %rdx, 32(%rdi) + adcq %rcx, %rcx + movq 48(%rsi), %rdx + movq %rcx, 40(%rdi) + adcq %rdx, %rdx + movq 56(%rsi), %rcx + movq %rdx, 48(%rdi) + adcq %rcx, %rcx + movq 64(%rsi), %rdx + movq %rcx, 56(%rdi) + adcq %rdx, %rdx + movq 72(%rsi), %rcx + movq %rdx, 64(%rdi) + adcq %rcx, %rcx + movq 80(%rsi), %rdx + movq %rcx, 72(%rdi) + adcq %rdx, %rdx + movq 88(%rsi), %rcx + movq %rdx, 80(%rdi) + adcq %rcx, %rcx + movq 96(%rsi), %rdx + movq %rcx, 88(%rdi) + adcq %rdx, %rdx + movq 104(%rsi), %rcx + movq %rdx, 96(%rdi) + adcq %rcx, %rcx + movq 112(%rsi), %rdx + movq %rcx, 104(%rdi) + adcq %rdx, %rdx + movq 120(%rsi), %rcx + movq %rdx, 112(%rdi) + adcq %rcx, %rcx + movq 128(%rsi), %rdx + movq %rcx, 120(%rdi) + adcq %rdx, %rdx + movq 136(%rsi), %rcx + movq %rdx, 128(%rdi) + adcq %rcx, %rcx + movq 144(%rsi), %rdx + movq %rcx, 136(%rdi) + adcq %rdx, %rdx + movq 152(%rsi), %rcx + movq %rdx, 144(%rdi) + adcq %rcx, %rcx + movq 160(%rsi), %rdx + movq %rcx, 152(%rdi) + adcq %rdx, %rdx + movq 168(%rsi), %rcx + movq %rdx, 160(%rdi) + adcq %rcx, %rcx + movq 176(%rsi), %rdx + movq %rcx, 168(%rdi) + adcq %rdx, %rdx + movq 184(%rsi), %rcx + movq %rdx, 176(%rdi) + adcq %rcx, %rcx + movq 192(%rsi), %rdx + movq %rcx, 184(%rdi) + adcq %rdx, %rdx + movq 200(%rsi), %rcx + movq %rdx, 192(%rdi) + adcq %rcx, %rcx + movq 208(%rsi), %rdx + movq %rcx, 200(%rdi) + adcq %rdx, %rdx + movq 216(%rsi), %rcx + movq %rdx, 208(%rdi) + adcq %rcx, %rcx + movq 224(%rsi), %rdx + movq %rcx, 216(%rdi) + adcq %rdx, %rdx + movq 232(%rsi), %rcx + movq %rdx, 224(%rdi) + adcq %rcx, %rcx + movq 240(%rsi), %rdx + movq %rcx, 232(%rdi) + adcq %rdx, %rdx + movq 248(%rsi), %rcx + movq %rdx, 240(%rdi) + adcq %rcx, %rcx + movq %rcx, 248(%rdi) + adcq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_2048_dbl_32,.-sp_2048_dbl_32 +#endif /* __APPLE__ */ +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
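+ *
+ * Note (descriptive comment added for clarity): this routine squares with
+ * one level of Karatsuba. Splitting a = ah * 2^2048 + al into 32-word
+ * halves, it computes
+ *   a^2 = ah^2 * 2^4096 + ((ah + al)^2 - ah^2 - al^2) * 2^2048 + al^2
+ * via three calls to sp_2048_sqr_32, using a mask built from the carry of
+ * ah + al to fold the extra bit back into the result.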
+ */ +#ifndef __APPLE__ +.text +.globl sp_4096_sqr_64 +.type sp_4096_sqr_64,@function +.align 16 +sp_4096_sqr_64: +#else +.section __TEXT,__text +.globl _sp_4096_sqr_64 +.p2align 4 +_sp_4096_sqr_64: +#endif /* __APPLE__ */ + subq $0x518, %rsp + movq %rdi, 1280(%rsp) + movq %rsi, 1288(%rsp) + leaq 1024(%rsp), %r8 + leaq 256(%rsi), %r9 + # Add + movq (%rsi), %rdx + xorq %rcx, %rcx + addq (%r9), %rdx + movq 8(%rsi), %rax + movq %rdx, (%r8) + adcq 8(%r9), %rax + movq 16(%rsi), %rdx + movq %rax, 8(%r8) + adcq 16(%r9), %rdx + movq 24(%rsi), %rax + movq %rdx, 16(%r8) + adcq 24(%r9), %rax + movq 32(%rsi), %rdx + movq %rax, 24(%r8) + adcq 32(%r9), %rdx + movq 40(%rsi), %rax + movq %rdx, 32(%r8) + adcq 40(%r9), %rax + movq 48(%rsi), %rdx + movq %rax, 40(%r8) + adcq 48(%r9), %rdx + movq 56(%rsi), %rax + movq %rdx, 48(%r8) + adcq 56(%r9), %rax + movq 64(%rsi), %rdx + movq %rax, 56(%r8) + adcq 64(%r9), %rdx + movq 72(%rsi), %rax + movq %rdx, 64(%r8) + adcq 72(%r9), %rax + movq 80(%rsi), %rdx + movq %rax, 72(%r8) + adcq 80(%r9), %rdx + movq 88(%rsi), %rax + movq %rdx, 80(%r8) + adcq 88(%r9), %rax + movq 96(%rsi), %rdx + movq %rax, 88(%r8) + adcq 96(%r9), %rdx + movq 104(%rsi), %rax + movq %rdx, 96(%r8) + adcq 104(%r9), %rax + movq 112(%rsi), %rdx + movq %rax, 104(%r8) + adcq 112(%r9), %rdx + movq 120(%rsi), %rax + movq %rdx, 112(%r8) + adcq 120(%r9), %rax + movq 128(%rsi), %rdx + movq %rax, 120(%r8) + adcq 128(%r9), %rdx + movq 136(%rsi), %rax + movq %rdx, 128(%r8) + adcq 136(%r9), %rax + movq 144(%rsi), %rdx + movq %rax, 136(%r8) + adcq 144(%r9), %rdx + movq 152(%rsi), %rax + movq %rdx, 144(%r8) + adcq 152(%r9), %rax + movq 160(%rsi), %rdx + movq %rax, 152(%r8) + adcq 160(%r9), %rdx + movq 168(%rsi), %rax + movq %rdx, 160(%r8) + adcq 168(%r9), %rax + movq 176(%rsi), %rdx + movq %rax, 168(%r8) + adcq 176(%r9), %rdx + movq 184(%rsi), %rax + movq %rdx, 176(%r8) + adcq 184(%r9), %rax + movq 192(%rsi), %rdx + movq %rax, 184(%r8) + adcq 192(%r9), %rdx + movq 200(%rsi), %rax + movq %rdx, 192(%r8) + adcq 200(%r9), %rax + movq 208(%rsi), %rdx + movq %rax, 200(%r8) + adcq 208(%r9), %rdx + movq 216(%rsi), %rax + movq %rdx, 208(%r8) + adcq 216(%r9), %rax + movq 224(%rsi), %rdx + movq %rax, 216(%r8) + adcq 224(%r9), %rdx + movq 232(%rsi), %rax + movq %rdx, 224(%r8) + adcq 232(%r9), %rax + movq 240(%rsi), %rdx + movq %rax, 232(%r8) + adcq 240(%r9), %rdx + movq 248(%rsi), %rax + movq %rdx, 240(%r8) + adcq 248(%r9), %rax + movq %rax, 248(%r8) + adcq $0x00, %rcx + movq %rcx, 1296(%rsp) + movq %r8, %rsi + movq %rsp, %rdi +#ifndef __APPLE__ + callq sp_2048_sqr_32@plt +#else + callq _sp_2048_sqr_32 +#endif /* __APPLE__ */ + movq 1288(%rsp), %rsi + leaq 512(%rsp), %rdi + addq $0x100, %rsi +#ifndef __APPLE__ + callq sp_2048_sqr_32@plt +#else + callq _sp_2048_sqr_32 +#endif /* __APPLE__ */ + movq 1288(%rsp), %rsi + movq 1280(%rsp), %rdi +#ifndef __APPLE__ + callq sp_2048_sqr_32@plt +#else + callq _sp_2048_sqr_32 +#endif /* __APPLE__ */ +#ifdef _WIN64 + movq 1288(%rsp), %rsi + movq 1280(%rsp), %rdi +#endif /* _WIN64 */ + movq 1296(%rsp), %r10 + leaq 1024(%rsp), %r8 + movq %r10, %rcx + negq %r10 + movq (%r8), %rdx + movq 8(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 512(%rdi) + movq %rax, 520(%rdi) + movq 16(%r8), %rdx + movq 24(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 528(%rdi) + movq %rax, 536(%rdi) + movq 32(%r8), %rdx + movq 40(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 544(%rdi) + movq %rax, 552(%rdi) + movq 48(%r8), %rdx + movq 56(%r8), %rax + andq %r10, %rdx + andq 
%r10, %rax + movq %rdx, 560(%rdi) + movq %rax, 568(%rdi) + movq 64(%r8), %rdx + movq 72(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 576(%rdi) + movq %rax, 584(%rdi) + movq 80(%r8), %rdx + movq 88(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 592(%rdi) + movq %rax, 600(%rdi) + movq 96(%r8), %rdx + movq 104(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 608(%rdi) + movq %rax, 616(%rdi) + movq 112(%r8), %rdx + movq 120(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 624(%rdi) + movq %rax, 632(%rdi) + movq 128(%r8), %rdx + movq 136(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 640(%rdi) + movq %rax, 648(%rdi) + movq 144(%r8), %rdx + movq 152(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 656(%rdi) + movq %rax, 664(%rdi) + movq 160(%r8), %rdx + movq 168(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 672(%rdi) + movq %rax, 680(%rdi) + movq 176(%r8), %rdx + movq 184(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 688(%rdi) + movq %rax, 696(%rdi) + movq 192(%r8), %rdx + movq 200(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 704(%rdi) + movq %rax, 712(%rdi) + movq 208(%r8), %rdx + movq 216(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 720(%rdi) + movq %rax, 728(%rdi) + movq 224(%r8), %rdx + movq 232(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 736(%rdi) + movq %rax, 744(%rdi) + movq 240(%r8), %rdx + movq 248(%r8), %rax + andq %r10, %rdx + andq %r10, %rax + movq %rdx, 752(%rdi) + movq %rax, 760(%rdi) + movq 512(%rdi), %rdx + addq %rdx, %rdx + movq 520(%rdi), %rax + movq %rdx, 512(%rdi) + adcq %rax, %rax + movq 528(%rdi), %rdx + movq %rax, 520(%rdi) + adcq %rdx, %rdx + movq 536(%rdi), %rax + movq %rdx, 528(%rdi) + adcq %rax, %rax + movq 544(%rdi), %rdx + movq %rax, 536(%rdi) + adcq %rdx, %rdx + movq 552(%rdi), %rax + movq %rdx, 544(%rdi) + adcq %rax, %rax + movq 560(%rdi), %rdx + movq %rax, 552(%rdi) + adcq %rdx, %rdx + movq 568(%rdi), %rax + movq %rdx, 560(%rdi) + adcq %rax, %rax + movq 576(%rdi), %rdx + movq %rax, 568(%rdi) + adcq %rdx, %rdx + movq 584(%rdi), %rax + movq %rdx, 576(%rdi) + adcq %rax, %rax + movq 592(%rdi), %rdx + movq %rax, 584(%rdi) + adcq %rdx, %rdx + movq 600(%rdi), %rax + movq %rdx, 592(%rdi) + adcq %rax, %rax + movq 608(%rdi), %rdx + movq %rax, 600(%rdi) + adcq %rdx, %rdx + movq 616(%rdi), %rax + movq %rdx, 608(%rdi) + adcq %rax, %rax + movq 624(%rdi), %rdx + movq %rax, 616(%rdi) + adcq %rdx, %rdx + movq 632(%rdi), %rax + movq %rdx, 624(%rdi) + adcq %rax, %rax + movq 640(%rdi), %rdx + movq %rax, 632(%rdi) + adcq %rdx, %rdx + movq 648(%rdi), %rax + movq %rdx, 640(%rdi) + adcq %rax, %rax + movq 656(%rdi), %rdx + movq %rax, 648(%rdi) + adcq %rdx, %rdx + movq 664(%rdi), %rax + movq %rdx, 656(%rdi) + adcq %rax, %rax + movq 672(%rdi), %rdx + movq %rax, 664(%rdi) + adcq %rdx, %rdx + movq 680(%rdi), %rax + movq %rdx, 672(%rdi) + adcq %rax, %rax + movq 688(%rdi), %rdx + movq %rax, 680(%rdi) + adcq %rdx, %rdx + movq 696(%rdi), %rax + movq %rdx, 688(%rdi) + adcq %rax, %rax + movq 704(%rdi), %rdx + movq %rax, 696(%rdi) + adcq %rdx, %rdx + movq 712(%rdi), %rax + movq %rdx, 704(%rdi) + adcq %rax, %rax + movq 720(%rdi), %rdx + movq %rax, 712(%rdi) + adcq %rdx, %rdx + movq 728(%rdi), %rax + movq %rdx, 720(%rdi) + adcq %rax, %rax + movq 736(%rdi), %rdx + movq %rax, 728(%rdi) + adcq %rdx, %rdx + movq 744(%rdi), %rax + movq %rdx, 736(%rdi) + adcq %rax, %rax + movq 752(%rdi), %rdx + movq %rax, 744(%rdi) + adcq %rdx, %rdx + movq 760(%rdi), %rax + movq 
%rdx, 752(%rdi) + adcq %rax, %rax + movq %rax, 760(%rdi) + adcq $0x00, %rcx + leaq 512(%rsp), %rsi + movq %rsp, %r8 + movq (%r8), %rdx + subq (%rsi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rsi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rsi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rsi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rsi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rsi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rsi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rsi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rsi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rsi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rsi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rsi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rsi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rsi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rsi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rsi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rsi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rsi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rsi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rsi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rsi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rsi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rsi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rsi), %rax + movq 192(%r8), %rdx + movq %rax, 184(%r8) + sbbq 192(%rsi), %rdx + movq 200(%r8), %rax + movq %rdx, 192(%r8) + sbbq 200(%rsi), %rax + movq 208(%r8), %rdx + movq %rax, 200(%r8) + sbbq 208(%rsi), %rdx + movq 216(%r8), %rax + movq %rdx, 208(%r8) + sbbq 216(%rsi), %rax + movq 224(%r8), %rdx + movq %rax, 216(%r8) + sbbq 224(%rsi), %rdx + movq 232(%r8), %rax + movq %rdx, 224(%r8) + sbbq 232(%rsi), %rax + movq 240(%r8), %rdx + movq %rax, 232(%r8) + sbbq 240(%rsi), %rdx + movq 248(%r8), %rax + movq %rdx, 240(%r8) + sbbq 248(%rsi), %rax + movq 256(%r8), %rdx + movq %rax, 248(%r8) + sbbq 256(%rsi), %rdx + movq 264(%r8), %rax + movq %rdx, 256(%r8) + sbbq 264(%rsi), %rax + movq 272(%r8), %rdx + movq %rax, 264(%r8) + sbbq 272(%rsi), %rdx + movq 280(%r8), %rax + movq %rdx, 272(%r8) + sbbq 280(%rsi), %rax + movq 288(%r8), %rdx + movq %rax, 280(%r8) + sbbq 288(%rsi), %rdx + movq 296(%r8), %rax + movq %rdx, 288(%r8) + sbbq 296(%rsi), %rax + movq 304(%r8), %rdx + movq %rax, 296(%r8) + sbbq 304(%rsi), %rdx + movq 312(%r8), %rax + movq %rdx, 304(%r8) + sbbq 312(%rsi), %rax + movq 320(%r8), %rdx + movq %rax, 312(%r8) + sbbq 320(%rsi), %rdx + movq 328(%r8), %rax + movq %rdx, 320(%r8) + sbbq 328(%rsi), %rax + movq 336(%r8), %rdx + movq %rax, 328(%r8) + sbbq 336(%rsi), %rdx + movq 344(%r8), %rax + movq %rdx, 336(%r8) + sbbq 344(%rsi), %rax + movq 352(%r8), %rdx + movq %rax, 344(%r8) + sbbq 352(%rsi), %rdx + movq 360(%r8), %rax + movq %rdx, 352(%r8) + sbbq 360(%rsi), %rax + movq 368(%r8), %rdx + movq %rax, 360(%r8) + sbbq 368(%rsi), %rdx + movq 376(%r8), %rax + movq %rdx, 368(%r8) + sbbq 376(%rsi), %rax + movq 384(%r8), %rdx + movq %rax, 376(%r8) + sbbq 384(%rsi), %rdx + movq 392(%r8), %rax + movq %rdx, 384(%r8) + sbbq 392(%rsi), %rax + movq 400(%r8), %rdx + movq %rax, 392(%r8) + sbbq 400(%rsi), %rdx + movq 408(%r8), %rax + movq %rdx, 400(%r8) + sbbq 408(%rsi), %rax + movq 416(%r8), %rdx + 
movq %rax, 408(%r8) + sbbq 416(%rsi), %rdx + movq 424(%r8), %rax + movq %rdx, 416(%r8) + sbbq 424(%rsi), %rax + movq 432(%r8), %rdx + movq %rax, 424(%r8) + sbbq 432(%rsi), %rdx + movq 440(%r8), %rax + movq %rdx, 432(%r8) + sbbq 440(%rsi), %rax + movq 448(%r8), %rdx + movq %rax, 440(%r8) + sbbq 448(%rsi), %rdx + movq 456(%r8), %rax + movq %rdx, 448(%r8) + sbbq 456(%rsi), %rax + movq 464(%r8), %rdx + movq %rax, 456(%r8) + sbbq 464(%rsi), %rdx + movq 472(%r8), %rax + movq %rdx, 464(%r8) + sbbq 472(%rsi), %rax + movq 480(%r8), %rdx + movq %rax, 472(%r8) + sbbq 480(%rsi), %rdx + movq 488(%r8), %rax + movq %rdx, 480(%r8) + sbbq 488(%rsi), %rax + movq 496(%r8), %rdx + movq %rax, 488(%r8) + sbbq 496(%rsi), %rdx + movq 504(%r8), %rax + movq %rdx, 496(%r8) + sbbq 504(%rsi), %rax + movq %rax, 504(%r8) + sbbq $0x00, %rcx + movq (%r8), %rdx + subq (%rdi), %rdx + movq 8(%r8), %rax + movq %rdx, (%r8) + sbbq 8(%rdi), %rax + movq 16(%r8), %rdx + movq %rax, 8(%r8) + sbbq 16(%rdi), %rdx + movq 24(%r8), %rax + movq %rdx, 16(%r8) + sbbq 24(%rdi), %rax + movq 32(%r8), %rdx + movq %rax, 24(%r8) + sbbq 32(%rdi), %rdx + movq 40(%r8), %rax + movq %rdx, 32(%r8) + sbbq 40(%rdi), %rax + movq 48(%r8), %rdx + movq %rax, 40(%r8) + sbbq 48(%rdi), %rdx + movq 56(%r8), %rax + movq %rdx, 48(%r8) + sbbq 56(%rdi), %rax + movq 64(%r8), %rdx + movq %rax, 56(%r8) + sbbq 64(%rdi), %rdx + movq 72(%r8), %rax + movq %rdx, 64(%r8) + sbbq 72(%rdi), %rax + movq 80(%r8), %rdx + movq %rax, 72(%r8) + sbbq 80(%rdi), %rdx + movq 88(%r8), %rax + movq %rdx, 80(%r8) + sbbq 88(%rdi), %rax + movq 96(%r8), %rdx + movq %rax, 88(%r8) + sbbq 96(%rdi), %rdx + movq 104(%r8), %rax + movq %rdx, 96(%r8) + sbbq 104(%rdi), %rax + movq 112(%r8), %rdx + movq %rax, 104(%r8) + sbbq 112(%rdi), %rdx + movq 120(%r8), %rax + movq %rdx, 112(%r8) + sbbq 120(%rdi), %rax + movq 128(%r8), %rdx + movq %rax, 120(%r8) + sbbq 128(%rdi), %rdx + movq 136(%r8), %rax + movq %rdx, 128(%r8) + sbbq 136(%rdi), %rax + movq 144(%r8), %rdx + movq %rax, 136(%r8) + sbbq 144(%rdi), %rdx + movq 152(%r8), %rax + movq %rdx, 144(%r8) + sbbq 152(%rdi), %rax + movq 160(%r8), %rdx + movq %rax, 152(%r8) + sbbq 160(%rdi), %rdx + movq 168(%r8), %rax + movq %rdx, 160(%r8) + sbbq 168(%rdi), %rax + movq 176(%r8), %rdx + movq %rax, 168(%r8) + sbbq 176(%rdi), %rdx + movq 184(%r8), %rax + movq %rdx, 176(%r8) + sbbq 184(%rdi), %rax + movq 192(%r8), %rdx + movq %rax, 184(%r8) + sbbq 192(%rdi), %rdx + movq 200(%r8), %rax + movq %rdx, 192(%r8) + sbbq 200(%rdi), %rax + movq 208(%r8), %rdx + movq %rax, 200(%r8) + sbbq 208(%rdi), %rdx + movq 216(%r8), %rax + movq %rdx, 208(%r8) + sbbq 216(%rdi), %rax + movq 224(%r8), %rdx + movq %rax, 216(%r8) + sbbq 224(%rdi), %rdx + movq 232(%r8), %rax + movq %rdx, 224(%r8) + sbbq 232(%rdi), %rax + movq 240(%r8), %rdx + movq %rax, 232(%r8) + sbbq 240(%rdi), %rdx + movq 248(%r8), %rax + movq %rdx, 240(%r8) + sbbq 248(%rdi), %rax + movq 256(%r8), %rdx + movq %rax, 248(%r8) + sbbq 256(%rdi), %rdx + movq 264(%r8), %rax + movq %rdx, 256(%r8) + sbbq 264(%rdi), %rax + movq 272(%r8), %rdx + movq %rax, 264(%r8) + sbbq 272(%rdi), %rdx + movq 280(%r8), %rax + movq %rdx, 272(%r8) + sbbq 280(%rdi), %rax + movq 288(%r8), %rdx + movq %rax, 280(%r8) + sbbq 288(%rdi), %rdx + movq 296(%r8), %rax + movq %rdx, 288(%r8) + sbbq 296(%rdi), %rax + movq 304(%r8), %rdx + movq %rax, 296(%r8) + sbbq 304(%rdi), %rdx + movq 312(%r8), %rax + movq %rdx, 304(%r8) + sbbq 312(%rdi), %rax + movq 320(%r8), %rdx + movq %rax, 312(%r8) + sbbq 320(%rdi), %rdx + movq 328(%r8), %rax + movq %rdx, 320(%r8) + sbbq 
328(%rdi), %rax + movq 336(%r8), %rdx + movq %rax, 328(%r8) + sbbq 336(%rdi), %rdx + movq 344(%r8), %rax + movq %rdx, 336(%r8) + sbbq 344(%rdi), %rax + movq 352(%r8), %rdx + movq %rax, 344(%r8) + sbbq 352(%rdi), %rdx + movq 360(%r8), %rax + movq %rdx, 352(%r8) + sbbq 360(%rdi), %rax + movq 368(%r8), %rdx + movq %rax, 360(%r8) + sbbq 368(%rdi), %rdx + movq 376(%r8), %rax + movq %rdx, 368(%r8) + sbbq 376(%rdi), %rax + movq 384(%r8), %rdx + movq %rax, 376(%r8) + sbbq 384(%rdi), %rdx + movq 392(%r8), %rax + movq %rdx, 384(%r8) + sbbq 392(%rdi), %rax + movq 400(%r8), %rdx + movq %rax, 392(%r8) + sbbq 400(%rdi), %rdx + movq 408(%r8), %rax + movq %rdx, 400(%r8) + sbbq 408(%rdi), %rax + movq 416(%r8), %rdx + movq %rax, 408(%r8) + sbbq 416(%rdi), %rdx + movq 424(%r8), %rax + movq %rdx, 416(%r8) + sbbq 424(%rdi), %rax + movq 432(%r8), %rdx + movq %rax, 424(%r8) + sbbq 432(%rdi), %rdx + movq 440(%r8), %rax + movq %rdx, 432(%r8) + sbbq 440(%rdi), %rax + movq 448(%r8), %rdx + movq %rax, 440(%r8) + sbbq 448(%rdi), %rdx + movq 456(%r8), %rax + movq %rdx, 448(%r8) + sbbq 456(%rdi), %rax + movq 464(%r8), %rdx + movq %rax, 456(%r8) + sbbq 464(%rdi), %rdx + movq 472(%r8), %rax + movq %rdx, 464(%r8) + sbbq 472(%rdi), %rax + movq 480(%r8), %rdx + movq %rax, 472(%r8) + sbbq 480(%rdi), %rdx + movq 488(%r8), %rax + movq %rdx, 480(%r8) + sbbq 488(%rdi), %rax + movq 496(%r8), %rdx + movq %rax, 488(%r8) + sbbq 496(%rdi), %rdx + movq 504(%r8), %rax + movq %rdx, 496(%r8) + sbbq 504(%rdi), %rax + movq %rax, 504(%r8) + sbbq $0x00, %rcx + # Add in place + movq 256(%rdi), %rdx + addq (%r8), %rdx + movq 264(%rdi), %rax + movq %rdx, 256(%rdi) + adcq 8(%r8), %rax + movq 272(%rdi), %rdx + movq %rax, 264(%rdi) + adcq 16(%r8), %rdx + movq 280(%rdi), %rax + movq %rdx, 272(%rdi) + adcq 24(%r8), %rax + movq 288(%rdi), %rdx + movq %rax, 280(%rdi) + adcq 32(%r8), %rdx + movq 296(%rdi), %rax + movq %rdx, 288(%rdi) + adcq 40(%r8), %rax + movq 304(%rdi), %rdx + movq %rax, 296(%rdi) + adcq 48(%r8), %rdx + movq 312(%rdi), %rax + movq %rdx, 304(%rdi) + adcq 56(%r8), %rax + movq 320(%rdi), %rdx + movq %rax, 312(%rdi) + adcq 64(%r8), %rdx + movq 328(%rdi), %rax + movq %rdx, 320(%rdi) + adcq 72(%r8), %rax + movq 336(%rdi), %rdx + movq %rax, 328(%rdi) + adcq 80(%r8), %rdx + movq 344(%rdi), %rax + movq %rdx, 336(%rdi) + adcq 88(%r8), %rax + movq 352(%rdi), %rdx + movq %rax, 344(%rdi) + adcq 96(%r8), %rdx + movq 360(%rdi), %rax + movq %rdx, 352(%rdi) + adcq 104(%r8), %rax + movq 368(%rdi), %rdx + movq %rax, 360(%rdi) + adcq 112(%r8), %rdx + movq 376(%rdi), %rax + movq %rdx, 368(%rdi) + adcq 120(%r8), %rax + movq 384(%rdi), %rdx + movq %rax, 376(%rdi) + adcq 128(%r8), %rdx + movq 392(%rdi), %rax + movq %rdx, 384(%rdi) + adcq 136(%r8), %rax + movq 400(%rdi), %rdx + movq %rax, 392(%rdi) + adcq 144(%r8), %rdx + movq 408(%rdi), %rax + movq %rdx, 400(%rdi) + adcq 152(%r8), %rax + movq 416(%rdi), %rdx + movq %rax, 408(%rdi) + adcq 160(%r8), %rdx + movq 424(%rdi), %rax + movq %rdx, 416(%rdi) + adcq 168(%r8), %rax + movq 432(%rdi), %rdx + movq %rax, 424(%rdi) + adcq 176(%r8), %rdx + movq 440(%rdi), %rax + movq %rdx, 432(%rdi) + adcq 184(%r8), %rax + movq 448(%rdi), %rdx + movq %rax, 440(%rdi) + adcq 192(%r8), %rdx + movq 456(%rdi), %rax + movq %rdx, 448(%rdi) + adcq 200(%r8), %rax + movq 464(%rdi), %rdx + movq %rax, 456(%rdi) + adcq 208(%r8), %rdx + movq 472(%rdi), %rax + movq %rdx, 464(%rdi) + adcq 216(%r8), %rax + movq 480(%rdi), %rdx + movq %rax, 472(%rdi) + adcq 224(%r8), %rdx + movq 488(%rdi), %rax + movq %rdx, 480(%rdi) + adcq 232(%r8), %rax + 
movq 496(%rdi), %rdx + movq %rax, 488(%rdi) + adcq 240(%r8), %rdx + movq 504(%rdi), %rax + movq %rdx, 496(%rdi) + adcq 248(%r8), %rax + movq 512(%rdi), %rdx + movq %rax, 504(%rdi) + adcq 256(%r8), %rdx + movq 520(%rdi), %rax + movq %rdx, 512(%rdi) + adcq 264(%r8), %rax + movq 528(%rdi), %rdx + movq %rax, 520(%rdi) + adcq 272(%r8), %rdx + movq 536(%rdi), %rax + movq %rdx, 528(%rdi) + adcq 280(%r8), %rax + movq 544(%rdi), %rdx + movq %rax, 536(%rdi) + adcq 288(%r8), %rdx + movq 552(%rdi), %rax + movq %rdx, 544(%rdi) + adcq 296(%r8), %rax + movq 560(%rdi), %rdx + movq %rax, 552(%rdi) + adcq 304(%r8), %rdx + movq 568(%rdi), %rax + movq %rdx, 560(%rdi) + adcq 312(%r8), %rax + movq 576(%rdi), %rdx + movq %rax, 568(%rdi) + adcq 320(%r8), %rdx + movq 584(%rdi), %rax + movq %rdx, 576(%rdi) + adcq 328(%r8), %rax + movq 592(%rdi), %rdx + movq %rax, 584(%rdi) + adcq 336(%r8), %rdx + movq 600(%rdi), %rax + movq %rdx, 592(%rdi) + adcq 344(%r8), %rax + movq 608(%rdi), %rdx + movq %rax, 600(%rdi) + adcq 352(%r8), %rdx + movq 616(%rdi), %rax + movq %rdx, 608(%rdi) + adcq 360(%r8), %rax + movq 624(%rdi), %rdx + movq %rax, 616(%rdi) + adcq 368(%r8), %rdx + movq 632(%rdi), %rax + movq %rdx, 624(%rdi) + adcq 376(%r8), %rax + movq 640(%rdi), %rdx + movq %rax, 632(%rdi) + adcq 384(%r8), %rdx + movq 648(%rdi), %rax + movq %rdx, 640(%rdi) + adcq 392(%r8), %rax + movq 656(%rdi), %rdx + movq %rax, 648(%rdi) + adcq 400(%r8), %rdx + movq 664(%rdi), %rax + movq %rdx, 656(%rdi) + adcq 408(%r8), %rax + movq 672(%rdi), %rdx + movq %rax, 664(%rdi) + adcq 416(%r8), %rdx + movq 680(%rdi), %rax + movq %rdx, 672(%rdi) + adcq 424(%r8), %rax + movq 688(%rdi), %rdx + movq %rax, 680(%rdi) + adcq 432(%r8), %rdx + movq 696(%rdi), %rax + movq %rdx, 688(%rdi) + adcq 440(%r8), %rax + movq 704(%rdi), %rdx + movq %rax, 696(%rdi) + adcq 448(%r8), %rdx + movq 712(%rdi), %rax + movq %rdx, 704(%rdi) + adcq 456(%r8), %rax + movq 720(%rdi), %rdx + movq %rax, 712(%rdi) + adcq 464(%r8), %rdx + movq 728(%rdi), %rax + movq %rdx, 720(%rdi) + adcq 472(%r8), %rax + movq 736(%rdi), %rdx + movq %rax, 728(%rdi) + adcq 480(%r8), %rdx + movq 744(%rdi), %rax + movq %rdx, 736(%rdi) + adcq 488(%r8), %rax + movq 752(%rdi), %rdx + movq %rax, 744(%rdi) + adcq 496(%r8), %rdx + movq 760(%rdi), %rax + movq %rdx, 752(%rdi) + adcq 504(%r8), %rax + movq %rax, 760(%rdi) + adcq $0x00, %rcx + movq %rcx, 768(%rdi) + # Add in place + movq 512(%rdi), %rdx + xorq %rcx, %rcx + addq (%rsi), %rdx + movq 520(%rdi), %rax + movq %rdx, 512(%rdi) + adcq 8(%rsi), %rax + movq 528(%rdi), %rdx + movq %rax, 520(%rdi) + adcq 16(%rsi), %rdx + movq 536(%rdi), %rax + movq %rdx, 528(%rdi) + adcq 24(%rsi), %rax + movq 544(%rdi), %rdx + movq %rax, 536(%rdi) + adcq 32(%rsi), %rdx + movq 552(%rdi), %rax + movq %rdx, 544(%rdi) + adcq 40(%rsi), %rax + movq 560(%rdi), %rdx + movq %rax, 552(%rdi) + adcq 48(%rsi), %rdx + movq 568(%rdi), %rax + movq %rdx, 560(%rdi) + adcq 56(%rsi), %rax + movq 576(%rdi), %rdx + movq %rax, 568(%rdi) + adcq 64(%rsi), %rdx + movq 584(%rdi), %rax + movq %rdx, 576(%rdi) + adcq 72(%rsi), %rax + movq 592(%rdi), %rdx + movq %rax, 584(%rdi) + adcq 80(%rsi), %rdx + movq 600(%rdi), %rax + movq %rdx, 592(%rdi) + adcq 88(%rsi), %rax + movq 608(%rdi), %rdx + movq %rax, 600(%rdi) + adcq 96(%rsi), %rdx + movq 616(%rdi), %rax + movq %rdx, 608(%rdi) + adcq 104(%rsi), %rax + movq 624(%rdi), %rdx + movq %rax, 616(%rdi) + adcq 112(%rsi), %rdx + movq 632(%rdi), %rax + movq %rdx, 624(%rdi) + adcq 120(%rsi), %rax + movq 640(%rdi), %rdx + movq %rax, 632(%rdi) + adcq 128(%rsi), %rdx + movq 
648(%rdi), %rax + movq %rdx, 640(%rdi) + adcq 136(%rsi), %rax + movq 656(%rdi), %rdx + movq %rax, 648(%rdi) + adcq 144(%rsi), %rdx + movq 664(%rdi), %rax + movq %rdx, 656(%rdi) + adcq 152(%rsi), %rax + movq 672(%rdi), %rdx + movq %rax, 664(%rdi) + adcq 160(%rsi), %rdx + movq 680(%rdi), %rax + movq %rdx, 672(%rdi) + adcq 168(%rsi), %rax + movq 688(%rdi), %rdx + movq %rax, 680(%rdi) + adcq 176(%rsi), %rdx + movq 696(%rdi), %rax + movq %rdx, 688(%rdi) + adcq 184(%rsi), %rax + movq 704(%rdi), %rdx + movq %rax, 696(%rdi) + adcq 192(%rsi), %rdx + movq 712(%rdi), %rax + movq %rdx, 704(%rdi) + adcq 200(%rsi), %rax + movq 720(%rdi), %rdx + movq %rax, 712(%rdi) + adcq 208(%rsi), %rdx + movq 728(%rdi), %rax + movq %rdx, 720(%rdi) + adcq 216(%rsi), %rax + movq 736(%rdi), %rdx + movq %rax, 728(%rdi) + adcq 224(%rsi), %rdx + movq 744(%rdi), %rax + movq %rdx, 736(%rdi) + adcq 232(%rsi), %rax + movq 752(%rdi), %rdx + movq %rax, 744(%rdi) + adcq 240(%rsi), %rdx + movq 760(%rdi), %rax + movq %rdx, 752(%rdi) + adcq 248(%rsi), %rax + movq 768(%rdi), %rdx + movq %rax, 760(%rdi) + adcq 256(%rsi), %rdx + movq %rdx, 768(%rdi) + adcq $0x00, %rcx + # Add to zero + movq 264(%rsi), %rdx + adcq $0x00, %rdx + movq 272(%rsi), %rax + movq %rdx, 776(%rdi) + adcq $0x00, %rax + movq 280(%rsi), %rdx + movq %rax, 784(%rdi) + adcq $0x00, %rdx + movq 288(%rsi), %rax + movq %rdx, 792(%rdi) + adcq $0x00, %rax + movq 296(%rsi), %rdx + movq %rax, 800(%rdi) + adcq $0x00, %rdx + movq 304(%rsi), %rax + movq %rdx, 808(%rdi) + adcq $0x00, %rax + movq 312(%rsi), %rdx + movq %rax, 816(%rdi) + adcq $0x00, %rdx + movq 320(%rsi), %rax + movq %rdx, 824(%rdi) + adcq $0x00, %rax + movq 328(%rsi), %rdx + movq %rax, 832(%rdi) + adcq $0x00, %rdx + movq 336(%rsi), %rax + movq %rdx, 840(%rdi) + adcq $0x00, %rax + movq 344(%rsi), %rdx + movq %rax, 848(%rdi) + adcq $0x00, %rdx + movq 352(%rsi), %rax + movq %rdx, 856(%rdi) + adcq $0x00, %rax + movq 360(%rsi), %rdx + movq %rax, 864(%rdi) + adcq $0x00, %rdx + movq 368(%rsi), %rax + movq %rdx, 872(%rdi) + adcq $0x00, %rax + movq 376(%rsi), %rdx + movq %rax, 880(%rdi) + adcq $0x00, %rdx + movq 384(%rsi), %rax + movq %rdx, 888(%rdi) + adcq $0x00, %rax + movq 392(%rsi), %rdx + movq %rax, 896(%rdi) + adcq $0x00, %rdx + movq 400(%rsi), %rax + movq %rdx, 904(%rdi) + adcq $0x00, %rax + movq 408(%rsi), %rdx + movq %rax, 912(%rdi) + adcq $0x00, %rdx + movq 416(%rsi), %rax + movq %rdx, 920(%rdi) + adcq $0x00, %rax + movq 424(%rsi), %rdx + movq %rax, 928(%rdi) + adcq $0x00, %rdx + movq 432(%rsi), %rax + movq %rdx, 936(%rdi) + adcq $0x00, %rax + movq 440(%rsi), %rdx + movq %rax, 944(%rdi) + adcq $0x00, %rdx + movq 448(%rsi), %rax + movq %rdx, 952(%rdi) + adcq $0x00, %rax + movq 456(%rsi), %rdx + movq %rax, 960(%rdi) + adcq $0x00, %rdx + movq 464(%rsi), %rax + movq %rdx, 968(%rdi) + adcq $0x00, %rax + movq 472(%rsi), %rdx + movq %rax, 976(%rdi) + adcq $0x00, %rdx + movq 480(%rsi), %rax + movq %rdx, 984(%rdi) + adcq $0x00, %rax + movq 488(%rsi), %rdx + movq %rax, 992(%rdi) + adcq $0x00, %rdx + movq 496(%rsi), %rax + movq %rdx, 1000(%rdi) + adcq $0x00, %rax + movq 504(%rsi), %rdx + movq %rax, 1008(%rdi) + adcq $0x00, %rdx + movq %rdx, 1016(%rdi) + addq $0x518, %rsp + repz retq +#ifndef __APPLE__ +.size sp_4096_sqr_64,.-sp_4096_sqr_64 +#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. 
(r = a * a) * diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index 24e69c373..b991cfbda 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -2035,1094 +2035,6 @@ sp_2048_mul_16 PROC ret sp_2048_mul_16 ENDP _text ENDS -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_sqr_16 PROC - push r12 - push r13 - push r14 - mov r8, rdx - sub rsp, 128 - ; A[0] * A[0] - mov rax, QWORD PTR [r8] - mul rax - xor r11, r11 - mov QWORD PTR [rsp], rax - mov r10, rdx - ; A[0] * A[1] - mov rax, QWORD PTR [r8+8] - mul QWORD PTR [r8] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rsp+8], r10 - ; A[0] * A[2] - mov rax, QWORD PTR [r8+16] - mul QWORD PTR [r8] - xor r10, r10 - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - ; A[1] * A[1] - mov rax, QWORD PTR [r8+8] - mul rax - add r11, rax - adc r9, rdx - adc r10, 0 - mov QWORD PTR [rsp+16], r11 - ; A[0] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8] - xor r11, r11 - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - ; A[1] * A[2] - mov rax, QWORD PTR [r8+16] - mul QWORD PTR [r8+8] - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - mov QWORD PTR [rsp+24], r9 - ; A[0] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[1] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8+8] - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[2] * A[2] - mov rax, QWORD PTR [r8+16] - mul rax - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rsp+32], r10 - ; A[0] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+40], r11 - ; A[0] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[3] - mov rax, QWORD PTR [r8+24] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+48], r9 - ; A[0] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rsp+56], r10 - ; A[0] * A[8] - mov rax, QWORD PTR 
[r8+64] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[4] - mov rax, QWORD PTR [r8+32] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+64], r11 - ; A[0] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+72], r9 - ; A[0] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[5] - mov rax, QWORD PTR [r8+40] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rsp+80], r10 - ; A[0] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+88], r11 - ; A[0] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[7] - mov rax, QWORD PTR [r8+56] - mul 
QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[6] - mov rax, QWORD PTR [r8+48] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+96], r9 - ; A[0] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rsp+104], r10 - ; A[0] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[7] - mov rax, QWORD PTR [r8+56] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+112], r11 - ; A[0] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+120], r9 - ; A[1] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[2] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[13] - mov rax, QWORD PTR [r8+104] - mul 
QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[8] - mov rax, QWORD PTR [r8+64] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+128], r10 - ; A[2] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+16] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[3] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rcx+136], r11 - ; A[3] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+24] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[4] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[9] * A[9] - mov rax, QWORD PTR [r8+72] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rcx+144], r9 - ; A[4] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+32] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[5] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[9] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+72] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+152], r10 - ; A[5] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+40] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[6] * A[14] - mov rax, 
QWORD PTR [r8+112] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[9] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+72] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[10] * A[10] - mov rax, QWORD PTR [r8+80] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rcx+160], r11 - ; A[6] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+48] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[7] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[9] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+72] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[10] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+80] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rcx+168], r9 - ; A[7] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+56] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[8] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[9] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+72] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[10] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+80] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[11] * A[11] - mov rax, QWORD PTR [r8+88] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+176], r10 - ; A[8] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+64] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[9] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+72] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[10] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+80] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[11] * A[12] - mov rax, QWORD PTR [r8+96] - mul QWORD PTR [r8+88] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rcx+184], r11 - ; A[9] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+72] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[10] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+80] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[11] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+88] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[12] * A[12] - mov rax, QWORD PTR [r8+96] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rcx+192], r9 - ; A[10] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+80] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[11] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+88] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[12] * A[13] - mov rax, QWORD PTR [r8+104] - mul QWORD PTR [r8+96] - add r12, rax - adc 
r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+200], r10 - ; A[11] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+88] - xor r10, r10 - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - ; A[12] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+96] - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - ; A[13] * A[13] - mov rax, QWORD PTR [r8+104] - mul rax - add r11, rax - adc r9, rdx - adc r10, 0 - mov QWORD PTR [rcx+208], r11 - ; A[12] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+96] - xor r11, r11 - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - ; A[13] * A[14] - mov rax, QWORD PTR [r8+112] - mul QWORD PTR [r8+104] - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - mov QWORD PTR [rcx+216], r9 - ; A[13] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+104] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[14] * A[14] - mov rax, QWORD PTR [r8+112] - mul rax - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rcx+224], r10 - ; A[14] * A[15] - mov rax, QWORD PTR [r8+120] - mul QWORD PTR [r8+112] - xor r10, r10 - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - mov QWORD PTR [rcx+232], r11 - ; A[15] * A[15] - mov rax, QWORD PTR [r8+120] - mul rax - add r9, rax - adc r10, rdx - mov QWORD PTR [rcx+240], r9 - mov QWORD PTR [rcx+248], r10 - mov rax, QWORD PTR [rsp] - mov rdx, QWORD PTR [rsp+8] - mov r12, QWORD PTR [rsp+16] - mov r13, QWORD PTR [rsp+24] - mov QWORD PTR [rcx], rax - mov QWORD PTR [rcx+8], rdx - mov QWORD PTR [rcx+16], r12 - mov QWORD PTR [rcx+24], r13 - mov rax, QWORD PTR [rsp+32] - mov rdx, QWORD PTR [rsp+40] - mov r12, QWORD PTR [rsp+48] - mov r13, QWORD PTR [rsp+56] - mov QWORD PTR [rcx+32], rax - mov QWORD PTR [rcx+40], rdx - mov QWORD PTR [rcx+48], r12 - mov QWORD PTR [rcx+56], r13 - mov rax, QWORD PTR [rsp+64] - mov rdx, QWORD PTR [rsp+72] - mov r12, QWORD PTR [rsp+80] - mov r13, QWORD PTR [rsp+88] - mov QWORD PTR [rcx+64], rax - mov QWORD PTR [rcx+72], rdx - mov QWORD PTR [rcx+80], r12 - mov QWORD PTR [rcx+88], r13 - mov rax, QWORD PTR [rsp+96] - mov rdx, QWORD PTR [rsp+104] - mov r12, QWORD PTR [rsp+112] - mov r13, QWORD PTR [rsp+120] - mov QWORD PTR [rcx+96], rax - mov QWORD PTR [rcx+104], rdx - mov QWORD PTR [rcx+112], r12 - mov QWORD PTR [rcx+120], r13 - add rsp, 128 - pop r14 - pop r13 - pop r12 - ret -sp_2048_sqr_16 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Multiply a and b into r. (r = a * b) ; * @@ -4795,6 +3707,2742 @@ L_end_2048_mul_avx2_16: sp_2048_mul_avx2_16 ENDP _text ENDS ENDIF +; /* Add b to a into r. (r = a + b) +; * +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. 
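+; *
+; * Note (descriptive comment added for clarity): arguments follow the
+; * Windows x64 convention used in this file (r in rcx, a in rdx, b in r8);
+; * the 16 words are added with a rippling carry and the final carry is
+; * returned in rax.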
+; */ +_text SEGMENT READONLY PARA +sp_2048_add_16 PROC + ; Add + mov r9, QWORD PTR [rdx] + xor rax, rax + add r9, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r9 + adc r10, QWORD PTR [r8+8] + mov r9, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r10 + adc r9, QWORD PTR [r8+16] + mov r10, QWORD PTR [rdx+24] + mov QWORD PTR [rcx+16], r9 + adc r10, QWORD PTR [r8+24] + mov r9, QWORD PTR [rdx+32] + mov QWORD PTR [rcx+24], r10 + adc r9, QWORD PTR [r8+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r9 + adc r10, QWORD PTR [r8+40] + mov r9, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r10 + adc r9, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r9 + adc r10, QWORD PTR [r8+56] + mov r9, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r10 + adc r9, QWORD PTR [r8+64] + mov r10, QWORD PTR [rdx+72] + mov QWORD PTR [rcx+64], r9 + adc r10, QWORD PTR [r8+72] + mov r9, QWORD PTR [rdx+80] + mov QWORD PTR [rcx+72], r10 + adc r9, QWORD PTR [r8+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r9 + adc r10, QWORD PTR [r8+88] + mov r9, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r10 + adc r9, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r9 + adc r10, QWORD PTR [r8+104] + mov r9, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r10 + adc r9, QWORD PTR [r8+112] + mov r10, QWORD PTR [rdx+120] + mov QWORD PTR [rcx+112], r9 + adc r10, QWORD PTR [r8+120] + mov QWORD PTR [rcx+120], r10 + adc rax, 0 + ret +sp_2048_add_16 ENDP +_text ENDS +; /* Sub b from a into a. (a -= b) +; * +; * a A single precision integer and result. +; * b A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_2048_sub_in_place_32 PROC + mov r8, QWORD PTR [rcx] + xor rax, rax + sub r8, QWORD PTR [rdx] + mov r9, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb r9, QWORD PTR [rdx+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], r9 + sbb r8, QWORD PTR [rdx+16] + mov r9, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb r9, QWORD PTR [rdx+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], r9 + sbb r8, QWORD PTR [rdx+32] + mov r9, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb r9, QWORD PTR [rdx+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], r9 + sbb r8, QWORD PTR [rdx+48] + mov r9, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb r9, QWORD PTR [rdx+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], r9 + sbb r8, QWORD PTR [rdx+64] + mov r9, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb r9, QWORD PTR [rdx+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], r9 + sbb r8, QWORD PTR [rdx+80] + mov r9, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb r9, QWORD PTR [rdx+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], r9 + sbb r8, QWORD PTR [rdx+96] + mov r9, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb r9, QWORD PTR [rdx+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], r9 + sbb r8, QWORD PTR [rdx+112] + mov r9, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb r9, QWORD PTR [rdx+120] + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], r9 + sbb r8, QWORD PTR [rdx+128] + mov r9, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 + sbb r9, QWORD PTR [rdx+136] + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], r9 + sbb r8, QWORD PTR [rdx+144] + mov r9, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 + sbb r9, QWORD PTR [rdx+152] + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], r9 + sbb r8, QWORD PTR [rdx+160] + mov r9, QWORD 
PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 + sbb r9, QWORD PTR [rdx+168] + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], r9 + sbb r8, QWORD PTR [rdx+176] + mov r9, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 + sbb r9, QWORD PTR [rdx+184] + mov r8, QWORD PTR [rcx+192] + mov QWORD PTR [rcx+184], r9 + sbb r8, QWORD PTR [rdx+192] + mov r9, QWORD PTR [rcx+200] + mov QWORD PTR [rcx+192], r8 + sbb r9, QWORD PTR [rdx+200] + mov r8, QWORD PTR [rcx+208] + mov QWORD PTR [rcx+200], r9 + sbb r8, QWORD PTR [rdx+208] + mov r9, QWORD PTR [rcx+216] + mov QWORD PTR [rcx+208], r8 + sbb r9, QWORD PTR [rdx+216] + mov r8, QWORD PTR [rcx+224] + mov QWORD PTR [rcx+216], r9 + sbb r8, QWORD PTR [rdx+224] + mov r9, QWORD PTR [rcx+232] + mov QWORD PTR [rcx+224], r8 + sbb r9, QWORD PTR [rdx+232] + mov r8, QWORD PTR [rcx+240] + mov QWORD PTR [rcx+232], r9 + sbb r8, QWORD PTR [rdx+240] + mov r9, QWORD PTR [rcx+248] + mov QWORD PTR [rcx+240], r8 + sbb r9, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+248], r9 + sbb rax, 0 + ret +sp_2048_sub_in_place_32 ENDP +_text ENDS +; /* Add b to a into r. (r = a + b) +; * +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_2048_add_32 PROC + ; Add + mov r9, QWORD PTR [rdx] + xor rax, rax + add r9, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r9 + adc r10, QWORD PTR [r8+8] + mov r9, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r10 + adc r9, QWORD PTR [r8+16] + mov r10, QWORD PTR [rdx+24] + mov QWORD PTR [rcx+16], r9 + adc r10, QWORD PTR [r8+24] + mov r9, QWORD PTR [rdx+32] + mov QWORD PTR [rcx+24], r10 + adc r9, QWORD PTR [r8+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r9 + adc r10, QWORD PTR [r8+40] + mov r9, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r10 + adc r9, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r9 + adc r10, QWORD PTR [r8+56] + mov r9, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r10 + adc r9, QWORD PTR [r8+64] + mov r10, QWORD PTR [rdx+72] + mov QWORD PTR [rcx+64], r9 + adc r10, QWORD PTR [r8+72] + mov r9, QWORD PTR [rdx+80] + mov QWORD PTR [rcx+72], r10 + adc r9, QWORD PTR [r8+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r9 + adc r10, QWORD PTR [r8+88] + mov r9, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r10 + adc r9, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r9 + adc r10, QWORD PTR [r8+104] + mov r9, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r10 + adc r9, QWORD PTR [r8+112] + mov r10, QWORD PTR [rdx+120] + mov QWORD PTR [rcx+112], r9 + adc r10, QWORD PTR [r8+120] + mov r9, QWORD PTR [rdx+128] + mov QWORD PTR [rcx+120], r10 + adc r9, QWORD PTR [r8+128] + mov r10, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r9 + adc r10, QWORD PTR [r8+136] + mov r9, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r10 + adc r9, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r9 + adc r10, QWORD PTR [r8+152] + mov r9, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r10 + adc r9, QWORD PTR [r8+160] + mov r10, QWORD PTR [rdx+168] + mov QWORD PTR [rcx+160], r9 + adc r10, QWORD PTR [r8+168] + mov r9, QWORD PTR [rdx+176] + mov QWORD PTR [rcx+168], r10 + adc r9, QWORD PTR [r8+176] + mov r10, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r9 + adc r10, QWORD PTR [r8+184] + mov r9, QWORD PTR [rdx+192] + mov QWORD PTR [rcx+184], r10 + adc r9, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+200] + mov QWORD PTR [rcx+192], r9 + adc r10, QWORD PTR 
[r8+200] + mov r9, QWORD PTR [rdx+208] + mov QWORD PTR [rcx+200], r10 + adc r9, QWORD PTR [r8+208] + mov r10, QWORD PTR [rdx+216] + mov QWORD PTR [rcx+208], r9 + adc r10, QWORD PTR [r8+216] + mov r9, QWORD PTR [rdx+224] + mov QWORD PTR [rcx+216], r10 + adc r9, QWORD PTR [r8+224] + mov r10, QWORD PTR [rdx+232] + mov QWORD PTR [rcx+224], r9 + adc r10, QWORD PTR [r8+232] + mov r9, QWORD PTR [rdx+240] + mov QWORD PTR [rcx+232], r10 + adc r9, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+240], r9 + adc r10, QWORD PTR [r8+248] + mov QWORD PTR [rcx+248], r10 + adc rax, 0 + ret +sp_2048_add_32 ENDP +_text ENDS +; /* Multiply a and b into r. (r = a * b) +; * +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_2048_mul_32 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + sub rsp, 808 + mov QWORD PTR [rsp+768], rcx + mov QWORD PTR [rsp+776], rdx + mov QWORD PTR [rsp+784], r8 + lea r12, QWORD PTR [rsp+512] + lea r14, QWORD PTR [rdx+128] + ; Add + mov rax, QWORD PTR [rdx] + xor r15, r15 + add rax, QWORD PTR [r14] + mov r9, QWORD PTR [rdx+8] + mov QWORD PTR [r12], rax + adc r9, QWORD PTR [r14+8] + mov r10, QWORD PTR [rdx+16] + mov QWORD PTR [r12+8], r9 + adc r10, QWORD PTR [r14+16] + mov rax, QWORD PTR [rdx+24] + mov QWORD PTR [r12+16], r10 + adc rax, QWORD PTR [r14+24] + mov r9, QWORD PTR [rdx+32] + mov QWORD PTR [r12+24], rax + adc r9, QWORD PTR [r14+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [r12+32], r9 + adc r10, QWORD PTR [r14+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r12+40], r10 + adc rax, QWORD PTR [r14+48] + mov r9, QWORD PTR [rdx+56] + mov QWORD PTR [r12+48], rax + adc r9, QWORD PTR [r14+56] + mov r10, QWORD PTR [rdx+64] + mov QWORD PTR [r12+56], r9 + adc r10, QWORD PTR [r14+64] + mov rax, QWORD PTR [rdx+72] + mov QWORD PTR [r12+64], r10 + adc rax, QWORD PTR [r14+72] + mov r9, QWORD PTR [rdx+80] + mov QWORD PTR [r12+72], rax + adc r9, QWORD PTR [r14+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [r12+80], r9 + adc r10, QWORD PTR [r14+88] + mov rax, QWORD PTR [rdx+96] + mov QWORD PTR [r12+88], r10 + adc rax, QWORD PTR [r14+96] + mov r9, QWORD PTR [rdx+104] + mov QWORD PTR [r12+96], rax + adc r9, QWORD PTR [r14+104] + mov r10, QWORD PTR [rdx+112] + mov QWORD PTR [r12+104], r9 + adc r10, QWORD PTR [r14+112] + mov rax, QWORD PTR [rdx+120] + mov QWORD PTR [r12+112], r10 + adc rax, QWORD PTR [r14+120] + mov QWORD PTR [r12+120], rax + adc r15, 0 + mov QWORD PTR [rsp+792], r15 + lea r13, QWORD PTR [rsp+640] + lea r14, QWORD PTR [r8+128] + ; Add + mov rax, QWORD PTR [r8] + xor rdi, rdi + add rax, QWORD PTR [r14] + mov r9, QWORD PTR [r8+8] + mov QWORD PTR [r13], rax + adc r9, QWORD PTR [r14+8] + mov r10, QWORD PTR [r8+16] + mov QWORD PTR [r13+8], r9 + adc r10, QWORD PTR [r14+16] + mov rax, QWORD PTR [r8+24] + mov QWORD PTR [r13+16], r10 + adc rax, QWORD PTR [r14+24] + mov r9, QWORD PTR [r8+32] + mov QWORD PTR [r13+24], rax + adc r9, QWORD PTR [r14+32] + mov r10, QWORD PTR [r8+40] + mov QWORD PTR [r13+32], r9 + adc r10, QWORD PTR [r14+40] + mov rax, QWORD PTR [r8+48] + mov QWORD PTR [r13+40], r10 + adc rax, QWORD PTR [r14+48] + mov r9, QWORD PTR [r8+56] + mov QWORD PTR [r13+48], rax + adc r9, QWORD PTR [r14+56] + mov r10, QWORD PTR [r8+64] + mov QWORD PTR [r13+56], r9 + adc r10, QWORD PTR [r14+64] + mov rax, QWORD PTR [r8+72] + mov QWORD PTR [r13+64], r10 + adc rax, QWORD PTR [r14+72] + mov r9, QWORD PTR [r8+80] + mov QWORD PTR [r13+72], rax + adc 
r9, QWORD PTR [r14+80] + mov r10, QWORD PTR [r8+88] + mov QWORD PTR [r13+80], r9 + adc r10, QWORD PTR [r14+88] + mov rax, QWORD PTR [r8+96] + mov QWORD PTR [r13+88], r10 + adc rax, QWORD PTR [r14+96] + mov r9, QWORD PTR [r8+104] + mov QWORD PTR [r13+96], rax + adc r9, QWORD PTR [r14+104] + mov r10, QWORD PTR [r8+112] + mov QWORD PTR [r13+104], r9 + adc r10, QWORD PTR [r14+112] + mov rax, QWORD PTR [r8+120] + mov QWORD PTR [r13+112], r10 + adc rax, QWORD PTR [r14+120] + mov QWORD PTR [r13+120], rax + adc rdi, 0 + mov QWORD PTR [rsp+800], rdi + mov r8, r13 + mov rdx, r12 + mov rcx, rsp + call sp_2048_mul_16 + mov r8, QWORD PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + lea rcx, QWORD PTR [rsp+256] + add r8, 128 + add rdx, 128 + call sp_2048_mul_16 + mov r8, QWORD PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + mov rcx, QWORD PTR [rsp+768] + call sp_2048_mul_16 +IFDEF _WIN64 + mov r8, QWORD PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + mov rcx, QWORD PTR [rsp+768] +ENDIF + mov r15, QWORD PTR [rsp+792] + mov rdi, QWORD PTR [rsp+800] + mov rsi, QWORD PTR [rsp+768] + mov r11, r15 + lea r12, QWORD PTR [rsp+512] + lea r13, QWORD PTR [rsp+640] + and r11, rdi + neg r15 + neg rdi + add rsi, 256 + mov rax, QWORD PTR [r12] + mov r9, QWORD PTR [r13] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12], rax + mov QWORD PTR [r13], r9 + mov rax, QWORD PTR [r12+8] + mov r9, QWORD PTR [r13+8] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+8], rax + mov QWORD PTR [r13+8], r9 + mov rax, QWORD PTR [r12+16] + mov r9, QWORD PTR [r13+16] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+16], rax + mov QWORD PTR [r13+16], r9 + mov rax, QWORD PTR [r12+24] + mov r9, QWORD PTR [r13+24] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+24], rax + mov QWORD PTR [r13+24], r9 + mov rax, QWORD PTR [r12+32] + mov r9, QWORD PTR [r13+32] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+32], rax + mov QWORD PTR [r13+32], r9 + mov rax, QWORD PTR [r12+40] + mov r9, QWORD PTR [r13+40] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+40], rax + mov QWORD PTR [r13+40], r9 + mov rax, QWORD PTR [r12+48] + mov r9, QWORD PTR [r13+48] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+48], rax + mov QWORD PTR [r13+48], r9 + mov rax, QWORD PTR [r12+56] + mov r9, QWORD PTR [r13+56] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+56], rax + mov QWORD PTR [r13+56], r9 + mov rax, QWORD PTR [r12+64] + mov r9, QWORD PTR [r13+64] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+64], rax + mov QWORD PTR [r13+64], r9 + mov rax, QWORD PTR [r12+72] + mov r9, QWORD PTR [r13+72] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+72], rax + mov QWORD PTR [r13+72], r9 + mov rax, QWORD PTR [r12+80] + mov r9, QWORD PTR [r13+80] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+80], rax + mov QWORD PTR [r13+80], r9 + mov rax, QWORD PTR [r12+88] + mov r9, QWORD PTR [r13+88] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+88], rax + mov QWORD PTR [r13+88], r9 + mov rax, QWORD PTR [r12+96] + mov r9, QWORD PTR [r13+96] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+96], rax + mov QWORD PTR [r13+96], r9 + mov rax, QWORD PTR [r12+104] + mov r9, QWORD PTR [r13+104] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+104], rax + mov QWORD PTR [r13+104], r9 + mov rax, QWORD PTR [r12+112] + mov r9, QWORD PTR [r13+112] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+112], rax + mov QWORD PTR [r13+112], r9 + mov rax, QWORD PTR [r12+120] + mov r9, QWORD PTR [r13+120] + and rax, rdi + and r9, r15 + mov QWORD PTR [r12+120], rax + mov QWORD PTR [r13+120], r9 + 
mov rax, QWORD PTR [r12] + add rax, QWORD PTR [r13] + mov r9, QWORD PTR [r12+8] + mov QWORD PTR [rsi], rax + adc r9, QWORD PTR [r13+8] + mov r10, QWORD PTR [r12+16] + mov QWORD PTR [rsi+8], r9 + adc r10, QWORD PTR [r13+16] + mov rax, QWORD PTR [r12+24] + mov QWORD PTR [rsi+16], r10 + adc rax, QWORD PTR [r13+24] + mov r9, QWORD PTR [r12+32] + mov QWORD PTR [rsi+24], rax + adc r9, QWORD PTR [r13+32] + mov r10, QWORD PTR [r12+40] + mov QWORD PTR [rsi+32], r9 + adc r10, QWORD PTR [r13+40] + mov rax, QWORD PTR [r12+48] + mov QWORD PTR [rsi+40], r10 + adc rax, QWORD PTR [r13+48] + mov r9, QWORD PTR [r12+56] + mov QWORD PTR [rsi+48], rax + adc r9, QWORD PTR [r13+56] + mov r10, QWORD PTR [r12+64] + mov QWORD PTR [rsi+56], r9 + adc r10, QWORD PTR [r13+64] + mov rax, QWORD PTR [r12+72] + mov QWORD PTR [rsi+64], r10 + adc rax, QWORD PTR [r13+72] + mov r9, QWORD PTR [r12+80] + mov QWORD PTR [rsi+72], rax + adc r9, QWORD PTR [r13+80] + mov r10, QWORD PTR [r12+88] + mov QWORD PTR [rsi+80], r9 + adc r10, QWORD PTR [r13+88] + mov rax, QWORD PTR [r12+96] + mov QWORD PTR [rsi+88], r10 + adc rax, QWORD PTR [r13+96] + mov r9, QWORD PTR [r12+104] + mov QWORD PTR [rsi+96], rax + adc r9, QWORD PTR [r13+104] + mov r10, QWORD PTR [r12+112] + mov QWORD PTR [rsi+104], r9 + adc r10, QWORD PTR [r13+112] + mov rax, QWORD PTR [r12+120] + mov QWORD PTR [rsi+112], r10 + adc rax, QWORD PTR [r13+120] + mov QWORD PTR [rsi+120], rax + adc r11, 0 + lea r13, QWORD PTR [rsp+256] + mov r12, rsp + mov rax, QWORD PTR [r12] + sub rax, QWORD PTR [r13] + mov r9, QWORD PTR [r12+8] + mov QWORD PTR [r12], rax + sbb r9, QWORD PTR [r13+8] + mov r10, QWORD PTR [r12+16] + mov QWORD PTR [r12+8], r9 + sbb r10, QWORD PTR [r13+16] + mov rax, QWORD PTR [r12+24] + mov QWORD PTR [r12+16], r10 + sbb rax, QWORD PTR [r13+24] + mov r9, QWORD PTR [r12+32] + mov QWORD PTR [r12+24], rax + sbb r9, QWORD PTR [r13+32] + mov r10, QWORD PTR [r12+40] + mov QWORD PTR [r12+32], r9 + sbb r10, QWORD PTR [r13+40] + mov rax, QWORD PTR [r12+48] + mov QWORD PTR [r12+40], r10 + sbb rax, QWORD PTR [r13+48] + mov r9, QWORD PTR [r12+56] + mov QWORD PTR [r12+48], rax + sbb r9, QWORD PTR [r13+56] + mov r10, QWORD PTR [r12+64] + mov QWORD PTR [r12+56], r9 + sbb r10, QWORD PTR [r13+64] + mov rax, QWORD PTR [r12+72] + mov QWORD PTR [r12+64], r10 + sbb rax, QWORD PTR [r13+72] + mov r9, QWORD PTR [r12+80] + mov QWORD PTR [r12+72], rax + sbb r9, QWORD PTR [r13+80] + mov r10, QWORD PTR [r12+88] + mov QWORD PTR [r12+80], r9 + sbb r10, QWORD PTR [r13+88] + mov rax, QWORD PTR [r12+96] + mov QWORD PTR [r12+88], r10 + sbb rax, QWORD PTR [r13+96] + mov r9, QWORD PTR [r12+104] + mov QWORD PTR [r12+96], rax + sbb r9, QWORD PTR [r13+104] + mov r10, QWORD PTR [r12+112] + mov QWORD PTR [r12+104], r9 + sbb r10, QWORD PTR [r13+112] + mov rax, QWORD PTR [r12+120] + mov QWORD PTR [r12+112], r10 + sbb rax, QWORD PTR [r13+120] + mov r9, QWORD PTR [r12+128] + mov QWORD PTR [r12+120], rax + sbb r9, QWORD PTR [r13+128] + mov r10, QWORD PTR [r12+136] + mov QWORD PTR [r12+128], r9 + sbb r10, QWORD PTR [r13+136] + mov rax, QWORD PTR [r12+144] + mov QWORD PTR [r12+136], r10 + sbb rax, QWORD PTR [r13+144] + mov r9, QWORD PTR [r12+152] + mov QWORD PTR [r12+144], rax + sbb r9, QWORD PTR [r13+152] + mov r10, QWORD PTR [r12+160] + mov QWORD PTR [r12+152], r9 + sbb r10, QWORD PTR [r13+160] + mov rax, QWORD PTR [r12+168] + mov QWORD PTR [r12+160], r10 + sbb rax, QWORD PTR [r13+168] + mov r9, QWORD PTR [r12+176] + mov QWORD PTR [r12+168], rax + sbb r9, QWORD PTR [r13+176] + mov r10, QWORD PTR [r12+184] + mov 
QWORD PTR [r12+176], r9 + sbb r10, QWORD PTR [r13+184] + mov rax, QWORD PTR [r12+192] + mov QWORD PTR [r12+184], r10 + sbb rax, QWORD PTR [r13+192] + mov r9, QWORD PTR [r12+200] + mov QWORD PTR [r12+192], rax + sbb r9, QWORD PTR [r13+200] + mov r10, QWORD PTR [r12+208] + mov QWORD PTR [r12+200], r9 + sbb r10, QWORD PTR [r13+208] + mov rax, QWORD PTR [r12+216] + mov QWORD PTR [r12+208], r10 + sbb rax, QWORD PTR [r13+216] + mov r9, QWORD PTR [r12+224] + mov QWORD PTR [r12+216], rax + sbb r9, QWORD PTR [r13+224] + mov r10, QWORD PTR [r12+232] + mov QWORD PTR [r12+224], r9 + sbb r10, QWORD PTR [r13+232] + mov rax, QWORD PTR [r12+240] + mov QWORD PTR [r12+232], r10 + sbb rax, QWORD PTR [r13+240] + mov r9, QWORD PTR [r12+248] + mov QWORD PTR [r12+240], rax + sbb r9, QWORD PTR [r13+248] + mov QWORD PTR [r12+248], r9 + sbb r11, 0 + mov rax, QWORD PTR [r12] + sub rax, QWORD PTR [rcx] + mov r9, QWORD PTR [r12+8] + mov QWORD PTR [r12], rax + sbb r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [r12+16] + mov QWORD PTR [r12+8], r9 + sbb r10, QWORD PTR [rcx+16] + mov rax, QWORD PTR [r12+24] + mov QWORD PTR [r12+16], r10 + sbb rax, QWORD PTR [rcx+24] + mov r9, QWORD PTR [r12+32] + mov QWORD PTR [r12+24], rax + sbb r9, QWORD PTR [rcx+32] + mov r10, QWORD PTR [r12+40] + mov QWORD PTR [r12+32], r9 + sbb r10, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r12+48] + mov QWORD PTR [r12+40], r10 + sbb rax, QWORD PTR [rcx+48] + mov r9, QWORD PTR [r12+56] + mov QWORD PTR [r12+48], rax + sbb r9, QWORD PTR [rcx+56] + mov r10, QWORD PTR [r12+64] + mov QWORD PTR [r12+56], r9 + sbb r10, QWORD PTR [rcx+64] + mov rax, QWORD PTR [r12+72] + mov QWORD PTR [r12+64], r10 + sbb rax, QWORD PTR [rcx+72] + mov r9, QWORD PTR [r12+80] + mov QWORD PTR [r12+72], rax + sbb r9, QWORD PTR [rcx+80] + mov r10, QWORD PTR [r12+88] + mov QWORD PTR [r12+80], r9 + sbb r10, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r12+96] + mov QWORD PTR [r12+88], r10 + sbb rax, QWORD PTR [rcx+96] + mov r9, QWORD PTR [r12+104] + mov QWORD PTR [r12+96], rax + sbb r9, QWORD PTR [rcx+104] + mov r10, QWORD PTR [r12+112] + mov QWORD PTR [r12+104], r9 + sbb r10, QWORD PTR [rcx+112] + mov rax, QWORD PTR [r12+120] + mov QWORD PTR [r12+112], r10 + sbb rax, QWORD PTR [rcx+120] + mov r9, QWORD PTR [r12+128] + mov QWORD PTR [r12+120], rax + sbb r9, QWORD PTR [rcx+128] + mov r10, QWORD PTR [r12+136] + mov QWORD PTR [r12+128], r9 + sbb r10, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r12+144] + mov QWORD PTR [r12+136], r10 + sbb rax, QWORD PTR [rcx+144] + mov r9, QWORD PTR [r12+152] + mov QWORD PTR [r12+144], rax + sbb r9, QWORD PTR [rcx+152] + mov r10, QWORD PTR [r12+160] + mov QWORD PTR [r12+152], r9 + sbb r10, QWORD PTR [rcx+160] + mov rax, QWORD PTR [r12+168] + mov QWORD PTR [r12+160], r10 + sbb rax, QWORD PTR [rcx+168] + mov r9, QWORD PTR [r12+176] + mov QWORD PTR [r12+168], rax + sbb r9, QWORD PTR [rcx+176] + mov r10, QWORD PTR [r12+184] + mov QWORD PTR [r12+176], r9 + sbb r10, QWORD PTR [rcx+184] + mov rax, QWORD PTR [r12+192] + mov QWORD PTR [r12+184], r10 + sbb rax, QWORD PTR [rcx+192] + mov r9, QWORD PTR [r12+200] + mov QWORD PTR [r12+192], rax + sbb r9, QWORD PTR [rcx+200] + mov r10, QWORD PTR [r12+208] + mov QWORD PTR [r12+200], r9 + sbb r10, QWORD PTR [rcx+208] + mov rax, QWORD PTR [r12+216] + mov QWORD PTR [r12+208], r10 + sbb rax, QWORD PTR [rcx+216] + mov r9, QWORD PTR [r12+224] + mov QWORD PTR [r12+216], rax + sbb r9, QWORD PTR [rcx+224] + mov r10, QWORD PTR [r12+232] + mov QWORD PTR [r12+224], r9 + sbb r10, QWORD PTR [rcx+232] + mov rax, QWORD PTR [r12+240] + mov QWORD 
PTR [r12+232], r10 + sbb rax, QWORD PTR [rcx+240] + mov r9, QWORD PTR [r12+248] + mov QWORD PTR [r12+240], rax + sbb r9, QWORD PTR [rcx+248] + mov QWORD PTR [r12+248], r9 + sbb r11, 0 + sub rsi, 128 + ; Add + mov rax, QWORD PTR [rsi] + add rax, QWORD PTR [r12] + mov r9, QWORD PTR [rsi+8] + mov QWORD PTR [rsi], rax + adc r9, QWORD PTR [r12+8] + mov r10, QWORD PTR [rsi+16] + mov QWORD PTR [rsi+8], r9 + adc r10, QWORD PTR [r12+16] + mov rax, QWORD PTR [rsi+24] + mov QWORD PTR [rsi+16], r10 + adc rax, QWORD PTR [r12+24] + mov r9, QWORD PTR [rsi+32] + mov QWORD PTR [rsi+24], rax + adc r9, QWORD PTR [r12+32] + mov r10, QWORD PTR [rsi+40] + mov QWORD PTR [rsi+32], r9 + adc r10, QWORD PTR [r12+40] + mov rax, QWORD PTR [rsi+48] + mov QWORD PTR [rsi+40], r10 + adc rax, QWORD PTR [r12+48] + mov r9, QWORD PTR [rsi+56] + mov QWORD PTR [rsi+48], rax + adc r9, QWORD PTR [r12+56] + mov r10, QWORD PTR [rsi+64] + mov QWORD PTR [rsi+56], r9 + adc r10, QWORD PTR [r12+64] + mov rax, QWORD PTR [rsi+72] + mov QWORD PTR [rsi+64], r10 + adc rax, QWORD PTR [r12+72] + mov r9, QWORD PTR [rsi+80] + mov QWORD PTR [rsi+72], rax + adc r9, QWORD PTR [r12+80] + mov r10, QWORD PTR [rsi+88] + mov QWORD PTR [rsi+80], r9 + adc r10, QWORD PTR [r12+88] + mov rax, QWORD PTR [rsi+96] + mov QWORD PTR [rsi+88], r10 + adc rax, QWORD PTR [r12+96] + mov r9, QWORD PTR [rsi+104] + mov QWORD PTR [rsi+96], rax + adc r9, QWORD PTR [r12+104] + mov r10, QWORD PTR [rsi+112] + mov QWORD PTR [rsi+104], r9 + adc r10, QWORD PTR [r12+112] + mov rax, QWORD PTR [rsi+120] + mov QWORD PTR [rsi+112], r10 + adc rax, QWORD PTR [r12+120] + mov r9, QWORD PTR [rsi+128] + mov QWORD PTR [rsi+120], rax + adc r9, QWORD PTR [r12+128] + mov r10, QWORD PTR [rsi+136] + mov QWORD PTR [rsi+128], r9 + adc r10, QWORD PTR [r12+136] + mov rax, QWORD PTR [rsi+144] + mov QWORD PTR [rsi+136], r10 + adc rax, QWORD PTR [r12+144] + mov r9, QWORD PTR [rsi+152] + mov QWORD PTR [rsi+144], rax + adc r9, QWORD PTR [r12+152] + mov r10, QWORD PTR [rsi+160] + mov QWORD PTR [rsi+152], r9 + adc r10, QWORD PTR [r12+160] + mov rax, QWORD PTR [rsi+168] + mov QWORD PTR [rsi+160], r10 + adc rax, QWORD PTR [r12+168] + mov r9, QWORD PTR [rsi+176] + mov QWORD PTR [rsi+168], rax + adc r9, QWORD PTR [r12+176] + mov r10, QWORD PTR [rsi+184] + mov QWORD PTR [rsi+176], r9 + adc r10, QWORD PTR [r12+184] + mov rax, QWORD PTR [rsi+192] + mov QWORD PTR [rsi+184], r10 + adc rax, QWORD PTR [r12+192] + mov r9, QWORD PTR [rsi+200] + mov QWORD PTR [rsi+192], rax + adc r9, QWORD PTR [r12+200] + mov r10, QWORD PTR [rsi+208] + mov QWORD PTR [rsi+200], r9 + adc r10, QWORD PTR [r12+208] + mov rax, QWORD PTR [rsi+216] + mov QWORD PTR [rsi+208], r10 + adc rax, QWORD PTR [r12+216] + mov r9, QWORD PTR [rsi+224] + mov QWORD PTR [rsi+216], rax + adc r9, QWORD PTR [r12+224] + mov r10, QWORD PTR [rsi+232] + mov QWORD PTR [rsi+224], r9 + adc r10, QWORD PTR [r12+232] + mov rax, QWORD PTR [rsi+240] + mov QWORD PTR [rsi+232], r10 + adc rax, QWORD PTR [r12+240] + mov r9, QWORD PTR [rsi+248] + mov QWORD PTR [rsi+240], rax + adc r9, QWORD PTR [r12+248] + mov QWORD PTR [rsi+248], r9 + adc r11, 0 + mov QWORD PTR [rcx+384], r11 + add rsi, 128 + ; Add + mov rax, QWORD PTR [rsi] + xor r11, r11 + add rax, QWORD PTR [r13] + mov r9, QWORD PTR [rsi+8] + mov QWORD PTR [rsi], rax + adc r9, QWORD PTR [r13+8] + mov r10, QWORD PTR [rsi+16] + mov QWORD PTR [rsi+8], r9 + adc r10, QWORD PTR [r13+16] + mov rax, QWORD PTR [rsi+24] + mov QWORD PTR [rsi+16], r10 + adc rax, QWORD PTR [r13+24] + mov r9, QWORD PTR [rsi+32] + mov QWORD PTR [rsi+24], rax 
+ adc r9, QWORD PTR [r13+32] + mov r10, QWORD PTR [rsi+40] + mov QWORD PTR [rsi+32], r9 + adc r10, QWORD PTR [r13+40] + mov rax, QWORD PTR [rsi+48] + mov QWORD PTR [rsi+40], r10 + adc rax, QWORD PTR [r13+48] + mov r9, QWORD PTR [rsi+56] + mov QWORD PTR [rsi+48], rax + adc r9, QWORD PTR [r13+56] + mov r10, QWORD PTR [rsi+64] + mov QWORD PTR [rsi+56], r9 + adc r10, QWORD PTR [r13+64] + mov rax, QWORD PTR [rsi+72] + mov QWORD PTR [rsi+64], r10 + adc rax, QWORD PTR [r13+72] + mov r9, QWORD PTR [rsi+80] + mov QWORD PTR [rsi+72], rax + adc r9, QWORD PTR [r13+80] + mov r10, QWORD PTR [rsi+88] + mov QWORD PTR [rsi+80], r9 + adc r10, QWORD PTR [r13+88] + mov rax, QWORD PTR [rsi+96] + mov QWORD PTR [rsi+88], r10 + adc rax, QWORD PTR [r13+96] + mov r9, QWORD PTR [rsi+104] + mov QWORD PTR [rsi+96], rax + adc r9, QWORD PTR [r13+104] + mov r10, QWORD PTR [rsi+112] + mov QWORD PTR [rsi+104], r9 + adc r10, QWORD PTR [r13+112] + mov rax, QWORD PTR [rsi+120] + mov QWORD PTR [rsi+112], r10 + adc rax, QWORD PTR [r13+120] + mov r9, QWORD PTR [rsi+128] + mov QWORD PTR [rsi+120], rax + adc r9, QWORD PTR [r13+128] + mov QWORD PTR [rsi+128], r9 + adc r11, 0 + ; Add to zero + mov rax, QWORD PTR [r13+136] + adc rax, 0 + mov r9, QWORD PTR [r13+144] + mov QWORD PTR [rsi+136], rax + adc r9, 0 + mov r10, QWORD PTR [r13+152] + mov QWORD PTR [rsi+144], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+160] + mov QWORD PTR [rsi+152], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+168] + mov QWORD PTR [rsi+160], rax + adc r9, 0 + mov r10, QWORD PTR [r13+176] + mov QWORD PTR [rsi+168], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+184] + mov QWORD PTR [rsi+176], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+192] + mov QWORD PTR [rsi+184], rax + adc r9, 0 + mov r10, QWORD PTR [r13+200] + mov QWORD PTR [rsi+192], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+208] + mov QWORD PTR [rsi+200], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+216] + mov QWORD PTR [rsi+208], rax + adc r9, 0 + mov r10, QWORD PTR [r13+224] + mov QWORD PTR [rsi+216], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+232] + mov QWORD PTR [rsi+224], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+240] + mov QWORD PTR [rsi+232], rax + adc r9, 0 + mov r10, QWORD PTR [r13+248] + mov QWORD PTR [rsi+240], r9 + adc r10, 0 + mov QWORD PTR [rsi+248], r10 + add rsp, 808 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_2048_mul_32 ENDP +_text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Multiply a and b into r. (r = a * b) +; * +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. 
+; */ +_text SEGMENT READONLY PARA +sp_2048_mul_avx2_32 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + sub rsp, 808 + mov QWORD PTR [rsp+768], rcx + mov QWORD PTR [rsp+776], rdx + mov QWORD PTR [rsp+784], r8 + lea r12, QWORD PTR [rsp+512] + lea r14, QWORD PTR [rdx+128] + ; Add + mov rax, QWORD PTR [rdx] + xor r15, r15 + add rax, QWORD PTR [r14] + mov r9, QWORD PTR [rdx+8] + mov QWORD PTR [r12], rax + adc r9, QWORD PTR [r14+8] + mov r10, QWORD PTR [rdx+16] + mov QWORD PTR [r12+8], r9 + adc r10, QWORD PTR [r14+16] + mov rax, QWORD PTR [rdx+24] + mov QWORD PTR [r12+16], r10 + adc rax, QWORD PTR [r14+24] + mov r9, QWORD PTR [rdx+32] + mov QWORD PTR [r12+24], rax + adc r9, QWORD PTR [r14+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [r12+32], r9 + adc r10, QWORD PTR [r14+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r12+40], r10 + adc rax, QWORD PTR [r14+48] + mov r9, QWORD PTR [rdx+56] + mov QWORD PTR [r12+48], rax + adc r9, QWORD PTR [r14+56] + mov r10, QWORD PTR [rdx+64] + mov QWORD PTR [r12+56], r9 + adc r10, QWORD PTR [r14+64] + mov rax, QWORD PTR [rdx+72] + mov QWORD PTR [r12+64], r10 + adc rax, QWORD PTR [r14+72] + mov r9, QWORD PTR [rdx+80] + mov QWORD PTR [r12+72], rax + adc r9, QWORD PTR [r14+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [r12+80], r9 + adc r10, QWORD PTR [r14+88] + mov rax, QWORD PTR [rdx+96] + mov QWORD PTR [r12+88], r10 + adc rax, QWORD PTR [r14+96] + mov r9, QWORD PTR [rdx+104] + mov QWORD PTR [r12+96], rax + adc r9, QWORD PTR [r14+104] + mov r10, QWORD PTR [rdx+112] + mov QWORD PTR [r12+104], r9 + adc r10, QWORD PTR [r14+112] + mov rax, QWORD PTR [rdx+120] + mov QWORD PTR [r12+112], r10 + adc rax, QWORD PTR [r14+120] + mov QWORD PTR [r12+120], rax + adc r15, 0 + mov QWORD PTR [rsp+792], r15 + lea r13, QWORD PTR [rsp+640] + lea r14, QWORD PTR [r8+128] + ; Add + mov rax, QWORD PTR [r8] + xor rdi, rdi + add rax, QWORD PTR [r14] + mov r9, QWORD PTR [r8+8] + mov QWORD PTR [r13], rax + adc r9, QWORD PTR [r14+8] + mov r10, QWORD PTR [r8+16] + mov QWORD PTR [r13+8], r9 + adc r10, QWORD PTR [r14+16] + mov rax, QWORD PTR [r8+24] + mov QWORD PTR [r13+16], r10 + adc rax, QWORD PTR [r14+24] + mov r9, QWORD PTR [r8+32] + mov QWORD PTR [r13+24], rax + adc r9, QWORD PTR [r14+32] + mov r10, QWORD PTR [r8+40] + mov QWORD PTR [r13+32], r9 + adc r10, QWORD PTR [r14+40] + mov rax, QWORD PTR [r8+48] + mov QWORD PTR [r13+40], r10 + adc rax, QWORD PTR [r14+48] + mov r9, QWORD PTR [r8+56] + mov QWORD PTR [r13+48], rax + adc r9, QWORD PTR [r14+56] + mov r10, QWORD PTR [r8+64] + mov QWORD PTR [r13+56], r9 + adc r10, QWORD PTR [r14+64] + mov rax, QWORD PTR [r8+72] + mov QWORD PTR [r13+64], r10 + adc rax, QWORD PTR [r14+72] + mov r9, QWORD PTR [r8+80] + mov QWORD PTR [r13+72], rax + adc r9, QWORD PTR [r14+80] + mov r10, QWORD PTR [r8+88] + mov QWORD PTR [r13+80], r9 + adc r10, QWORD PTR [r14+88] + mov rax, QWORD PTR [r8+96] + mov QWORD PTR [r13+88], r10 + adc rax, QWORD PTR [r14+96] + mov r9, QWORD PTR [r8+104] + mov QWORD PTR [r13+96], rax + adc r9, QWORD PTR [r14+104] + mov r10, QWORD PTR [r8+112] + mov QWORD PTR [r13+104], r9 + adc r10, QWORD PTR [r14+112] + mov rax, QWORD PTR [r8+120] + mov QWORD PTR [r13+112], r10 + adc rax, QWORD PTR [r14+120] + mov QWORD PTR [r13+120], rax + adc rdi, 0 + mov QWORD PTR [rsp+800], rdi + mov r8, r13 + mov rdx, r12 + mov rcx, rsp + call sp_2048_mul_avx2_16 + mov r8, QWORD PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + lea rcx, QWORD PTR [rsp+256] + add r8, 128 + add rdx, 128 + call sp_2048_mul_avx2_16 + mov r8, QWORD 
PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + mov rcx, QWORD PTR [rsp+768] + call sp_2048_mul_avx2_16 +IFDEF _WIN64 + mov r8, QWORD PTR [rsp+784] + mov rdx, QWORD PTR [rsp+776] + mov rcx, QWORD PTR [rsp+768] +ENDIF + mov r15, QWORD PTR [rsp+792] + mov rdi, QWORD PTR [rsp+800] + mov rsi, QWORD PTR [rsp+768] + mov r11, r15 + lea r12, QWORD PTR [rsp+512] + lea r13, QWORD PTR [rsp+640] + and r11, rdi + neg r15 + neg rdi + add rsi, 256 + mov rax, QWORD PTR [r12] + mov r9, QWORD PTR [r13] + pext rax, rax, rdi + pext r9, r9, r15 + add rax, r9 + mov r9, QWORD PTR [r12+8] + mov r10, QWORD PTR [r13+8] + pext r9, r9, rdi + pext r10, r10, r15 + mov QWORD PTR [rsi], rax + adc r9, r10 + mov r10, QWORD PTR [r12+16] + mov rax, QWORD PTR [r13+16] + pext r10, r10, rdi + pext rax, rax, r15 + mov QWORD PTR [rsi+8], r9 + adc r10, rax + mov rax, QWORD PTR [r12+24] + mov r9, QWORD PTR [r13+24] + pext rax, rax, rdi + pext r9, r9, r15 + mov QWORD PTR [rsi+16], r10 + adc rax, r9 + mov r9, QWORD PTR [r12+32] + mov r10, QWORD PTR [r13+32] + pext r9, r9, rdi + pext r10, r10, r15 + mov QWORD PTR [rsi+24], rax + adc r9, r10 + mov r10, QWORD PTR [r12+40] + mov rax, QWORD PTR [r13+40] + pext r10, r10, rdi + pext rax, rax, r15 + mov QWORD PTR [rsi+32], r9 + adc r10, rax + mov rax, QWORD PTR [r12+48] + mov r9, QWORD PTR [r13+48] + pext rax, rax, rdi + pext r9, r9, r15 + mov QWORD PTR [rsi+40], r10 + adc rax, r9 + mov r9, QWORD PTR [r12+56] + mov r10, QWORD PTR [r13+56] + pext r9, r9, rdi + pext r10, r10, r15 + mov QWORD PTR [rsi+48], rax + adc r9, r10 + mov r10, QWORD PTR [r12+64] + mov rax, QWORD PTR [r13+64] + pext r10, r10, rdi + pext rax, rax, r15 + mov QWORD PTR [rsi+56], r9 + adc r10, rax + mov rax, QWORD PTR [r12+72] + mov r9, QWORD PTR [r13+72] + pext rax, rax, rdi + pext r9, r9, r15 + mov QWORD PTR [rsi+64], r10 + adc rax, r9 + mov r9, QWORD PTR [r12+80] + mov r10, QWORD PTR [r13+80] + pext r9, r9, rdi + pext r10, r10, r15 + mov QWORD PTR [rsi+72], rax + adc r9, r10 + mov r10, QWORD PTR [r12+88] + mov rax, QWORD PTR [r13+88] + pext r10, r10, rdi + pext rax, rax, r15 + mov QWORD PTR [rsi+80], r9 + adc r10, rax + mov rax, QWORD PTR [r12+96] + mov r9, QWORD PTR [r13+96] + pext rax, rax, rdi + pext r9, r9, r15 + mov QWORD PTR [rsi+88], r10 + adc rax, r9 + mov r9, QWORD PTR [r12+104] + mov r10, QWORD PTR [r13+104] + pext r9, r9, rdi + pext r10, r10, r15 + mov QWORD PTR [rsi+96], rax + adc r9, r10 + mov r10, QWORD PTR [r12+112] + mov rax, QWORD PTR [r13+112] + pext r10, r10, rdi + pext rax, rax, r15 + mov QWORD PTR [rsi+104], r9 + adc r10, rax + mov rax, QWORD PTR [r12+120] + mov r9, QWORD PTR [r13+120] + pext rax, rax, rdi + pext r9, r9, r15 + mov QWORD PTR [rsi+112], r10 + adc rax, r9 + mov QWORD PTR [rsi+120], rax + adc r11, 0 + lea r13, QWORD PTR [rsp+256] + mov r12, rsp + mov rax, QWORD PTR [r12] + sub rax, QWORD PTR [r13] + mov r9, QWORD PTR [r12+8] + mov QWORD PTR [r12], rax + sbb r9, QWORD PTR [r13+8] + mov r10, QWORD PTR [r12+16] + mov QWORD PTR [r12+8], r9 + sbb r10, QWORD PTR [r13+16] + mov rax, QWORD PTR [r12+24] + mov QWORD PTR [r12+16], r10 + sbb rax, QWORD PTR [r13+24] + mov r9, QWORD PTR [r12+32] + mov QWORD PTR [r12+24], rax + sbb r9, QWORD PTR [r13+32] + mov r10, QWORD PTR [r12+40] + mov QWORD PTR [r12+32], r9 + sbb r10, QWORD PTR [r13+40] + mov rax, QWORD PTR [r12+48] + mov QWORD PTR [r12+40], r10 + sbb rax, QWORD PTR [r13+48] + mov r9, QWORD PTR [r12+56] + mov QWORD PTR [r12+48], rax + sbb r9, QWORD PTR [r13+56] + mov r10, QWORD PTR [r12+64] + mov QWORD PTR [r12+56], r9 + sbb r10, QWORD PTR [r13+64] 
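+; Karatsuba middle term: r12 = rsp holds (a_lo + a_hi) * (b_lo + b_hi) from
+; the first sp_2048_mul_avx2_16 call and r13 = rsp + 256 holds a_hi * b_hi.
+; This sbb chain subtracts a_hi * b_hi; the next chain subtracts a_lo * b_lo
+; (already written to the result at rcx), leaving a_lo*b_hi + a_hi*b_lo at
+; rsp, with the borrows folded into r11 by a trailing sbb r11, 0 after each
+; pass.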
+ mov rax, QWORD PTR [r12+72] + mov QWORD PTR [r12+64], r10 + sbb rax, QWORD PTR [r13+72] + mov r9, QWORD PTR [r12+80] + mov QWORD PTR [r12+72], rax + sbb r9, QWORD PTR [r13+80] + mov r10, QWORD PTR [r12+88] + mov QWORD PTR [r12+80], r9 + sbb r10, QWORD PTR [r13+88] + mov rax, QWORD PTR [r12+96] + mov QWORD PTR [r12+88], r10 + sbb rax, QWORD PTR [r13+96] + mov r9, QWORD PTR [r12+104] + mov QWORD PTR [r12+96], rax + sbb r9, QWORD PTR [r13+104] + mov r10, QWORD PTR [r12+112] + mov QWORD PTR [r12+104], r9 + sbb r10, QWORD PTR [r13+112] + mov rax, QWORD PTR [r12+120] + mov QWORD PTR [r12+112], r10 + sbb rax, QWORD PTR [r13+120] + mov r9, QWORD PTR [r12+128] + mov QWORD PTR [r12+120], rax + sbb r9, QWORD PTR [r13+128] + mov r10, QWORD PTR [r12+136] + mov QWORD PTR [r12+128], r9 + sbb r10, QWORD PTR [r13+136] + mov rax, QWORD PTR [r12+144] + mov QWORD PTR [r12+136], r10 + sbb rax, QWORD PTR [r13+144] + mov r9, QWORD PTR [r12+152] + mov QWORD PTR [r12+144], rax + sbb r9, QWORD PTR [r13+152] + mov r10, QWORD PTR [r12+160] + mov QWORD PTR [r12+152], r9 + sbb r10, QWORD PTR [r13+160] + mov rax, QWORD PTR [r12+168] + mov QWORD PTR [r12+160], r10 + sbb rax, QWORD PTR [r13+168] + mov r9, QWORD PTR [r12+176] + mov QWORD PTR [r12+168], rax + sbb r9, QWORD PTR [r13+176] + mov r10, QWORD PTR [r12+184] + mov QWORD PTR [r12+176], r9 + sbb r10, QWORD PTR [r13+184] + mov rax, QWORD PTR [r12+192] + mov QWORD PTR [r12+184], r10 + sbb rax, QWORD PTR [r13+192] + mov r9, QWORD PTR [r12+200] + mov QWORD PTR [r12+192], rax + sbb r9, QWORD PTR [r13+200] + mov r10, QWORD PTR [r12+208] + mov QWORD PTR [r12+200], r9 + sbb r10, QWORD PTR [r13+208] + mov rax, QWORD PTR [r12+216] + mov QWORD PTR [r12+208], r10 + sbb rax, QWORD PTR [r13+216] + mov r9, QWORD PTR [r12+224] + mov QWORD PTR [r12+216], rax + sbb r9, QWORD PTR [r13+224] + mov r10, QWORD PTR [r12+232] + mov QWORD PTR [r12+224], r9 + sbb r10, QWORD PTR [r13+232] + mov rax, QWORD PTR [r12+240] + mov QWORD PTR [r12+232], r10 + sbb rax, QWORD PTR [r13+240] + mov r9, QWORD PTR [r12+248] + mov QWORD PTR [r12+240], rax + sbb r9, QWORD PTR [r13+248] + mov QWORD PTR [r12+248], r9 + sbb r11, 0 + mov rax, QWORD PTR [r12] + sub rax, QWORD PTR [rcx] + mov r9, QWORD PTR [r12+8] + mov QWORD PTR [r12], rax + sbb r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [r12+16] + mov QWORD PTR [r12+8], r9 + sbb r10, QWORD PTR [rcx+16] + mov rax, QWORD PTR [r12+24] + mov QWORD PTR [r12+16], r10 + sbb rax, QWORD PTR [rcx+24] + mov r9, QWORD PTR [r12+32] + mov QWORD PTR [r12+24], rax + sbb r9, QWORD PTR [rcx+32] + mov r10, QWORD PTR [r12+40] + mov QWORD PTR [r12+32], r9 + sbb r10, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r12+48] + mov QWORD PTR [r12+40], r10 + sbb rax, QWORD PTR [rcx+48] + mov r9, QWORD PTR [r12+56] + mov QWORD PTR [r12+48], rax + sbb r9, QWORD PTR [rcx+56] + mov r10, QWORD PTR [r12+64] + mov QWORD PTR [r12+56], r9 + sbb r10, QWORD PTR [rcx+64] + mov rax, QWORD PTR [r12+72] + mov QWORD PTR [r12+64], r10 + sbb rax, QWORD PTR [rcx+72] + mov r9, QWORD PTR [r12+80] + mov QWORD PTR [r12+72], rax + sbb r9, QWORD PTR [rcx+80] + mov r10, QWORD PTR [r12+88] + mov QWORD PTR [r12+80], r9 + sbb r10, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r12+96] + mov QWORD PTR [r12+88], r10 + sbb rax, QWORD PTR [rcx+96] + mov r9, QWORD PTR [r12+104] + mov QWORD PTR [r12+96], rax + sbb r9, QWORD PTR [rcx+104] + mov r10, QWORD PTR [r12+112] + mov QWORD PTR [r12+104], r9 + sbb r10, QWORD PTR [rcx+112] + mov rax, QWORD PTR [r12+120] + mov QWORD PTR [r12+112], r10 + sbb rax, QWORD PTR [rcx+120] + mov r9, QWORD 
PTR [r12+128] + mov QWORD PTR [r12+120], rax + sbb r9, QWORD PTR [rcx+128] + mov r10, QWORD PTR [r12+136] + mov QWORD PTR [r12+128], r9 + sbb r10, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r12+144] + mov QWORD PTR [r12+136], r10 + sbb rax, QWORD PTR [rcx+144] + mov r9, QWORD PTR [r12+152] + mov QWORD PTR [r12+144], rax + sbb r9, QWORD PTR [rcx+152] + mov r10, QWORD PTR [r12+160] + mov QWORD PTR [r12+152], r9 + sbb r10, QWORD PTR [rcx+160] + mov rax, QWORD PTR [r12+168] + mov QWORD PTR [r12+160], r10 + sbb rax, QWORD PTR [rcx+168] + mov r9, QWORD PTR [r12+176] + mov QWORD PTR [r12+168], rax + sbb r9, QWORD PTR [rcx+176] + mov r10, QWORD PTR [r12+184] + mov QWORD PTR [r12+176], r9 + sbb r10, QWORD PTR [rcx+184] + mov rax, QWORD PTR [r12+192] + mov QWORD PTR [r12+184], r10 + sbb rax, QWORD PTR [rcx+192] + mov r9, QWORD PTR [r12+200] + mov QWORD PTR [r12+192], rax + sbb r9, QWORD PTR [rcx+200] + mov r10, QWORD PTR [r12+208] + mov QWORD PTR [r12+200], r9 + sbb r10, QWORD PTR [rcx+208] + mov rax, QWORD PTR [r12+216] + mov QWORD PTR [r12+208], r10 + sbb rax, QWORD PTR [rcx+216] + mov r9, QWORD PTR [r12+224] + mov QWORD PTR [r12+216], rax + sbb r9, QWORD PTR [rcx+224] + mov r10, QWORD PTR [r12+232] + mov QWORD PTR [r12+224], r9 + sbb r10, QWORD PTR [rcx+232] + mov rax, QWORD PTR [r12+240] + mov QWORD PTR [r12+232], r10 + sbb rax, QWORD PTR [rcx+240] + mov r9, QWORD PTR [r12+248] + mov QWORD PTR [r12+240], rax + sbb r9, QWORD PTR [rcx+248] + mov QWORD PTR [r12+248], r9 + sbb r11, 0 + sub rsi, 128 + ; Add + mov rax, QWORD PTR [rsi] + add rax, QWORD PTR [r12] + mov r9, QWORD PTR [rsi+8] + mov QWORD PTR [rsi], rax + adc r9, QWORD PTR [r12+8] + mov r10, QWORD PTR [rsi+16] + mov QWORD PTR [rsi+8], r9 + adc r10, QWORD PTR [r12+16] + mov rax, QWORD PTR [rsi+24] + mov QWORD PTR [rsi+16], r10 + adc rax, QWORD PTR [r12+24] + mov r9, QWORD PTR [rsi+32] + mov QWORD PTR [rsi+24], rax + adc r9, QWORD PTR [r12+32] + mov r10, QWORD PTR [rsi+40] + mov QWORD PTR [rsi+32], r9 + adc r10, QWORD PTR [r12+40] + mov rax, QWORD PTR [rsi+48] + mov QWORD PTR [rsi+40], r10 + adc rax, QWORD PTR [r12+48] + mov r9, QWORD PTR [rsi+56] + mov QWORD PTR [rsi+48], rax + adc r9, QWORD PTR [r12+56] + mov r10, QWORD PTR [rsi+64] + mov QWORD PTR [rsi+56], r9 + adc r10, QWORD PTR [r12+64] + mov rax, QWORD PTR [rsi+72] + mov QWORD PTR [rsi+64], r10 + adc rax, QWORD PTR [r12+72] + mov r9, QWORD PTR [rsi+80] + mov QWORD PTR [rsi+72], rax + adc r9, QWORD PTR [r12+80] + mov r10, QWORD PTR [rsi+88] + mov QWORD PTR [rsi+80], r9 + adc r10, QWORD PTR [r12+88] + mov rax, QWORD PTR [rsi+96] + mov QWORD PTR [rsi+88], r10 + adc rax, QWORD PTR [r12+96] + mov r9, QWORD PTR [rsi+104] + mov QWORD PTR [rsi+96], rax + adc r9, QWORD PTR [r12+104] + mov r10, QWORD PTR [rsi+112] + mov QWORD PTR [rsi+104], r9 + adc r10, QWORD PTR [r12+112] + mov rax, QWORD PTR [rsi+120] + mov QWORD PTR [rsi+112], r10 + adc rax, QWORD PTR [r12+120] + mov r9, QWORD PTR [rsi+128] + mov QWORD PTR [rsi+120], rax + adc r9, QWORD PTR [r12+128] + mov r10, QWORD PTR [rsi+136] + mov QWORD PTR [rsi+128], r9 + adc r10, QWORD PTR [r12+136] + mov rax, QWORD PTR [rsi+144] + mov QWORD PTR [rsi+136], r10 + adc rax, QWORD PTR [r12+144] + mov r9, QWORD PTR [rsi+152] + mov QWORD PTR [rsi+144], rax + adc r9, QWORD PTR [r12+152] + mov r10, QWORD PTR [rsi+160] + mov QWORD PTR [rsi+152], r9 + adc r10, QWORD PTR [r12+160] + mov rax, QWORD PTR [rsi+168] + mov QWORD PTR [rsi+160], r10 + adc rax, QWORD PTR [r12+168] + mov r9, QWORD PTR [rsi+176] + mov QWORD PTR [rsi+168], rax + adc r9, QWORD PTR [r12+176] 
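+; Recombination of the middle term: rsi was stepped back to r + 128, so this
+; adc chain adds the 32-word middle term held at r12 = rsp into the result
+; shifted up by 1024 bits (one half-length). The running carry collects in
+; r11 and is written to r[48] (rcx + 384) once the chain completes.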
+ mov r10, QWORD PTR [rsi+184] + mov QWORD PTR [rsi+176], r9 + adc r10, QWORD PTR [r12+184] + mov rax, QWORD PTR [rsi+192] + mov QWORD PTR [rsi+184], r10 + adc rax, QWORD PTR [r12+192] + mov r9, QWORD PTR [rsi+200] + mov QWORD PTR [rsi+192], rax + adc r9, QWORD PTR [r12+200] + mov r10, QWORD PTR [rsi+208] + mov QWORD PTR [rsi+200], r9 + adc r10, QWORD PTR [r12+208] + mov rax, QWORD PTR [rsi+216] + mov QWORD PTR [rsi+208], r10 + adc rax, QWORD PTR [r12+216] + mov r9, QWORD PTR [rsi+224] + mov QWORD PTR [rsi+216], rax + adc r9, QWORD PTR [r12+224] + mov r10, QWORD PTR [rsi+232] + mov QWORD PTR [rsi+224], r9 + adc r10, QWORD PTR [r12+232] + mov rax, QWORD PTR [rsi+240] + mov QWORD PTR [rsi+232], r10 + adc rax, QWORD PTR [r12+240] + mov r9, QWORD PTR [rsi+248] + mov QWORD PTR [rsi+240], rax + adc r9, QWORD PTR [r12+248] + mov QWORD PTR [rsi+248], r9 + adc r11, 0 + mov QWORD PTR [rcx+384], r11 + add rsi, 128 + ; Add + mov rax, QWORD PTR [rsi] + xor r11, r11 + add rax, QWORD PTR [r13] + mov r9, QWORD PTR [rsi+8] + mov QWORD PTR [rsi], rax + adc r9, QWORD PTR [r13+8] + mov r10, QWORD PTR [rsi+16] + mov QWORD PTR [rsi+8], r9 + adc r10, QWORD PTR [r13+16] + mov rax, QWORD PTR [rsi+24] + mov QWORD PTR [rsi+16], r10 + adc rax, QWORD PTR [r13+24] + mov r9, QWORD PTR [rsi+32] + mov QWORD PTR [rsi+24], rax + adc r9, QWORD PTR [r13+32] + mov r10, QWORD PTR [rsi+40] + mov QWORD PTR [rsi+32], r9 + adc r10, QWORD PTR [r13+40] + mov rax, QWORD PTR [rsi+48] + mov QWORD PTR [rsi+40], r10 + adc rax, QWORD PTR [r13+48] + mov r9, QWORD PTR [rsi+56] + mov QWORD PTR [rsi+48], rax + adc r9, QWORD PTR [r13+56] + mov r10, QWORD PTR [rsi+64] + mov QWORD PTR [rsi+56], r9 + adc r10, QWORD PTR [r13+64] + mov rax, QWORD PTR [rsi+72] + mov QWORD PTR [rsi+64], r10 + adc rax, QWORD PTR [r13+72] + mov r9, QWORD PTR [rsi+80] + mov QWORD PTR [rsi+72], rax + adc r9, QWORD PTR [r13+80] + mov r10, QWORD PTR [rsi+88] + mov QWORD PTR [rsi+80], r9 + adc r10, QWORD PTR [r13+88] + mov rax, QWORD PTR [rsi+96] + mov QWORD PTR [rsi+88], r10 + adc rax, QWORD PTR [r13+96] + mov r9, QWORD PTR [rsi+104] + mov QWORD PTR [rsi+96], rax + adc r9, QWORD PTR [r13+104] + mov r10, QWORD PTR [rsi+112] + mov QWORD PTR [rsi+104], r9 + adc r10, QWORD PTR [r13+112] + mov rax, QWORD PTR [rsi+120] + mov QWORD PTR [rsi+112], r10 + adc rax, QWORD PTR [r13+120] + mov r9, QWORD PTR [rsi+128] + mov QWORD PTR [rsi+120], rax + adc r9, QWORD PTR [r13+128] + mov QWORD PTR [rsi+128], r9 + adc r11, 0 + ; Add to zero + mov rax, QWORD PTR [r13+136] + adc rax, 0 + mov r9, QWORD PTR [r13+144] + mov QWORD PTR [rsi+136], rax + adc r9, 0 + mov r10, QWORD PTR [r13+152] + mov QWORD PTR [rsi+144], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+160] + mov QWORD PTR [rsi+152], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+168] + mov QWORD PTR [rsi+160], rax + adc r9, 0 + mov r10, QWORD PTR [r13+176] + mov QWORD PTR [rsi+168], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+184] + mov QWORD PTR [rsi+176], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+192] + mov QWORD PTR [rsi+184], rax + adc r9, 0 + mov r10, QWORD PTR [r13+200] + mov QWORD PTR [rsi+192], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+208] + mov QWORD PTR [rsi+200], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+216] + mov QWORD PTR [rsi+208], rax + adc r9, 0 + mov r10, QWORD PTR [r13+224] + mov QWORD PTR [rsi+216], r9 + adc r10, 0 + mov rax, QWORD PTR [r13+232] + mov QWORD PTR [rsi+224], r10 + adc rax, 0 + mov r9, QWORD PTR [r13+240] + mov QWORD PTR [rsi+232], rax + adc r9, 0 + mov r10, QWORD PTR [r13+248] + mov QWORD PTR [rsi+240], r9 + adc 
r10, 0 + mov QWORD PTR [rsi+248], r10 + add rsp, 808 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_2048_mul_avx2_32 ENDP +_text ENDS +ENDIF +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_2048_sqr_16 PROC + push r12 + push r13 + push r14 + mov r8, rdx + sub rsp, 128 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + xor r11, r11 + mov QWORD PTR [rsp], rax + mov r10, rdx + ; A[0] * A[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD PTR [rsp+8], r10 + ; A[0] * A[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8] + xor r10, r10 + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r11, rax + adc r9, rdx + adc r10, 0 + mov QWORD PTR [rsp+16], r11 + ; A[0] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8] + xor r11, r11 + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + ; A[1] * A[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+8] + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + mov QWORD PTR [rsp+24], r9 + ; A[0] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8+8] + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD PTR [rsp+32], r10 + ; A[0] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+40], r11 + ; A[0] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+48], r9 + ; A[0] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rsp+56], r10 + ; A[0] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[7] + mov rax, QWORD 
PTR [r8+56] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[4] + mov rax, QWORD PTR [r8+32] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+64], r11 + ; A[0] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+72], r9 + ; A[0] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[5] + mov rax, QWORD PTR [r8+40] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rsp+80], r10 + ; A[0] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+88], r11 + ; A[0] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[6] + mov rax, QWORD PTR [r8+48] + mul rax + add 
r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+96], r9 + ; A[0] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rsp+104], r10 + ; A[0] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[7] + mov rax, QWORD PTR [r8+56] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+112], r11 + ; A[0] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+120], r9 + ; A[1] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[2] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR 
[r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[8] + mov rax, QWORD PTR [r8+64] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rcx+128], r10 + ; A[2] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+16] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[3] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rcx+136], r11 + ; A[3] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+24] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[4] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[9] * A[9] + mov rax, QWORD PTR [r8+72] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rcx+144], r9 + ; A[4] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+32] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[5] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[9] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+72] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rcx+152], r10 + ; A[5] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+40] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[6] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[13] + mov rax, QWORD PTR 
[r8+104] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[9] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+72] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[10] * A[10] + mov rax, QWORD PTR [r8+80] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rcx+160], r11 + ; A[6] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+48] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[7] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[9] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+72] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[10] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+80] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rcx+168], r9 + ; A[7] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+56] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[8] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[9] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+72] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[10] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+80] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[11] * A[11] + mov rax, QWORD PTR [r8+88] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rcx+176], r10 + ; A[8] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+64] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[9] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+72] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[10] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+80] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[11] * A[12] + mov rax, QWORD PTR [r8+96] + mul QWORD PTR [r8+88] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rcx+184], r11 + ; A[9] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+72] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[10] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+80] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[11] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+88] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[12] * A[12] + mov rax, QWORD PTR [r8+96] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rcx+192], r9 + ; A[10] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+80] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[11] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+88] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[12] * A[13] + mov rax, QWORD PTR [r8+104] + mul QWORD PTR [r8+96] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD 
PTR [rcx+200], r10 + ; A[11] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+88] + xor r10, r10 + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + ; A[12] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+96] + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + ; A[13] * A[13] + mov rax, QWORD PTR [r8+104] + mul rax + add r11, rax + adc r9, rdx + adc r10, 0 + mov QWORD PTR [rcx+208], r11 + ; A[12] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+96] + xor r11, r11 + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + ; A[13] * A[14] + mov rax, QWORD PTR [r8+112] + mul QWORD PTR [r8+104] + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + mov QWORD PTR [rcx+216], r9 + ; A[13] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+104] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[14] * A[14] + mov rax, QWORD PTR [r8+112] + mul rax + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD PTR [rcx+224], r10 + ; A[14] * A[15] + mov rax, QWORD PTR [r8+120] + mul QWORD PTR [r8+112] + xor r10, r10 + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + mov QWORD PTR [rcx+232], r11 + ; A[15] * A[15] + mov rax, QWORD PTR [r8+120] + mul rax + add r9, rax + adc r10, rdx + mov QWORD PTR [rcx+240], r9 + mov QWORD PTR [rcx+248], r10 + mov rax, QWORD PTR [rsp] + mov rdx, QWORD PTR [rsp+8] + mov r12, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp+24] + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], rdx + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+24], r13 + mov rax, QWORD PTR [rsp+32] + mov rdx, QWORD PTR [rsp+40] + mov r12, QWORD PTR [rsp+48] + mov r13, QWORD PTR [rsp+56] + mov QWORD PTR [rcx+32], rax + mov QWORD PTR [rcx+40], rdx + mov QWORD PTR [rcx+48], r12 + mov QWORD PTR [rcx+56], r13 + mov rax, QWORD PTR [rsp+64] + mov rdx, QWORD PTR [rsp+72] + mov r12, QWORD PTR [rsp+80] + mov r13, QWORD PTR [rsp+88] + mov QWORD PTR [rcx+64], rax + mov QWORD PTR [rcx+72], rdx + mov QWORD PTR [rcx+80], r12 + mov QWORD PTR [rcx+88], r13 + mov rax, QWORD PTR [rsp+96] + mov rdx, QWORD PTR [rsp+104] + mov r12, QWORD PTR [rsp+112] + mov r13, QWORD PTR [rsp+120] + mov QWORD PTR [rcx+96], rax + mov QWORD PTR [rcx+104], rdx + mov QWORD PTR [rcx+112], r12 + mov QWORD PTR [rcx+120], r13 + add rsp, 128 + pop r14 + pop r13 + pop r12 + ret +sp_2048_sqr_16 ENDP +_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * @@ -5849,993 +7497,6 @@ L_end_2048_sqr_avx2_16: sp_2048_sqr_avx2_16 ENDP _text ENDS ENDIF -; /* Add b to a into r. (r = a + b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. 
-; */ -_text SEGMENT READONLY PARA -sp_2048_add_16 PROC - ; Add - mov r9, QWORD PTR [rdx] - xor rax, rax - add r9, QWORD PTR [r8] - mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 - adc r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r10 - adc r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - adc r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - adc r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - adc r10, QWORD PTR [r8+40] - mov r9, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r10 - adc r9, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r9 - adc r10, QWORD PTR [r8+56] - mov r9, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r10 - adc r9, QWORD PTR [r8+64] - mov r10, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r9 - adc r10, QWORD PTR [r8+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r10 - adc r9, QWORD PTR [r8+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r9 - adc r10, QWORD PTR [r8+88] - mov r9, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r10 - adc r9, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r9 - adc r10, QWORD PTR [r8+104] - mov r9, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r10 - adc r9, QWORD PTR [r8+112] - mov r10, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r9 - adc r10, QWORD PTR [r8+120] - mov QWORD PTR [rcx+120], r10 - adc rax, 0 - ret -sp_2048_add_16 ENDP -_text ENDS -; /* Sub b from a into a. (a -= b) -; * -; * a A single precision integer and result. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_sub_in_place_32 PROC - mov r8, QWORD PTR [rcx] - xor rax, rax - sub r8, QWORD PTR [rdx] - mov r9, QWORD PTR [rcx+8] - mov QWORD PTR [rcx], r8 - sbb r9, QWORD PTR [rdx+8] - mov r8, QWORD PTR [rcx+16] - mov QWORD PTR [rcx+8], r9 - sbb r8, QWORD PTR [rdx+16] - mov r9, QWORD PTR [rcx+24] - mov QWORD PTR [rcx+16], r8 - sbb r9, QWORD PTR [rdx+24] - mov r8, QWORD PTR [rcx+32] - mov QWORD PTR [rcx+24], r9 - sbb r8, QWORD PTR [rdx+32] - mov r9, QWORD PTR [rcx+40] - mov QWORD PTR [rcx+32], r8 - sbb r9, QWORD PTR [rdx+40] - mov r8, QWORD PTR [rcx+48] - mov QWORD PTR [rcx+40], r9 - sbb r8, QWORD PTR [rdx+48] - mov r9, QWORD PTR [rcx+56] - mov QWORD PTR [rcx+48], r8 - sbb r9, QWORD PTR [rdx+56] - mov r8, QWORD PTR [rcx+64] - mov QWORD PTR [rcx+56], r9 - sbb r8, QWORD PTR [rdx+64] - mov r9, QWORD PTR [rcx+72] - mov QWORD PTR [rcx+64], r8 - sbb r9, QWORD PTR [rdx+72] - mov r8, QWORD PTR [rcx+80] - mov QWORD PTR [rcx+72], r9 - sbb r8, QWORD PTR [rdx+80] - mov r9, QWORD PTR [rcx+88] - mov QWORD PTR [rcx+80], r8 - sbb r9, QWORD PTR [rdx+88] - mov r8, QWORD PTR [rcx+96] - mov QWORD PTR [rcx+88], r9 - sbb r8, QWORD PTR [rdx+96] - mov r9, QWORD PTR [rcx+104] - mov QWORD PTR [rcx+96], r8 - sbb r9, QWORD PTR [rdx+104] - mov r8, QWORD PTR [rcx+112] - mov QWORD PTR [rcx+104], r9 - sbb r8, QWORD PTR [rdx+112] - mov r9, QWORD PTR [rcx+120] - mov QWORD PTR [rcx+112], r8 - sbb r9, QWORD PTR [rdx+120] - mov r8, QWORD PTR [rcx+128] - mov QWORD PTR [rcx+120], r9 - sbb r8, QWORD PTR [rdx+128] - mov r9, QWORD PTR [rcx+136] - mov QWORD PTR [rcx+128], r8 - sbb r9, QWORD PTR [rdx+136] - mov r8, QWORD PTR [rcx+144] - mov QWORD PTR [rcx+136], r9 - sbb r8, QWORD PTR [rdx+144] - mov r9, QWORD PTR [rcx+152] - mov QWORD PTR [rcx+144], r8 - sbb r9, QWORD PTR [rdx+152] - mov r8, QWORD PTR [rcx+160] - mov QWORD PTR [rcx+152], r9 - sbb r8, QWORD PTR [rdx+160] - mov r9, QWORD 
PTR [rcx+168] - mov QWORD PTR [rcx+160], r8 - sbb r9, QWORD PTR [rdx+168] - mov r8, QWORD PTR [rcx+176] - mov QWORD PTR [rcx+168], r9 - sbb r8, QWORD PTR [rdx+176] - mov r9, QWORD PTR [rcx+184] - mov QWORD PTR [rcx+176], r8 - sbb r9, QWORD PTR [rdx+184] - mov r8, QWORD PTR [rcx+192] - mov QWORD PTR [rcx+184], r9 - sbb r8, QWORD PTR [rdx+192] - mov r9, QWORD PTR [rcx+200] - mov QWORD PTR [rcx+192], r8 - sbb r9, QWORD PTR [rdx+200] - mov r8, QWORD PTR [rcx+208] - mov QWORD PTR [rcx+200], r9 - sbb r8, QWORD PTR [rdx+208] - mov r9, QWORD PTR [rcx+216] - mov QWORD PTR [rcx+208], r8 - sbb r9, QWORD PTR [rdx+216] - mov r8, QWORD PTR [rcx+224] - mov QWORD PTR [rcx+216], r9 - sbb r8, QWORD PTR [rdx+224] - mov r9, QWORD PTR [rcx+232] - mov QWORD PTR [rcx+224], r8 - sbb r9, QWORD PTR [rdx+232] - mov r8, QWORD PTR [rcx+240] - mov QWORD PTR [rcx+232], r9 - sbb r8, QWORD PTR [rdx+240] - mov r9, QWORD PTR [rcx+248] - mov QWORD PTR [rcx+240], r8 - sbb r9, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+248], r9 - sbb rax, 0 - ret -sp_2048_sub_in_place_32 ENDP -_text ENDS -; /* Add b to a into r. (r = a + b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_add_32 PROC - ; Add - mov r9, QWORD PTR [rdx] - xor rax, rax - add r9, QWORD PTR [r8] - mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 - adc r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r10 - adc r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - adc r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - adc r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - adc r10, QWORD PTR [r8+40] - mov r9, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r10 - adc r9, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r9 - adc r10, QWORD PTR [r8+56] - mov r9, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r10 - adc r9, QWORD PTR [r8+64] - mov r10, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r9 - adc r10, QWORD PTR [r8+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r10 - adc r9, QWORD PTR [r8+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r9 - adc r10, QWORD PTR [r8+88] - mov r9, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r10 - adc r9, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r9 - adc r10, QWORD PTR [r8+104] - mov r9, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r10 - adc r9, QWORD PTR [r8+112] - mov r10, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r9 - adc r10, QWORD PTR [r8+120] - mov r9, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r10 - adc r9, QWORD PTR [r8+128] - mov r10, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r9 - adc r10, QWORD PTR [r8+136] - mov r9, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r10 - adc r9, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r9 - adc r10, QWORD PTR [r8+152] - mov r9, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r10 - adc r9, QWORD PTR [r8+160] - mov r10, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r9 - adc r10, QWORD PTR [r8+168] - mov r9, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r10 - adc r9, QWORD PTR [r8+176] - mov r10, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r9 - adc r10, QWORD PTR [r8+184] - mov r9, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r10 - adc r9, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r9 - adc r10, QWORD PTR 
[r8+200] - mov r9, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r10 - adc r9, QWORD PTR [r8+208] - mov r10, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r9 - adc r10, QWORD PTR [r8+216] - mov r9, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r10 - adc r9, QWORD PTR [r8+224] - mov r10, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r9 - adc r10, QWORD PTR [r8+232] - mov r9, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r10 - adc r9, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r9 - adc r10, QWORD PTR [r8+248] - mov QWORD PTR [rcx+248], r10 - adc rax, 0 - ret -sp_2048_add_32 ENDP -_text ENDS -; /* Multiply a and b into r. (r = a * b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_mul_32 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - sub rsp, 808 - mov QWORD PTR [rsp+768], rcx - mov QWORD PTR [rsp+776], rdx - mov QWORD PTR [rsp+784], r8 - lea r12, QWORD PTR [rsp+512] - lea r14, QWORD PTR [rdx+128] - ; Add - mov rax, QWORD PTR [rdx] - xor r15, r15 - add rax, QWORD PTR [r14] - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [r12], rax - adc r9, QWORD PTR [r14+8] - mov r10, QWORD PTR [rdx+16] - mov QWORD PTR [r12+8], r9 - adc r10, QWORD PTR [r14+16] - mov rax, QWORD PTR [rdx+24] - mov QWORD PTR [r12+16], r10 - adc rax, QWORD PTR [r14+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [r12+24], rax - adc r9, QWORD PTR [r14+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [r12+32], r9 - adc r10, QWORD PTR [r14+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r12+40], r10 - adc rax, QWORD PTR [r14+48] - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [r12+48], rax - adc r9, QWORD PTR [r14+56] - mov r10, QWORD PTR [rdx+64] - mov QWORD PTR [r12+56], r9 - adc r10, QWORD PTR [r14+64] - mov rax, QWORD PTR [rdx+72] - mov QWORD PTR [r12+64], r10 - adc rax, QWORD PTR [r14+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [r12+72], rax - adc r9, QWORD PTR [r14+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [r12+80], r9 - adc r10, QWORD PTR [r14+88] - mov rax, QWORD PTR [rdx+96] - mov QWORD PTR [r12+88], r10 - adc rax, QWORD PTR [r14+96] - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [r12+96], rax - adc r9, QWORD PTR [r14+104] - mov r10, QWORD PTR [rdx+112] - mov QWORD PTR [r12+104], r9 - adc r10, QWORD PTR [r14+112] - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r12+112], r10 - adc rax, QWORD PTR [r14+120] - mov QWORD PTR [r12+120], rax - adc r15, 0 - mov QWORD PTR [rsp+792], r15 - lea r13, QWORD PTR [rsp+640] - lea r14, QWORD PTR [r8+128] - ; Add - mov rax, QWORD PTR [r8] - xor rdi, rdi - add rax, QWORD PTR [r14] - mov r9, QWORD PTR [r8+8] - mov QWORD PTR [r13], rax - adc r9, QWORD PTR [r14+8] - mov r10, QWORD PTR [r8+16] - mov QWORD PTR [r13+8], r9 - adc r10, QWORD PTR [r14+16] - mov rax, QWORD PTR [r8+24] - mov QWORD PTR [r13+16], r10 - adc rax, QWORD PTR [r14+24] - mov r9, QWORD PTR [r8+32] - mov QWORD PTR [r13+24], rax - adc r9, QWORD PTR [r14+32] - mov r10, QWORD PTR [r8+40] - mov QWORD PTR [r13+32], r9 - adc r10, QWORD PTR [r14+40] - mov rax, QWORD PTR [r8+48] - mov QWORD PTR [r13+40], r10 - adc rax, QWORD PTR [r14+48] - mov r9, QWORD PTR [r8+56] - mov QWORD PTR [r13+48], rax - adc r9, QWORD PTR [r14+56] - mov r10, QWORD PTR [r8+64] - mov QWORD PTR [r13+56], r9 - adc r10, QWORD PTR [r14+64] - mov rax, QWORD PTR [r8+72] - mov QWORD PTR [r13+64], r10 - adc rax, QWORD PTR [r14+72] - mov r9, QWORD PTR [r8+80] - mov QWORD PTR [r13+72], rax - adc 
r9, QWORD PTR [r14+80] - mov r10, QWORD PTR [r8+88] - mov QWORD PTR [r13+80], r9 - adc r10, QWORD PTR [r14+88] - mov rax, QWORD PTR [r8+96] - mov QWORD PTR [r13+88], r10 - adc rax, QWORD PTR [r14+96] - mov r9, QWORD PTR [r8+104] - mov QWORD PTR [r13+96], rax - adc r9, QWORD PTR [r14+104] - mov r10, QWORD PTR [r8+112] - mov QWORD PTR [r13+104], r9 - adc r10, QWORD PTR [r14+112] - mov rax, QWORD PTR [r8+120] - mov QWORD PTR [r13+112], r10 - adc rax, QWORD PTR [r14+120] - mov QWORD PTR [r13+120], rax - adc rdi, 0 - mov QWORD PTR [rsp+800], rdi - mov r8, r13 - mov rdx, r12 - mov rcx, rsp - call sp_2048_mul_16 - mov r8, QWORD PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - lea rcx, QWORD PTR [rsp+256] - add r8, 128 - add rdx, 128 - call sp_2048_mul_16 - mov r8, QWORD PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - mov rcx, QWORD PTR [rsp+768] - call sp_2048_mul_16 -IFDEF _WIN64 - mov r8, QWORD PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - mov rcx, QWORD PTR [rsp+768] -ENDIF - mov r15, QWORD PTR [rsp+792] - mov rdi, QWORD PTR [rsp+800] - mov rsi, QWORD PTR [rsp+768] - mov r11, r15 - lea r12, QWORD PTR [rsp+512] - lea r13, QWORD PTR [rsp+640] - and r11, rdi - neg r15 - neg rdi - add rsi, 256 - mov rax, QWORD PTR [r12] - mov r9, QWORD PTR [r13] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12], rax - mov QWORD PTR [r13], r9 - mov rax, QWORD PTR [r12+8] - mov r9, QWORD PTR [r13+8] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+8], rax - mov QWORD PTR [r13+8], r9 - mov rax, QWORD PTR [r12+16] - mov r9, QWORD PTR [r13+16] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+16], rax - mov QWORD PTR [r13+16], r9 - mov rax, QWORD PTR [r12+24] - mov r9, QWORD PTR [r13+24] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+24], rax - mov QWORD PTR [r13+24], r9 - mov rax, QWORD PTR [r12+32] - mov r9, QWORD PTR [r13+32] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+32], rax - mov QWORD PTR [r13+32], r9 - mov rax, QWORD PTR [r12+40] - mov r9, QWORD PTR [r13+40] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+40], rax - mov QWORD PTR [r13+40], r9 - mov rax, QWORD PTR [r12+48] - mov r9, QWORD PTR [r13+48] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+48], rax - mov QWORD PTR [r13+48], r9 - mov rax, QWORD PTR [r12+56] - mov r9, QWORD PTR [r13+56] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+56], rax - mov QWORD PTR [r13+56], r9 - mov rax, QWORD PTR [r12+64] - mov r9, QWORD PTR [r13+64] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+64], rax - mov QWORD PTR [r13+64], r9 - mov rax, QWORD PTR [r12+72] - mov r9, QWORD PTR [r13+72] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+72], rax - mov QWORD PTR [r13+72], r9 - mov rax, QWORD PTR [r12+80] - mov r9, QWORD PTR [r13+80] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+80], rax - mov QWORD PTR [r13+80], r9 - mov rax, QWORD PTR [r12+88] - mov r9, QWORD PTR [r13+88] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+88], rax - mov QWORD PTR [r13+88], r9 - mov rax, QWORD PTR [r12+96] - mov r9, QWORD PTR [r13+96] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+96], rax - mov QWORD PTR [r13+96], r9 - mov rax, QWORD PTR [r12+104] - mov r9, QWORD PTR [r13+104] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+104], rax - mov QWORD PTR [r13+104], r9 - mov rax, QWORD PTR [r12+112] - mov r9, QWORD PTR [r13+112] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+112], rax - mov QWORD PTR [r13+112], r9 - mov rax, QWORD PTR [r12+120] - mov r9, QWORD PTR [r13+120] - and rax, rdi - and r9, r15 - mov QWORD PTR [r12+120], rax - mov QWORD PTR [r13+120], r9 - 
mov rax, QWORD PTR [r12] - add rax, QWORD PTR [r13] - mov r9, QWORD PTR [r12+8] - mov QWORD PTR [rsi], rax - adc r9, QWORD PTR [r13+8] - mov r10, QWORD PTR [r12+16] - mov QWORD PTR [rsi+8], r9 - adc r10, QWORD PTR [r13+16] - mov rax, QWORD PTR [r12+24] - mov QWORD PTR [rsi+16], r10 - adc rax, QWORD PTR [r13+24] - mov r9, QWORD PTR [r12+32] - mov QWORD PTR [rsi+24], rax - adc r9, QWORD PTR [r13+32] - mov r10, QWORD PTR [r12+40] - mov QWORD PTR [rsi+32], r9 - adc r10, QWORD PTR [r13+40] - mov rax, QWORD PTR [r12+48] - mov QWORD PTR [rsi+40], r10 - adc rax, QWORD PTR [r13+48] - mov r9, QWORD PTR [r12+56] - mov QWORD PTR [rsi+48], rax - adc r9, QWORD PTR [r13+56] - mov r10, QWORD PTR [r12+64] - mov QWORD PTR [rsi+56], r9 - adc r10, QWORD PTR [r13+64] - mov rax, QWORD PTR [r12+72] - mov QWORD PTR [rsi+64], r10 - adc rax, QWORD PTR [r13+72] - mov r9, QWORD PTR [r12+80] - mov QWORD PTR [rsi+72], rax - adc r9, QWORD PTR [r13+80] - mov r10, QWORD PTR [r12+88] - mov QWORD PTR [rsi+80], r9 - adc r10, QWORD PTR [r13+88] - mov rax, QWORD PTR [r12+96] - mov QWORD PTR [rsi+88], r10 - adc rax, QWORD PTR [r13+96] - mov r9, QWORD PTR [r12+104] - mov QWORD PTR [rsi+96], rax - adc r9, QWORD PTR [r13+104] - mov r10, QWORD PTR [r12+112] - mov QWORD PTR [rsi+104], r9 - adc r10, QWORD PTR [r13+112] - mov rax, QWORD PTR [r12+120] - mov QWORD PTR [rsi+112], r10 - adc rax, QWORD PTR [r13+120] - mov QWORD PTR [rsi+120], rax - adc r11, 0 - lea r13, QWORD PTR [rsp+256] - mov r12, rsp - mov rax, QWORD PTR [r12] - sub rax, QWORD PTR [r13] - mov r9, QWORD PTR [r12+8] - mov QWORD PTR [r12], rax - sbb r9, QWORD PTR [r13+8] - mov r10, QWORD PTR [r12+16] - mov QWORD PTR [r12+8], r9 - sbb r10, QWORD PTR [r13+16] - mov rax, QWORD PTR [r12+24] - mov QWORD PTR [r12+16], r10 - sbb rax, QWORD PTR [r13+24] - mov r9, QWORD PTR [r12+32] - mov QWORD PTR [r12+24], rax - sbb r9, QWORD PTR [r13+32] - mov r10, QWORD PTR [r12+40] - mov QWORD PTR [r12+32], r9 - sbb r10, QWORD PTR [r13+40] - mov rax, QWORD PTR [r12+48] - mov QWORD PTR [r12+40], r10 - sbb rax, QWORD PTR [r13+48] - mov r9, QWORD PTR [r12+56] - mov QWORD PTR [r12+48], rax - sbb r9, QWORD PTR [r13+56] - mov r10, QWORD PTR [r12+64] - mov QWORD PTR [r12+56], r9 - sbb r10, QWORD PTR [r13+64] - mov rax, QWORD PTR [r12+72] - mov QWORD PTR [r12+64], r10 - sbb rax, QWORD PTR [r13+72] - mov r9, QWORD PTR [r12+80] - mov QWORD PTR [r12+72], rax - sbb r9, QWORD PTR [r13+80] - mov r10, QWORD PTR [r12+88] - mov QWORD PTR [r12+80], r9 - sbb r10, QWORD PTR [r13+88] - mov rax, QWORD PTR [r12+96] - mov QWORD PTR [r12+88], r10 - sbb rax, QWORD PTR [r13+96] - mov r9, QWORD PTR [r12+104] - mov QWORD PTR [r12+96], rax - sbb r9, QWORD PTR [r13+104] - mov r10, QWORD PTR [r12+112] - mov QWORD PTR [r12+104], r9 - sbb r10, QWORD PTR [r13+112] - mov rax, QWORD PTR [r12+120] - mov QWORD PTR [r12+112], r10 - sbb rax, QWORD PTR [r13+120] - mov r9, QWORD PTR [r12+128] - mov QWORD PTR [r12+120], rax - sbb r9, QWORD PTR [r13+128] - mov r10, QWORD PTR [r12+136] - mov QWORD PTR [r12+128], r9 - sbb r10, QWORD PTR [r13+136] - mov rax, QWORD PTR [r12+144] - mov QWORD PTR [r12+136], r10 - sbb rax, QWORD PTR [r13+144] - mov r9, QWORD PTR [r12+152] - mov QWORD PTR [r12+144], rax - sbb r9, QWORD PTR [r13+152] - mov r10, QWORD PTR [r12+160] - mov QWORD PTR [r12+152], r9 - sbb r10, QWORD PTR [r13+160] - mov rax, QWORD PTR [r12+168] - mov QWORD PTR [r12+160], r10 - sbb rax, QWORD PTR [r13+168] - mov r9, QWORD PTR [r12+176] - mov QWORD PTR [r12+168], rax - sbb r9, QWORD PTR [r13+176] - mov r10, QWORD PTR [r12+184] - mov 
QWORD PTR [r12+176], r9 - sbb r10, QWORD PTR [r13+184] - mov rax, QWORD PTR [r12+192] - mov QWORD PTR [r12+184], r10 - sbb rax, QWORD PTR [r13+192] - mov r9, QWORD PTR [r12+200] - mov QWORD PTR [r12+192], rax - sbb r9, QWORD PTR [r13+200] - mov r10, QWORD PTR [r12+208] - mov QWORD PTR [r12+200], r9 - sbb r10, QWORD PTR [r13+208] - mov rax, QWORD PTR [r12+216] - mov QWORD PTR [r12+208], r10 - sbb rax, QWORD PTR [r13+216] - mov r9, QWORD PTR [r12+224] - mov QWORD PTR [r12+216], rax - sbb r9, QWORD PTR [r13+224] - mov r10, QWORD PTR [r12+232] - mov QWORD PTR [r12+224], r9 - sbb r10, QWORD PTR [r13+232] - mov rax, QWORD PTR [r12+240] - mov QWORD PTR [r12+232], r10 - sbb rax, QWORD PTR [r13+240] - mov r9, QWORD PTR [r12+248] - mov QWORD PTR [r12+240], rax - sbb r9, QWORD PTR [r13+248] - mov QWORD PTR [r12+248], r9 - sbb r11, 0 - mov rax, QWORD PTR [r12] - sub rax, QWORD PTR [rcx] - mov r9, QWORD PTR [r12+8] - mov QWORD PTR [r12], rax - sbb r9, QWORD PTR [rcx+8] - mov r10, QWORD PTR [r12+16] - mov QWORD PTR [r12+8], r9 - sbb r10, QWORD PTR [rcx+16] - mov rax, QWORD PTR [r12+24] - mov QWORD PTR [r12+16], r10 - sbb rax, QWORD PTR [rcx+24] - mov r9, QWORD PTR [r12+32] - mov QWORD PTR [r12+24], rax - sbb r9, QWORD PTR [rcx+32] - mov r10, QWORD PTR [r12+40] - mov QWORD PTR [r12+32], r9 - sbb r10, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r12+48] - mov QWORD PTR [r12+40], r10 - sbb rax, QWORD PTR [rcx+48] - mov r9, QWORD PTR [r12+56] - mov QWORD PTR [r12+48], rax - sbb r9, QWORD PTR [rcx+56] - mov r10, QWORD PTR [r12+64] - mov QWORD PTR [r12+56], r9 - sbb r10, QWORD PTR [rcx+64] - mov rax, QWORD PTR [r12+72] - mov QWORD PTR [r12+64], r10 - sbb rax, QWORD PTR [rcx+72] - mov r9, QWORD PTR [r12+80] - mov QWORD PTR [r12+72], rax - sbb r9, QWORD PTR [rcx+80] - mov r10, QWORD PTR [r12+88] - mov QWORD PTR [r12+80], r9 - sbb r10, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r12+96] - mov QWORD PTR [r12+88], r10 - sbb rax, QWORD PTR [rcx+96] - mov r9, QWORD PTR [r12+104] - mov QWORD PTR [r12+96], rax - sbb r9, QWORD PTR [rcx+104] - mov r10, QWORD PTR [r12+112] - mov QWORD PTR [r12+104], r9 - sbb r10, QWORD PTR [rcx+112] - mov rax, QWORD PTR [r12+120] - mov QWORD PTR [r12+112], r10 - sbb rax, QWORD PTR [rcx+120] - mov r9, QWORD PTR [r12+128] - mov QWORD PTR [r12+120], rax - sbb r9, QWORD PTR [rcx+128] - mov r10, QWORD PTR [r12+136] - mov QWORD PTR [r12+128], r9 - sbb r10, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r12+144] - mov QWORD PTR [r12+136], r10 - sbb rax, QWORD PTR [rcx+144] - mov r9, QWORD PTR [r12+152] - mov QWORD PTR [r12+144], rax - sbb r9, QWORD PTR [rcx+152] - mov r10, QWORD PTR [r12+160] - mov QWORD PTR [r12+152], r9 - sbb r10, QWORD PTR [rcx+160] - mov rax, QWORD PTR [r12+168] - mov QWORD PTR [r12+160], r10 - sbb rax, QWORD PTR [rcx+168] - mov r9, QWORD PTR [r12+176] - mov QWORD PTR [r12+168], rax - sbb r9, QWORD PTR [rcx+176] - mov r10, QWORD PTR [r12+184] - mov QWORD PTR [r12+176], r9 - sbb r10, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r12+192] - mov QWORD PTR [r12+184], r10 - sbb rax, QWORD PTR [rcx+192] - mov r9, QWORD PTR [r12+200] - mov QWORD PTR [r12+192], rax - sbb r9, QWORD PTR [rcx+200] - mov r10, QWORD PTR [r12+208] - mov QWORD PTR [r12+200], r9 - sbb r10, QWORD PTR [rcx+208] - mov rax, QWORD PTR [r12+216] - mov QWORD PTR [r12+208], r10 - sbb rax, QWORD PTR [rcx+216] - mov r9, QWORD PTR [r12+224] - mov QWORD PTR [r12+216], rax - sbb r9, QWORD PTR [rcx+224] - mov r10, QWORD PTR [r12+232] - mov QWORD PTR [r12+224], r9 - sbb r10, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r12+240] - mov QWORD 
PTR [r12+232], r10 - sbb rax, QWORD PTR [rcx+240] - mov r9, QWORD PTR [r12+248] - mov QWORD PTR [r12+240], rax - sbb r9, QWORD PTR [rcx+248] - mov QWORD PTR [r12+248], r9 - sbb r11, 0 - sub rsi, 128 - ; Add - mov rax, QWORD PTR [rsi] - add rax, QWORD PTR [r12] - mov r9, QWORD PTR [rsi+8] - mov QWORD PTR [rsi], rax - adc r9, QWORD PTR [r12+8] - mov r10, QWORD PTR [rsi+16] - mov QWORD PTR [rsi+8], r9 - adc r10, QWORD PTR [r12+16] - mov rax, QWORD PTR [rsi+24] - mov QWORD PTR [rsi+16], r10 - adc rax, QWORD PTR [r12+24] - mov r9, QWORD PTR [rsi+32] - mov QWORD PTR [rsi+24], rax - adc r9, QWORD PTR [r12+32] - mov r10, QWORD PTR [rsi+40] - mov QWORD PTR [rsi+32], r9 - adc r10, QWORD PTR [r12+40] - mov rax, QWORD PTR [rsi+48] - mov QWORD PTR [rsi+40], r10 - adc rax, QWORD PTR [r12+48] - mov r9, QWORD PTR [rsi+56] - mov QWORD PTR [rsi+48], rax - adc r9, QWORD PTR [r12+56] - mov r10, QWORD PTR [rsi+64] - mov QWORD PTR [rsi+56], r9 - adc r10, QWORD PTR [r12+64] - mov rax, QWORD PTR [rsi+72] - mov QWORD PTR [rsi+64], r10 - adc rax, QWORD PTR [r12+72] - mov r9, QWORD PTR [rsi+80] - mov QWORD PTR [rsi+72], rax - adc r9, QWORD PTR [r12+80] - mov r10, QWORD PTR [rsi+88] - mov QWORD PTR [rsi+80], r9 - adc r10, QWORD PTR [r12+88] - mov rax, QWORD PTR [rsi+96] - mov QWORD PTR [rsi+88], r10 - adc rax, QWORD PTR [r12+96] - mov r9, QWORD PTR [rsi+104] - mov QWORD PTR [rsi+96], rax - adc r9, QWORD PTR [r12+104] - mov r10, QWORD PTR [rsi+112] - mov QWORD PTR [rsi+104], r9 - adc r10, QWORD PTR [r12+112] - mov rax, QWORD PTR [rsi+120] - mov QWORD PTR [rsi+112], r10 - adc rax, QWORD PTR [r12+120] - mov r9, QWORD PTR [rsi+128] - mov QWORD PTR [rsi+120], rax - adc r9, QWORD PTR [r12+128] - mov r10, QWORD PTR [rsi+136] - mov QWORD PTR [rsi+128], r9 - adc r10, QWORD PTR [r12+136] - mov rax, QWORD PTR [rsi+144] - mov QWORD PTR [rsi+136], r10 - adc rax, QWORD PTR [r12+144] - mov r9, QWORD PTR [rsi+152] - mov QWORD PTR [rsi+144], rax - adc r9, QWORD PTR [r12+152] - mov r10, QWORD PTR [rsi+160] - mov QWORD PTR [rsi+152], r9 - adc r10, QWORD PTR [r12+160] - mov rax, QWORD PTR [rsi+168] - mov QWORD PTR [rsi+160], r10 - adc rax, QWORD PTR [r12+168] - mov r9, QWORD PTR [rsi+176] - mov QWORD PTR [rsi+168], rax - adc r9, QWORD PTR [r12+176] - mov r10, QWORD PTR [rsi+184] - mov QWORD PTR [rsi+176], r9 - adc r10, QWORD PTR [r12+184] - mov rax, QWORD PTR [rsi+192] - mov QWORD PTR [rsi+184], r10 - adc rax, QWORD PTR [r12+192] - mov r9, QWORD PTR [rsi+200] - mov QWORD PTR [rsi+192], rax - adc r9, QWORD PTR [r12+200] - mov r10, QWORD PTR [rsi+208] - mov QWORD PTR [rsi+200], r9 - adc r10, QWORD PTR [r12+208] - mov rax, QWORD PTR [rsi+216] - mov QWORD PTR [rsi+208], r10 - adc rax, QWORD PTR [r12+216] - mov r9, QWORD PTR [rsi+224] - mov QWORD PTR [rsi+216], rax - adc r9, QWORD PTR [r12+224] - mov r10, QWORD PTR [rsi+232] - mov QWORD PTR [rsi+224], r9 - adc r10, QWORD PTR [r12+232] - mov rax, QWORD PTR [rsi+240] - mov QWORD PTR [rsi+232], r10 - adc rax, QWORD PTR [r12+240] - mov r9, QWORD PTR [rsi+248] - mov QWORD PTR [rsi+240], rax - adc r9, QWORD PTR [r12+248] - mov QWORD PTR [rsi+248], r9 - adc r11, 0 - mov QWORD PTR [rcx+384], r11 - add rsi, 128 - ; Add - mov rax, QWORD PTR [rsi] - xor r11, r11 - add rax, QWORD PTR [r13] - mov r9, QWORD PTR [rsi+8] - mov QWORD PTR [rsi], rax - adc r9, QWORD PTR [r13+8] - mov r10, QWORD PTR [rsi+16] - mov QWORD PTR [rsi+8], r9 - adc r10, QWORD PTR [r13+16] - mov rax, QWORD PTR [rsi+24] - mov QWORD PTR [rsi+16], r10 - adc rax, QWORD PTR [r13+24] - mov r9, QWORD PTR [rsi+32] - mov QWORD PTR [rsi+24], rax 
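The routine being removed in this hunk, sp_2048_mul_32, is a one-level Karatsuba multiply: each 2048-bit operand is split into 1024-bit halves (W = 2^1024, 16 words), the three 16-word products (a0+a1)*(b0+b1), a1*b1 and a0*b0 are formed with sp_2048_mul_16, and the carry bits of the two half-sums are folded back in branch-free with AND masks built by neg. The C below is a readability sketch of that structure only, not code from the patch; it assumes the sp_2048_add_16 / sp_2048_mul_16 / sp_2048_sub_in_place_32 / sp_2048_add_32 helpers behave as their headers in this file suggest (arguments r, a, b; carry or borrow returned), uses the 64-bit sp_digit word type, and uses plain ifs where the assembly uses masks.

static void sp_2048_mul_32_sketch(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit u[16], v[16];      /* truncated half sums                    */
    sp_digit z1[32], z2[32];    /* z0 = a0*b0 is built directly in r      */
    sp_digit ca, cb, c, t;
    int i;

    ca = sp_2048_add_16(u, a, a + 16);       /* a0 + a1 = ca*W + u        */
    cb = sp_2048_add_16(v, b, b + 16);       /* b0 + b1 = cb*W + v        */

    sp_2048_mul_16(z1, u, v);                /* u * v                     */
    sp_2048_mul_16(z2, a + 16, b + 16);      /* a1 * b1                   */
    sp_2048_mul_16(r, a, b);                 /* a0 * b0 -> r[0..31]       */

    /* (a0+a1)*(b0+b1) = u*v + ca*W*v + cb*W*u + ca*cb*W^2 */
    c = ca & cb;
    if (ca) c += sp_2048_add_16(z1 + 16, z1 + 16, v);
    if (cb) c += sp_2048_add_16(z1 + 16, z1 + 16, u);

    /* middle term: (a0+a1)*(b0+b1) - a1*b1 - a0*b0 */
    c -= (sp_digit)(sp_2048_sub_in_place_32(z1, z2) != 0);  /* borrow */
    c -= (sp_digit)(sp_2048_sub_in_place_32(z1, r)  != 0);  /* borrow */

    /* r = a0*b0 + middle*W + a1*b1*W^2 */
    for (i = 0; i < 32; i++)
        r[32 + i] = z2[i];
    c += sp_2048_add_32(r + 16, r + 16, z1);
    for (i = 48; i < 64 && c != 0; i++) {    /* ripple the leftover carry */
        t = r[i] + c;
        c = (sp_digit)(t < c);
        r[i] = t;
    }
}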
- adc r9, QWORD PTR [r13+32] - mov r10, QWORD PTR [rsi+40] - mov QWORD PTR [rsi+32], r9 - adc r10, QWORD PTR [r13+40] - mov rax, QWORD PTR [rsi+48] - mov QWORD PTR [rsi+40], r10 - adc rax, QWORD PTR [r13+48] - mov r9, QWORD PTR [rsi+56] - mov QWORD PTR [rsi+48], rax - adc r9, QWORD PTR [r13+56] - mov r10, QWORD PTR [rsi+64] - mov QWORD PTR [rsi+56], r9 - adc r10, QWORD PTR [r13+64] - mov rax, QWORD PTR [rsi+72] - mov QWORD PTR [rsi+64], r10 - adc rax, QWORD PTR [r13+72] - mov r9, QWORD PTR [rsi+80] - mov QWORD PTR [rsi+72], rax - adc r9, QWORD PTR [r13+80] - mov r10, QWORD PTR [rsi+88] - mov QWORD PTR [rsi+80], r9 - adc r10, QWORD PTR [r13+88] - mov rax, QWORD PTR [rsi+96] - mov QWORD PTR [rsi+88], r10 - adc rax, QWORD PTR [r13+96] - mov r9, QWORD PTR [rsi+104] - mov QWORD PTR [rsi+96], rax - adc r9, QWORD PTR [r13+104] - mov r10, QWORD PTR [rsi+112] - mov QWORD PTR [rsi+104], r9 - adc r10, QWORD PTR [r13+112] - mov rax, QWORD PTR [rsi+120] - mov QWORD PTR [rsi+112], r10 - adc rax, QWORD PTR [r13+120] - mov r9, QWORD PTR [rsi+128] - mov QWORD PTR [rsi+120], rax - adc r9, QWORD PTR [r13+128] - mov QWORD PTR [rsi+128], r9 - adc r11, 0 - ; Add to zero - mov rax, QWORD PTR [r13+136] - adc rax, 0 - mov r9, QWORD PTR [r13+144] - mov QWORD PTR [rsi+136], rax - adc r9, 0 - mov r10, QWORD PTR [r13+152] - mov QWORD PTR [rsi+144], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+160] - mov QWORD PTR [rsi+152], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+168] - mov QWORD PTR [rsi+160], rax - adc r9, 0 - mov r10, QWORD PTR [r13+176] - mov QWORD PTR [rsi+168], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+184] - mov QWORD PTR [rsi+176], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+192] - mov QWORD PTR [rsi+184], rax - adc r9, 0 - mov r10, QWORD PTR [r13+200] - mov QWORD PTR [rsi+192], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+208] - mov QWORD PTR [rsi+200], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+216] - mov QWORD PTR [rsi+208], rax - adc r9, 0 - mov r10, QWORD PTR [r13+224] - mov QWORD PTR [rsi+216], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+232] - mov QWORD PTR [rsi+224], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+240] - mov QWORD PTR [rsi+232], rax - adc r9, 0 - mov r10, QWORD PTR [r13+248] - mov QWORD PTR [rsi+240], r9 - adc r10, 0 - mov QWORD PTR [rsi+248], r10 - add rsp, 808 - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_2048_mul_32 ENDP -_text ENDS ; /* Add a to a into r. (r = a + a) ; * ; * r A single precision integer. @@ -7477,667 +8138,6 @@ ENDIF sp_2048_sqr_32 ENDP _text ENDS IFDEF HAVE_INTEL_AVX2 -; /* Multiply a and b into r. (r = a * b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. 
-; */ -_text SEGMENT READONLY PARA -sp_2048_mul_avx2_32 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - sub rsp, 808 - mov QWORD PTR [rsp+768], rcx - mov QWORD PTR [rsp+776], rdx - mov QWORD PTR [rsp+784], r8 - lea r12, QWORD PTR [rsp+512] - lea r14, QWORD PTR [rdx+128] - ; Add - mov rax, QWORD PTR [rdx] - xor r15, r15 - add rax, QWORD PTR [r14] - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [r12], rax - adc r9, QWORD PTR [r14+8] - mov r10, QWORD PTR [rdx+16] - mov QWORD PTR [r12+8], r9 - adc r10, QWORD PTR [r14+16] - mov rax, QWORD PTR [rdx+24] - mov QWORD PTR [r12+16], r10 - adc rax, QWORD PTR [r14+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [r12+24], rax - adc r9, QWORD PTR [r14+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [r12+32], r9 - adc r10, QWORD PTR [r14+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r12+40], r10 - adc rax, QWORD PTR [r14+48] - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [r12+48], rax - adc r9, QWORD PTR [r14+56] - mov r10, QWORD PTR [rdx+64] - mov QWORD PTR [r12+56], r9 - adc r10, QWORD PTR [r14+64] - mov rax, QWORD PTR [rdx+72] - mov QWORD PTR [r12+64], r10 - adc rax, QWORD PTR [r14+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [r12+72], rax - adc r9, QWORD PTR [r14+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [r12+80], r9 - adc r10, QWORD PTR [r14+88] - mov rax, QWORD PTR [rdx+96] - mov QWORD PTR [r12+88], r10 - adc rax, QWORD PTR [r14+96] - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [r12+96], rax - adc r9, QWORD PTR [r14+104] - mov r10, QWORD PTR [rdx+112] - mov QWORD PTR [r12+104], r9 - adc r10, QWORD PTR [r14+112] - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r12+112], r10 - adc rax, QWORD PTR [r14+120] - mov QWORD PTR [r12+120], rax - adc r15, 0 - mov QWORD PTR [rsp+792], r15 - lea r13, QWORD PTR [rsp+640] - lea r14, QWORD PTR [r8+128] - ; Add - mov rax, QWORD PTR [r8] - xor rdi, rdi - add rax, QWORD PTR [r14] - mov r9, QWORD PTR [r8+8] - mov QWORD PTR [r13], rax - adc r9, QWORD PTR [r14+8] - mov r10, QWORD PTR [r8+16] - mov QWORD PTR [r13+8], r9 - adc r10, QWORD PTR [r14+16] - mov rax, QWORD PTR [r8+24] - mov QWORD PTR [r13+16], r10 - adc rax, QWORD PTR [r14+24] - mov r9, QWORD PTR [r8+32] - mov QWORD PTR [r13+24], rax - adc r9, QWORD PTR [r14+32] - mov r10, QWORD PTR [r8+40] - mov QWORD PTR [r13+32], r9 - adc r10, QWORD PTR [r14+40] - mov rax, QWORD PTR [r8+48] - mov QWORD PTR [r13+40], r10 - adc rax, QWORD PTR [r14+48] - mov r9, QWORD PTR [r8+56] - mov QWORD PTR [r13+48], rax - adc r9, QWORD PTR [r14+56] - mov r10, QWORD PTR [r8+64] - mov QWORD PTR [r13+56], r9 - adc r10, QWORD PTR [r14+64] - mov rax, QWORD PTR [r8+72] - mov QWORD PTR [r13+64], r10 - adc rax, QWORD PTR [r14+72] - mov r9, QWORD PTR [r8+80] - mov QWORD PTR [r13+72], rax - adc r9, QWORD PTR [r14+80] - mov r10, QWORD PTR [r8+88] - mov QWORD PTR [r13+80], r9 - adc r10, QWORD PTR [r14+88] - mov rax, QWORD PTR [r8+96] - mov QWORD PTR [r13+88], r10 - adc rax, QWORD PTR [r14+96] - mov r9, QWORD PTR [r8+104] - mov QWORD PTR [r13+96], rax - adc r9, QWORD PTR [r14+104] - mov r10, QWORD PTR [r8+112] - mov QWORD PTR [r13+104], r9 - adc r10, QWORD PTR [r14+112] - mov rax, QWORD PTR [r8+120] - mov QWORD PTR [r13+112], r10 - adc rax, QWORD PTR [r14+120] - mov QWORD PTR [r13+120], rax - adc rdi, 0 - mov QWORD PTR [rsp+800], rdi - mov r8, r13 - mov rdx, r12 - mov rcx, rsp - call sp_2048_mul_avx2_16 - mov r8, QWORD PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - lea rcx, QWORD PTR [rsp+256] - add r8, 128 - add rdx, 128 - call sp_2048_mul_avx2_16 - mov r8, QWORD 
PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - mov rcx, QWORD PTR [rsp+768] - call sp_2048_mul_avx2_16 -IFDEF _WIN64 - mov r8, QWORD PTR [rsp+784] - mov rdx, QWORD PTR [rsp+776] - mov rcx, QWORD PTR [rsp+768] -ENDIF - mov r15, QWORD PTR [rsp+792] - mov rdi, QWORD PTR [rsp+800] - mov rsi, QWORD PTR [rsp+768] - mov r11, r15 - lea r12, QWORD PTR [rsp+512] - lea r13, QWORD PTR [rsp+640] - and r11, rdi - neg r15 - neg rdi - add rsi, 256 - mov rax, QWORD PTR [r12] - mov r9, QWORD PTR [r13] - pext rax, rax, rdi - pext r9, r9, r15 - add rax, r9 - mov r9, QWORD PTR [r12+8] - mov r10, QWORD PTR [r13+8] - pext r9, r9, rdi - pext r10, r10, r15 - mov QWORD PTR [rsi], rax - adc r9, r10 - mov r10, QWORD PTR [r12+16] - mov rax, QWORD PTR [r13+16] - pext r10, r10, rdi - pext rax, rax, r15 - mov QWORD PTR [rsi+8], r9 - adc r10, rax - mov rax, QWORD PTR [r12+24] - mov r9, QWORD PTR [r13+24] - pext rax, rax, rdi - pext r9, r9, r15 - mov QWORD PTR [rsi+16], r10 - adc rax, r9 - mov r9, QWORD PTR [r12+32] - mov r10, QWORD PTR [r13+32] - pext r9, r9, rdi - pext r10, r10, r15 - mov QWORD PTR [rsi+24], rax - adc r9, r10 - mov r10, QWORD PTR [r12+40] - mov rax, QWORD PTR [r13+40] - pext r10, r10, rdi - pext rax, rax, r15 - mov QWORD PTR [rsi+32], r9 - adc r10, rax - mov rax, QWORD PTR [r12+48] - mov r9, QWORD PTR [r13+48] - pext rax, rax, rdi - pext r9, r9, r15 - mov QWORD PTR [rsi+40], r10 - adc rax, r9 - mov r9, QWORD PTR [r12+56] - mov r10, QWORD PTR [r13+56] - pext r9, r9, rdi - pext r10, r10, r15 - mov QWORD PTR [rsi+48], rax - adc r9, r10 - mov r10, QWORD PTR [r12+64] - mov rax, QWORD PTR [r13+64] - pext r10, r10, rdi - pext rax, rax, r15 - mov QWORD PTR [rsi+56], r9 - adc r10, rax - mov rax, QWORD PTR [r12+72] - mov r9, QWORD PTR [r13+72] - pext rax, rax, rdi - pext r9, r9, r15 - mov QWORD PTR [rsi+64], r10 - adc rax, r9 - mov r9, QWORD PTR [r12+80] - mov r10, QWORD PTR [r13+80] - pext r9, r9, rdi - pext r10, r10, r15 - mov QWORD PTR [rsi+72], rax - adc r9, r10 - mov r10, QWORD PTR [r12+88] - mov rax, QWORD PTR [r13+88] - pext r10, r10, rdi - pext rax, rax, r15 - mov QWORD PTR [rsi+80], r9 - adc r10, rax - mov rax, QWORD PTR [r12+96] - mov r9, QWORD PTR [r13+96] - pext rax, rax, rdi - pext r9, r9, r15 - mov QWORD PTR [rsi+88], r10 - adc rax, r9 - mov r9, QWORD PTR [r12+104] - mov r10, QWORD PTR [r13+104] - pext r9, r9, rdi - pext r10, r10, r15 - mov QWORD PTR [rsi+96], rax - adc r9, r10 - mov r10, QWORD PTR [r12+112] - mov rax, QWORD PTR [r13+112] - pext r10, r10, rdi - pext rax, rax, r15 - mov QWORD PTR [rsi+104], r9 - adc r10, rax - mov rax, QWORD PTR [r12+120] - mov r9, QWORD PTR [r13+120] - pext rax, rax, rdi - pext r9, r9, r15 - mov QWORD PTR [rsi+112], r10 - adc rax, r9 - mov QWORD PTR [rsi+120], rax - adc r11, 0 - lea r13, QWORD PTR [rsp+256] - mov r12, rsp - mov rax, QWORD PTR [r12] - sub rax, QWORD PTR [r13] - mov r9, QWORD PTR [r12+8] - mov QWORD PTR [r12], rax - sbb r9, QWORD PTR [r13+8] - mov r10, QWORD PTR [r12+16] - mov QWORD PTR [r12+8], r9 - sbb r10, QWORD PTR [r13+16] - mov rax, QWORD PTR [r12+24] - mov QWORD PTR [r12+16], r10 - sbb rax, QWORD PTR [r13+24] - mov r9, QWORD PTR [r12+32] - mov QWORD PTR [r12+24], rax - sbb r9, QWORD PTR [r13+32] - mov r10, QWORD PTR [r12+40] - mov QWORD PTR [r12+32], r9 - sbb r10, QWORD PTR [r13+40] - mov rax, QWORD PTR [r12+48] - mov QWORD PTR [r12+40], r10 - sbb rax, QWORD PTR [r13+48] - mov r9, QWORD PTR [r12+56] - mov QWORD PTR [r12+48], rax - sbb r9, QWORD PTR [r13+56] - mov r10, QWORD PTR [r12+64] - mov QWORD PTR [r12+56], r9 - sbb r10, QWORD PTR [r13+64] 
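The block just above is the constant-time recombination step of the AVX2 Karatsuba: the 0/1 carry words saved from the two half-sums are negated into all-zero/all-one masks, and pext against such a mask either passes a word through unchanged or clears it (the plain sp_2048_mul_32 earlier in this hunk does the same thing with a simple and). A hypothetical C helper showing the pattern follows; the name and exact shape are illustrative, not from the patch.

/* r = a + (b AND mask(flag)), 16 words; returns the carry out.
 * flag must be 0 or 1, exactly like the carry kept from the half sums. */
static sp_digit sp_cond_add_16(sp_digit* r, const sp_digit* a,
    const sp_digit* b, sp_digit flag)
{
    sp_digit mask = (sp_digit)0 - flag;   /* 0 -> all zero, 1 -> all one  */
    sp_digit carry = 0;
    sp_digit t;
    sp_digit c1;
    int i;

    for (i = 0; i < 16; i++) {
        t = a[i] + (b[i] & mask);
        c1 = (sp_digit)(t < a[i]);        /* overflow of the first add    */
        r[i] = t + carry;
        carry = c1 + (sp_digit)(r[i] < t);
    }
    return carry;
}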
- mov rax, QWORD PTR [r12+72] - mov QWORD PTR [r12+64], r10 - sbb rax, QWORD PTR [r13+72] - mov r9, QWORD PTR [r12+80] - mov QWORD PTR [r12+72], rax - sbb r9, QWORD PTR [r13+80] - mov r10, QWORD PTR [r12+88] - mov QWORD PTR [r12+80], r9 - sbb r10, QWORD PTR [r13+88] - mov rax, QWORD PTR [r12+96] - mov QWORD PTR [r12+88], r10 - sbb rax, QWORD PTR [r13+96] - mov r9, QWORD PTR [r12+104] - mov QWORD PTR [r12+96], rax - sbb r9, QWORD PTR [r13+104] - mov r10, QWORD PTR [r12+112] - mov QWORD PTR [r12+104], r9 - sbb r10, QWORD PTR [r13+112] - mov rax, QWORD PTR [r12+120] - mov QWORD PTR [r12+112], r10 - sbb rax, QWORD PTR [r13+120] - mov r9, QWORD PTR [r12+128] - mov QWORD PTR [r12+120], rax - sbb r9, QWORD PTR [r13+128] - mov r10, QWORD PTR [r12+136] - mov QWORD PTR [r12+128], r9 - sbb r10, QWORD PTR [r13+136] - mov rax, QWORD PTR [r12+144] - mov QWORD PTR [r12+136], r10 - sbb rax, QWORD PTR [r13+144] - mov r9, QWORD PTR [r12+152] - mov QWORD PTR [r12+144], rax - sbb r9, QWORD PTR [r13+152] - mov r10, QWORD PTR [r12+160] - mov QWORD PTR [r12+152], r9 - sbb r10, QWORD PTR [r13+160] - mov rax, QWORD PTR [r12+168] - mov QWORD PTR [r12+160], r10 - sbb rax, QWORD PTR [r13+168] - mov r9, QWORD PTR [r12+176] - mov QWORD PTR [r12+168], rax - sbb r9, QWORD PTR [r13+176] - mov r10, QWORD PTR [r12+184] - mov QWORD PTR [r12+176], r9 - sbb r10, QWORD PTR [r13+184] - mov rax, QWORD PTR [r12+192] - mov QWORD PTR [r12+184], r10 - sbb rax, QWORD PTR [r13+192] - mov r9, QWORD PTR [r12+200] - mov QWORD PTR [r12+192], rax - sbb r9, QWORD PTR [r13+200] - mov r10, QWORD PTR [r12+208] - mov QWORD PTR [r12+200], r9 - sbb r10, QWORD PTR [r13+208] - mov rax, QWORD PTR [r12+216] - mov QWORD PTR [r12+208], r10 - sbb rax, QWORD PTR [r13+216] - mov r9, QWORD PTR [r12+224] - mov QWORD PTR [r12+216], rax - sbb r9, QWORD PTR [r13+224] - mov r10, QWORD PTR [r12+232] - mov QWORD PTR [r12+224], r9 - sbb r10, QWORD PTR [r13+232] - mov rax, QWORD PTR [r12+240] - mov QWORD PTR [r12+232], r10 - sbb rax, QWORD PTR [r13+240] - mov r9, QWORD PTR [r12+248] - mov QWORD PTR [r12+240], rax - sbb r9, QWORD PTR [r13+248] - mov QWORD PTR [r12+248], r9 - sbb r11, 0 - mov rax, QWORD PTR [r12] - sub rax, QWORD PTR [rcx] - mov r9, QWORD PTR [r12+8] - mov QWORD PTR [r12], rax - sbb r9, QWORD PTR [rcx+8] - mov r10, QWORD PTR [r12+16] - mov QWORD PTR [r12+8], r9 - sbb r10, QWORD PTR [rcx+16] - mov rax, QWORD PTR [r12+24] - mov QWORD PTR [r12+16], r10 - sbb rax, QWORD PTR [rcx+24] - mov r9, QWORD PTR [r12+32] - mov QWORD PTR [r12+24], rax - sbb r9, QWORD PTR [rcx+32] - mov r10, QWORD PTR [r12+40] - mov QWORD PTR [r12+32], r9 - sbb r10, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r12+48] - mov QWORD PTR [r12+40], r10 - sbb rax, QWORD PTR [rcx+48] - mov r9, QWORD PTR [r12+56] - mov QWORD PTR [r12+48], rax - sbb r9, QWORD PTR [rcx+56] - mov r10, QWORD PTR [r12+64] - mov QWORD PTR [r12+56], r9 - sbb r10, QWORD PTR [rcx+64] - mov rax, QWORD PTR [r12+72] - mov QWORD PTR [r12+64], r10 - sbb rax, QWORD PTR [rcx+72] - mov r9, QWORD PTR [r12+80] - mov QWORD PTR [r12+72], rax - sbb r9, QWORD PTR [rcx+80] - mov r10, QWORD PTR [r12+88] - mov QWORD PTR [r12+80], r9 - sbb r10, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r12+96] - mov QWORD PTR [r12+88], r10 - sbb rax, QWORD PTR [rcx+96] - mov r9, QWORD PTR [r12+104] - mov QWORD PTR [r12+96], rax - sbb r9, QWORD PTR [rcx+104] - mov r10, QWORD PTR [r12+112] - mov QWORD PTR [r12+104], r9 - sbb r10, QWORD PTR [rcx+112] - mov rax, QWORD PTR [r12+120] - mov QWORD PTR [r12+112], r10 - sbb rax, QWORD PTR [rcx+120] - mov r9, QWORD 
PTR [r12+128] - mov QWORD PTR [r12+120], rax - sbb r9, QWORD PTR [rcx+128] - mov r10, QWORD PTR [r12+136] - mov QWORD PTR [r12+128], r9 - sbb r10, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r12+144] - mov QWORD PTR [r12+136], r10 - sbb rax, QWORD PTR [rcx+144] - mov r9, QWORD PTR [r12+152] - mov QWORD PTR [r12+144], rax - sbb r9, QWORD PTR [rcx+152] - mov r10, QWORD PTR [r12+160] - mov QWORD PTR [r12+152], r9 - sbb r10, QWORD PTR [rcx+160] - mov rax, QWORD PTR [r12+168] - mov QWORD PTR [r12+160], r10 - sbb rax, QWORD PTR [rcx+168] - mov r9, QWORD PTR [r12+176] - mov QWORD PTR [r12+168], rax - sbb r9, QWORD PTR [rcx+176] - mov r10, QWORD PTR [r12+184] - mov QWORD PTR [r12+176], r9 - sbb r10, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r12+192] - mov QWORD PTR [r12+184], r10 - sbb rax, QWORD PTR [rcx+192] - mov r9, QWORD PTR [r12+200] - mov QWORD PTR [r12+192], rax - sbb r9, QWORD PTR [rcx+200] - mov r10, QWORD PTR [r12+208] - mov QWORD PTR [r12+200], r9 - sbb r10, QWORD PTR [rcx+208] - mov rax, QWORD PTR [r12+216] - mov QWORD PTR [r12+208], r10 - sbb rax, QWORD PTR [rcx+216] - mov r9, QWORD PTR [r12+224] - mov QWORD PTR [r12+216], rax - sbb r9, QWORD PTR [rcx+224] - mov r10, QWORD PTR [r12+232] - mov QWORD PTR [r12+224], r9 - sbb r10, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r12+240] - mov QWORD PTR [r12+232], r10 - sbb rax, QWORD PTR [rcx+240] - mov r9, QWORD PTR [r12+248] - mov QWORD PTR [r12+240], rax - sbb r9, QWORD PTR [rcx+248] - mov QWORD PTR [r12+248], r9 - sbb r11, 0 - sub rsi, 128 - ; Add - mov rax, QWORD PTR [rsi] - add rax, QWORD PTR [r12] - mov r9, QWORD PTR [rsi+8] - mov QWORD PTR [rsi], rax - adc r9, QWORD PTR [r12+8] - mov r10, QWORD PTR [rsi+16] - mov QWORD PTR [rsi+8], r9 - adc r10, QWORD PTR [r12+16] - mov rax, QWORD PTR [rsi+24] - mov QWORD PTR [rsi+16], r10 - adc rax, QWORD PTR [r12+24] - mov r9, QWORD PTR [rsi+32] - mov QWORD PTR [rsi+24], rax - adc r9, QWORD PTR [r12+32] - mov r10, QWORD PTR [rsi+40] - mov QWORD PTR [rsi+32], r9 - adc r10, QWORD PTR [r12+40] - mov rax, QWORD PTR [rsi+48] - mov QWORD PTR [rsi+40], r10 - adc rax, QWORD PTR [r12+48] - mov r9, QWORD PTR [rsi+56] - mov QWORD PTR [rsi+48], rax - adc r9, QWORD PTR [r12+56] - mov r10, QWORD PTR [rsi+64] - mov QWORD PTR [rsi+56], r9 - adc r10, QWORD PTR [r12+64] - mov rax, QWORD PTR [rsi+72] - mov QWORD PTR [rsi+64], r10 - adc rax, QWORD PTR [r12+72] - mov r9, QWORD PTR [rsi+80] - mov QWORD PTR [rsi+72], rax - adc r9, QWORD PTR [r12+80] - mov r10, QWORD PTR [rsi+88] - mov QWORD PTR [rsi+80], r9 - adc r10, QWORD PTR [r12+88] - mov rax, QWORD PTR [rsi+96] - mov QWORD PTR [rsi+88], r10 - adc rax, QWORD PTR [r12+96] - mov r9, QWORD PTR [rsi+104] - mov QWORD PTR [rsi+96], rax - adc r9, QWORD PTR [r12+104] - mov r10, QWORD PTR [rsi+112] - mov QWORD PTR [rsi+104], r9 - adc r10, QWORD PTR [r12+112] - mov rax, QWORD PTR [rsi+120] - mov QWORD PTR [rsi+112], r10 - adc rax, QWORD PTR [r12+120] - mov r9, QWORD PTR [rsi+128] - mov QWORD PTR [rsi+120], rax - adc r9, QWORD PTR [r12+128] - mov r10, QWORD PTR [rsi+136] - mov QWORD PTR [rsi+128], r9 - adc r10, QWORD PTR [r12+136] - mov rax, QWORD PTR [rsi+144] - mov QWORD PTR [rsi+136], r10 - adc rax, QWORD PTR [r12+144] - mov r9, QWORD PTR [rsi+152] - mov QWORD PTR [rsi+144], rax - adc r9, QWORD PTR [r12+152] - mov r10, QWORD PTR [rsi+160] - mov QWORD PTR [rsi+152], r9 - adc r10, QWORD PTR [r12+160] - mov rax, QWORD PTR [rsi+168] - mov QWORD PTR [rsi+160], r10 - adc rax, QWORD PTR [r12+168] - mov r9, QWORD PTR [rsi+176] - mov QWORD PTR [rsi+168], rax - adc r9, QWORD PTR [r12+176] 
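Both recombination routines in this hunk finish with an "; Add to zero" tail (one appears a little further below): once the low half of a1*b1 has been added into r with a live carry, the remaining high words only need that carry rippled through while they are copied to the top of the result. A hypothetical C equivalent of the shape of that tail, again only as an illustration:

/* r[i] = a[i] + carry over 16 words, rippling the carry as it goes. */
static void sp_add_carry_16(sp_digit* r, const sp_digit* a, sp_digit carry)
{
    sp_digit t;
    int i;

    for (i = 0; i < 16; i++) {
        t = a[i] + carry;
        carry = (sp_digit)(t < carry);
        r[i] = t;
    }
}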
- mov r10, QWORD PTR [rsi+184] - mov QWORD PTR [rsi+176], r9 - adc r10, QWORD PTR [r12+184] - mov rax, QWORD PTR [rsi+192] - mov QWORD PTR [rsi+184], r10 - adc rax, QWORD PTR [r12+192] - mov r9, QWORD PTR [rsi+200] - mov QWORD PTR [rsi+192], rax - adc r9, QWORD PTR [r12+200] - mov r10, QWORD PTR [rsi+208] - mov QWORD PTR [rsi+200], r9 - adc r10, QWORD PTR [r12+208] - mov rax, QWORD PTR [rsi+216] - mov QWORD PTR [rsi+208], r10 - adc rax, QWORD PTR [r12+216] - mov r9, QWORD PTR [rsi+224] - mov QWORD PTR [rsi+216], rax - adc r9, QWORD PTR [r12+224] - mov r10, QWORD PTR [rsi+232] - mov QWORD PTR [rsi+224], r9 - adc r10, QWORD PTR [r12+232] - mov rax, QWORD PTR [rsi+240] - mov QWORD PTR [rsi+232], r10 - adc rax, QWORD PTR [r12+240] - mov r9, QWORD PTR [rsi+248] - mov QWORD PTR [rsi+240], rax - adc r9, QWORD PTR [r12+248] - mov QWORD PTR [rsi+248], r9 - adc r11, 0 - mov QWORD PTR [rcx+384], r11 - add rsi, 128 - ; Add - mov rax, QWORD PTR [rsi] - xor r11, r11 - add rax, QWORD PTR [r13] - mov r9, QWORD PTR [rsi+8] - mov QWORD PTR [rsi], rax - adc r9, QWORD PTR [r13+8] - mov r10, QWORD PTR [rsi+16] - mov QWORD PTR [rsi+8], r9 - adc r10, QWORD PTR [r13+16] - mov rax, QWORD PTR [rsi+24] - mov QWORD PTR [rsi+16], r10 - adc rax, QWORD PTR [r13+24] - mov r9, QWORD PTR [rsi+32] - mov QWORD PTR [rsi+24], rax - adc r9, QWORD PTR [r13+32] - mov r10, QWORD PTR [rsi+40] - mov QWORD PTR [rsi+32], r9 - adc r10, QWORD PTR [r13+40] - mov rax, QWORD PTR [rsi+48] - mov QWORD PTR [rsi+40], r10 - adc rax, QWORD PTR [r13+48] - mov r9, QWORD PTR [rsi+56] - mov QWORD PTR [rsi+48], rax - adc r9, QWORD PTR [r13+56] - mov r10, QWORD PTR [rsi+64] - mov QWORD PTR [rsi+56], r9 - adc r10, QWORD PTR [r13+64] - mov rax, QWORD PTR [rsi+72] - mov QWORD PTR [rsi+64], r10 - adc rax, QWORD PTR [r13+72] - mov r9, QWORD PTR [rsi+80] - mov QWORD PTR [rsi+72], rax - adc r9, QWORD PTR [r13+80] - mov r10, QWORD PTR [rsi+88] - mov QWORD PTR [rsi+80], r9 - adc r10, QWORD PTR [r13+88] - mov rax, QWORD PTR [rsi+96] - mov QWORD PTR [rsi+88], r10 - adc rax, QWORD PTR [r13+96] - mov r9, QWORD PTR [rsi+104] - mov QWORD PTR [rsi+96], rax - adc r9, QWORD PTR [r13+104] - mov r10, QWORD PTR [rsi+112] - mov QWORD PTR [rsi+104], r9 - adc r10, QWORD PTR [r13+112] - mov rax, QWORD PTR [rsi+120] - mov QWORD PTR [rsi+112], r10 - adc rax, QWORD PTR [r13+120] - mov r9, QWORD PTR [rsi+128] - mov QWORD PTR [rsi+120], rax - adc r9, QWORD PTR [r13+128] - mov QWORD PTR [rsi+128], r9 - adc r11, 0 - ; Add to zero - mov rax, QWORD PTR [r13+136] - adc rax, 0 - mov r9, QWORD PTR [r13+144] - mov QWORD PTR [rsi+136], rax - adc r9, 0 - mov r10, QWORD PTR [r13+152] - mov QWORD PTR [rsi+144], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+160] - mov QWORD PTR [rsi+152], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+168] - mov QWORD PTR [rsi+160], rax - adc r9, 0 - mov r10, QWORD PTR [r13+176] - mov QWORD PTR [rsi+168], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+184] - mov QWORD PTR [rsi+176], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+192] - mov QWORD PTR [rsi+184], rax - adc r9, 0 - mov r10, QWORD PTR [r13+200] - mov QWORD PTR [rsi+192], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+208] - mov QWORD PTR [rsi+200], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+216] - mov QWORD PTR [rsi+208], rax - adc r9, 0 - mov r10, QWORD PTR [r13+224] - mov QWORD PTR [rsi+216], r9 - adc r10, 0 - mov rax, QWORD PTR [r13+232] - mov QWORD PTR [rsi+224], r10 - adc rax, 0 - mov r9, QWORD PTR [r13+240] - mov QWORD PTR [rsi+232], rax - adc r9, 0 - mov r10, QWORD PTR [r13+248] - mov QWORD PTR [rsi+240], r9 - adc 
r10, 0 - mov QWORD PTR [rsi+248], r10 - add rsp, 808 - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_2048_mul_avx2_32 ENDP -_text ENDS -ENDIF -IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * ; * r A single precision integer. @@ -13734,674 +13734,6 @@ sp_3072_mul_12 PROC ret sp_3072_mul_12 ENDP _text ENDS -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_12 PROC - push r12 - push r13 - push r14 - mov r8, rdx - sub rsp, 96 - ; A[0] * A[0] - mov rax, QWORD PTR [r8] - mul rax - xor r11, r11 - mov QWORD PTR [rsp], rax - mov r10, rdx - ; A[0] * A[1] - mov rax, QWORD PTR [r8+8] - mul QWORD PTR [r8] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rsp+8], r10 - ; A[0] * A[2] - mov rax, QWORD PTR [r8+16] - mul QWORD PTR [r8] - xor r10, r10 - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - ; A[1] * A[1] - mov rax, QWORD PTR [r8+8] - mul rax - add r11, rax - adc r9, rdx - adc r10, 0 - mov QWORD PTR [rsp+16], r11 - ; A[0] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8] - xor r11, r11 - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - ; A[1] * A[2] - mov rax, QWORD PTR [r8+16] - mul QWORD PTR [r8+8] - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - mov QWORD PTR [rsp+24], r9 - ; A[0] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[1] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8+8] - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[2] * A[2] - mov rax, QWORD PTR [r8+16] - mul rax - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rsp+32], r10 - ; A[0] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[3] - mov rax, QWORD PTR [r8+24] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+40], r11 - ; A[0] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[3] - mov rax, QWORD PTR [r8+24] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+48], r9 - ; A[0] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[4] - mov rax, QWORD PTR [r8+32] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - 
mov QWORD PTR [rsp+56], r10 - ; A[0] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[4] - mov rax, QWORD PTR [r8+32] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+64], r11 - ; A[0] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[5] - mov rax, QWORD PTR [r8+40] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rsp+72], r9 - ; A[0] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[5] - mov rax, QWORD PTR [r8+40] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rsp+80], r10 - ; A[0] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[1] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+8] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[2] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[6] - mov rax, QWORD PTR [r8+48] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rsp+88], r11 - ; A[1] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+8] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[2] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+16] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[3] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, 
rdx - adc r14, 0 - ; A[6] * A[6] - mov rax, QWORD PTR [r8+48] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rcx+96], r9 - ; A[2] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+16] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[3] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+24] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[4] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[7] - mov rax, QWORD PTR [r8+56] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+104], r10 - ; A[3] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+24] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[4] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+32] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[5] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[7] - mov rax, QWORD PTR [r8+56] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rcx+112], r11 - ; A[4] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+32] - xor r11, r11 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[5] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+40] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[6] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[8] - mov rax, QWORD PTR [r8+64] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r9, r12 - adc r10, r13 - adc r11, r14 - mov QWORD PTR [rcx+120], r9 - ; A[5] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+40] - xor r9, r9 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[6] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+48] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[7] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[8] - mov rax, QWORD PTR [r8+64] - mul rax - add r12, r12 - adc r13, r13 - adc r14, r14 - add r12, rax - adc r13, rdx - adc r14, 0 - add r10, r12 - adc r11, r13 - adc r9, r14 - mov QWORD PTR [rcx+128], r10 - ; A[6] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+48] - xor r10, r10 - xor r14, r14 - mov r12, rax - mov r13, rdx - ; A[7] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+56] - add r12, rax - adc r13, rdx - adc r14, 0 - ; A[8] * A[9] - mov rax, QWORD PTR [r8+72] - mul QWORD PTR [r8+64] - add r12, rax - adc r13, rdx - adc r14, 0 - add r12, r12 - adc r13, r13 - adc r14, r14 - add r11, r12 - adc r9, r13 - adc r10, r14 - mov QWORD PTR [rcx+136], r11 - ; A[7] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+56] - xor r11, r11 - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - ; A[8] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+64] - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, 
rdx - adc r11, 0 - ; A[9] * A[9] - mov rax, QWORD PTR [r8+72] - mul rax - add r9, rax - adc r10, rdx - adc r11, 0 - mov QWORD PTR [rcx+144], r9 - ; A[8] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+64] - xor r9, r9 - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - ; A[9] * A[10] - mov rax, QWORD PTR [r8+80] - mul QWORD PTR [r8+72] - add r10, rax - adc r11, rdx - adc r9, 0 - add r10, rax - adc r11, rdx - adc r9, 0 - mov QWORD PTR [rcx+152], r10 - ; A[9] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+72] - xor r10, r10 - add r11, rax - adc r9, rdx - adc r10, 0 - add r11, rax - adc r9, rdx - adc r10, 0 - ; A[10] * A[10] - mov rax, QWORD PTR [r8+80] - mul rax - add r11, rax - adc r9, rdx - adc r10, 0 - mov QWORD PTR [rcx+160], r11 - ; A[10] * A[11] - mov rax, QWORD PTR [r8+88] - mul QWORD PTR [r8+80] - xor r11, r11 - add r9, rax - adc r10, rdx - adc r11, 0 - add r9, rax - adc r10, rdx - adc r11, 0 - mov QWORD PTR [rcx+168], r9 - ; A[11] * A[11] - mov rax, QWORD PTR [r8+88] - mul rax - add r10, rax - adc r11, rdx - mov QWORD PTR [rcx+176], r10 - mov QWORD PTR [rcx+184], r11 - mov rax, QWORD PTR [rsp] - mov rdx, QWORD PTR [rsp+8] - mov r12, QWORD PTR [rsp+16] - mov r13, QWORD PTR [rsp+24] - mov QWORD PTR [rcx], rax - mov QWORD PTR [rcx+8], rdx - mov QWORD PTR [rcx+16], r12 - mov QWORD PTR [rcx+24], r13 - mov rax, QWORD PTR [rsp+32] - mov rdx, QWORD PTR [rsp+40] - mov r12, QWORD PTR [rsp+48] - mov r13, QWORD PTR [rsp+56] - mov QWORD PTR [rcx+32], rax - mov QWORD PTR [rcx+40], rdx - mov QWORD PTR [rcx+48], r12 - mov QWORD PTR [rcx+56], r13 - mov rax, QWORD PTR [rsp+64] - mov rdx, QWORD PTR [rsp+72] - mov r12, QWORD PTR [rsp+80] - mov r13, QWORD PTR [rsp+88] - mov QWORD PTR [rcx+64], rax - mov QWORD PTR [rcx+72], rdx - mov QWORD PTR [rcx+80], r12 - mov QWORD PTR [rcx+88], r13 - add rsp, 96 - pop r14 - pop r13 - pop r12 - ret -sp_3072_sqr_12 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Multiply a and b into r. (r = a * b) ; * @@ -15374,649 +14706,6 @@ L_end_3072_mul_avx2_12: sp_3072_mul_avx2_12 ENDP _text ENDS ENDIF -IFDEF HAVE_INTEL_AVX2 -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. 
-; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_avx2_12 PROC - push rbp - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - push rbx - mov r8, rcx - mov r9, rdx - sub rsp, 96 - cmp r9, r8 - mov rbp, rsp - cmovne rbp, r8 - add r8, 96 - xor r12, r12 - ; Diagonal 1 - ; Zero into %r9 - ; A[1] x A[0] - mov rdx, QWORD PTR [r9] - mulx r11, r10, QWORD PTR [r9+8] - mov QWORD PTR [rbp+8], r10 - ; Zero into %r8 - ; A[2] x A[0] - mulx r10, rax, QWORD PTR [r9+16] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [rbp+16], r11 - ; Zero into %r9 - ; A[3] x A[0] - mulx r11, rax, QWORD PTR [r9+24] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [rbp+24], r10 - ; Zero into %r8 - ; A[4] x A[0] - mulx r10, rax, QWORD PTR [r9+32] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [rbp+32], r11 - ; Zero into %r9 - ; A[5] x A[0] - mulx r11, rax, QWORD PTR [r9+40] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [rbp+40], r10 - ; No load %r12 - %r8 - ; A[6] x A[0] - mulx r14, rax, QWORD PTR [r9+48] - adcx r11, rax - adox r14, r12 - mov QWORD PTR [rbp+48], r11 - ; No load %r13 - %r9 - ; A[7] x A[0] - mulx r15, rax, QWORD PTR [r9+56] - adcx r14, rax - adox r15, r12 - ; No store %r12 - %r8 - ; No load %r14 - %r8 - ; A[8] x A[0] - mulx rdi, rax, QWORD PTR [r9+64] - adcx r15, rax - adox rdi, r12 - ; No store %r13 - %r9 - ; No load %r15 - %r9 - ; A[9] x A[0] - mulx rsi, rax, QWORD PTR [r9+72] - adcx rdi, rax - adox rsi, r12 - ; No store %r14 - %r8 - ; No load %rbx - %r8 - ; A[10] x A[0] - mulx rbx, rax, QWORD PTR [r9+80] - adcx rsi, rax - adox rbx, r12 - ; No store %r15 - %r9 - ; Zero into %r9 - ; A[11] x A[0] - mulx r11, rax, QWORD PTR [r9+88] - adcx rbx, rax - adox r11, r12 - ; No store %rbx - %r8 - ; Carry - adcx r11, r12 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8], r11 - ; Diagonal 2 - mov r11, QWORD PTR [rbp+24] - mov r10, QWORD PTR [rbp+32] - ; A[2] x A[1] - mov rdx, QWORD PTR [r9+8] - mulx rcx, rax, QWORD PTR [r9+16] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [rbp+24], r11 - mov r11, QWORD PTR [rbp+40] - ; A[3] x A[1] - mulx rcx, rax, QWORD PTR [r9+24] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [rbp+32], r10 - mov r10, QWORD PTR [rbp+48] - ; A[4] x A[1] - mulx rcx, rax, QWORD PTR [r9+32] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [rbp+40], r11 - ; No load %r12 - %r9 - ; A[5] x A[1] - mulx rcx, rax, QWORD PTR [r9+40] - adcx r10, rax - adox r14, rcx - mov QWORD PTR [rbp+48], r10 - ; No load %r13 - %r8 - ; A[6] x A[1] - mulx rcx, rax, QWORD PTR [r9+48] - adcx r14, rax - adox r15, rcx - ; No store %r12 - %r9 - ; No load %r14 - %r9 - ; A[7] x A[1] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r15, rax - adox rdi, rcx - ; No store %r13 - %r8 - ; No load %r15 - %r8 - ; A[8] x A[1] - mulx rcx, rax, QWORD PTR [r9+64] - adcx rdi, rax - adox rsi, rcx - ; No store %r14 - %r9 - ; No load %rbx - %r9 - ; A[9] x A[1] - mulx rcx, rax, QWORD PTR [r9+72] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r8 - mov r10, QWORD PTR [r8] - ; A[10] x A[1] - mulx rcx, rax, QWORD PTR [r9+80] - adcx rbx, rax - adox r10, rcx - ; No store %rbx - %r9 - ; Zero into %r9 - ; A[11] x A[1] - mulx r11, rax, QWORD PTR [r9+88] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8], r10 - ; Zero into %r8 - ; A[11] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx r10, rax, QWORD PTR [r9+88] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+8], r11 - ; Carry - adcx r10, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+16], r10 - ; Diagonal 3 - mov r10, QWORD PTR [rbp+40] - mov r11, QWORD PTR 
[rbp+48] - ; A[3] x A[2] - mulx rcx, rax, QWORD PTR [r9+24] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [rbp+40], r10 - ; No load %r12 - %r8 - ; A[4] x A[2] - mulx rcx, rax, QWORD PTR [r9+32] - adcx r11, rax - adox r14, rcx - mov QWORD PTR [rbp+48], r11 - ; No load %r13 - %r9 - ; A[5] x A[2] - mulx rcx, rax, QWORD PTR [r9+40] - adcx r14, rax - adox r15, rcx - ; No store %r12 - %r8 - ; No load %r14 - %r8 - ; A[6] x A[2] - mulx rcx, rax, QWORD PTR [r9+48] - adcx r15, rax - adox rdi, rcx - ; No store %r13 - %r9 - ; No load %r15 - %r9 - ; A[7] x A[2] - mulx rcx, rax, QWORD PTR [r9+56] - adcx rdi, rax - adox rsi, rcx - ; No store %r14 - %r8 - ; No load %rbx - %r8 - ; A[8] x A[2] - mulx rcx, rax, QWORD PTR [r9+64] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r9 - mov r11, QWORD PTR [r8] - ; A[9] x A[2] - mulx rcx, rax, QWORD PTR [r9+72] - adcx rbx, rax - adox r11, rcx - ; No store %rbx - %r8 - mov r10, QWORD PTR [r8+8] - ; A[10] x A[2] - mulx rcx, rax, QWORD PTR [r9+80] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8], r11 - mov r11, QWORD PTR [r8+16] - ; A[10] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+80] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+8], r10 - ; Zero into %r8 - ; A[10] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx r10, rax, QWORD PTR [r9+80] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+16], r11 - ; Zero into %r9 - ; A[10] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx r11, rax, QWORD PTR [r9+80] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+24], r10 - ; Carry - adcx r11, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+32], r11 - ; Diagonal 4 - ; No load %r13 - %r8 - ; A[4] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+32] - adcx r14, rax - adox r15, rcx - ; No store %r12 - %r9 - ; No load %r14 - %r9 - ; A[5] x A[3] - mulx rcx, rax, QWORD PTR [r9+40] - adcx r15, rax - adox rdi, rcx - ; No store %r13 - %r8 - ; No load %r15 - %r8 - ; A[6] x A[3] - mulx rcx, rax, QWORD PTR [r9+48] - adcx rdi, rax - adox rsi, rcx - ; No store %r14 - %r9 - ; No load %rbx - %r9 - ; A[7] x A[3] - mulx rcx, rax, QWORD PTR [r9+56] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r8 - mov r10, QWORD PTR [r8] - ; A[8] x A[3] - mulx rcx, rax, QWORD PTR [r9+64] - adcx rbx, rax - adox r10, rcx - ; No store %rbx - %r9 - mov r11, QWORD PTR [r8+8] - ; A[9] x A[3] - mulx rcx, rax, QWORD PTR [r9+72] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8], r10 - mov r10, QWORD PTR [r8+16] - ; A[9] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, QWORD PTR [r9+72] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+8], r11 - mov r11, QWORD PTR [r8+24] - ; A[9] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+72] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+16], r10 - mov r10, QWORD PTR [r8+32] - ; A[9] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+72] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+24], r11 - ; Zero into %r9 - ; A[9] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx r11, rax, QWORD PTR [r9+72] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+32], r10 - ; Zero into %r8 - ; A[9] x A[8] - mov rdx, QWORD PTR [r9+64] - mulx r10, rax, QWORD PTR [r9+72] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+40], r11 - ; Carry - adcx r10, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+48], r10 - ; Diagonal 5 - ; No load %r15 - %r9 - ; A[5] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, QWORD PTR [r9+40] - adcx rdi, rax - adox rsi, rcx 
- ; No store %r14 - %r8 - ; No load %rbx - %r8 - ; A[6] x A[4] - mulx rcx, rax, QWORD PTR [r9+48] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r9 - mov r11, QWORD PTR [r8] - ; A[7] x A[4] - mulx rcx, rax, QWORD PTR [r9+56] - adcx rbx, rax - adox r11, rcx - ; No store %rbx - %r8 - mov r10, QWORD PTR [r8+8] - ; A[8] x A[4] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8], r11 - mov r11, QWORD PTR [r8+16] - ; A[8] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+8], r10 - mov r10, QWORD PTR [r8+24] - ; A[8] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+16], r11 - mov r11, QWORD PTR [r8+32] - ; A[8] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+24], r10 - mov r10, QWORD PTR [r8+40] - ; A[10] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+80] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+32], r11 - mov r11, QWORD PTR [r8+48] - ; A[10] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx rcx, rax, QWORD PTR [r9+80] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+40], r10 - ; Zero into %r8 - ; A[10] x A[8] - mov rdx, QWORD PTR [r9+64] - mulx r10, rax, QWORD PTR [r9+80] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+48], r11 - ; Zero into %r9 - ; A[10] x A[9] - mov rdx, QWORD PTR [r9+72] - mulx r11, rax, QWORD PTR [r9+80] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+56], r10 - ; Carry - adcx r11, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+64], r11 - ; Diagonal 6 - mov r10, QWORD PTR [r8] - ; A[6] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+48] - adcx rbx, rax - adox r10, rcx - ; No store %rbx - %r9 - mov r11, QWORD PTR [r8+8] - ; A[7] x A[5] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8], r10 - mov r10, QWORD PTR [r8+16] - ; A[7] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+8], r11 - mov r11, QWORD PTR [r8+24] - ; A[11] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+16], r10 - mov r10, QWORD PTR [r8+32] - ; A[11] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+24], r11 - mov r11, QWORD PTR [r8+40] - ; A[11] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+32], r10 - mov r10, QWORD PTR [r8+48] - ; A[11] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+40], r11 - mov r11, QWORD PTR [r8+56] - ; A[11] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+48], r10 - mov r10, QWORD PTR [r8+64] - ; A[11] x A[8] - mov rdx, QWORD PTR [r9+64] - mulx rcx, rax, QWORD PTR [r9+88] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+56], r11 - ; Zero into %r9 - ; A[11] x A[9] - mov rdx, QWORD PTR [r9+72] - mulx r11, rax, QWORD PTR [r9+88] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+64], r10 - ; Zero into %r8 - ; A[11] x A[10] - mov rdx, QWORD PTR [r9+80] - mulx r10, rax, QWORD PTR [r9+88] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+72], r11 - ; Carry - adcx r10, r13 - 
mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+80], r10 - mov QWORD PTR [r8+88], r13 - ; Double and Add in A[i] x A[i] - mov r11, QWORD PTR [rbp+8] - ; A[0] x A[0] - mov rdx, QWORD PTR [r9] - mulx rcx, rax, rdx - mov QWORD PTR [rbp], rax - adox r11, r11 - adcx r11, rcx - mov QWORD PTR [rbp+8], r11 - mov r10, QWORD PTR [rbp+16] - mov r11, QWORD PTR [rbp+24] - ; A[1] x A[1] - mov rdx, QWORD PTR [r9+8] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [rbp+16], r10 - mov QWORD PTR [rbp+24], r11 - mov r10, QWORD PTR [rbp+32] - mov r11, QWORD PTR [rbp+40] - ; A[2] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [rbp+32], r10 - mov QWORD PTR [rbp+40], r11 - mov r10, QWORD PTR [rbp+48] - ; A[3] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, rdx - adox r10, r10 - adox r14, r14 - adcx r10, rax - adcx r14, rcx - mov QWORD PTR [rbp+48], r10 - ; A[4] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, rdx - adox r15, r15 - adox rdi, rdi - adcx r15, rax - adcx rdi, rcx - ; A[5] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, rdx - adox rsi, rsi - adox rbx, rbx - adcx rsi, rax - adcx rbx, rcx - mov r10, QWORD PTR [r8] - mov r11, QWORD PTR [r8+8] - ; A[6] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8], r10 - mov QWORD PTR [r8+8], r11 - mov r10, QWORD PTR [r8+16] - mov r11, QWORD PTR [r8+24] - ; A[7] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+16], r10 - mov QWORD PTR [r8+24], r11 - mov r10, QWORD PTR [r8+32] - mov r11, QWORD PTR [r8+40] - ; A[8] x A[8] - mov rdx, QWORD PTR [r9+64] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+32], r10 - mov QWORD PTR [r8+40], r11 - mov r10, QWORD PTR [r8+48] - mov r11, QWORD PTR [r8+56] - ; A[9] x A[9] - mov rdx, QWORD PTR [r9+72] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+48], r10 - mov QWORD PTR [r8+56], r11 - mov r10, QWORD PTR [r8+64] - mov r11, QWORD PTR [r8+72] - ; A[10] x A[10] - mov rdx, QWORD PTR [r9+80] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+64], r10 - mov QWORD PTR [r8+72], r11 - mov r10, QWORD PTR [r8+80] - mov r11, QWORD PTR [r8+88] - ; A[11] x A[11] - mov rdx, QWORD PTR [r9+88] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+80], r10 - mov QWORD PTR [r8+88], r11 - mov QWORD PTR [r8+-40], r14 - mov QWORD PTR [r8+-32], r15 - mov QWORD PTR [r8+-24], rdi - mov QWORD PTR [r8+-16], rsi - mov QWORD PTR [r8+-8], rbx - sub r8, 96 - cmp r9, r8 - jne L_end_3072_sqr_avx2_12 - vmovdqu xmm0, OWORD PTR [rbp] - vmovups OWORD PTR [r8], xmm0 - vmovdqu xmm0, OWORD PTR [rbp+16] - vmovups OWORD PTR [r8+16], xmm0 - vmovdqu xmm0, OWORD PTR [rbp+32] - vmovups OWORD PTR [r8+32], xmm0 - mov rax, QWORD PTR [rbp+48] - mov QWORD PTR [r8+48], rax -L_end_3072_sqr_avx2_12: - add rsp, 96 - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - ret -sp_3072_sqr_avx2_12 ENDP -_text ENDS -ENDIF ; /* Add b to a into r. (r = a + b) ; * ; * r A single precision integer. @@ -16786,503 +15475,6 @@ ENDIF ret sp_3072_mul_24 ENDP _text ENDS -; /* Add a to a into r. 
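The removed sp_3072_sqr_avx2_12 above is a product-scanning square built on mulx/adcx/adox: each off-diagonal product A[i] x A[j] (i < j) is accumulated exactly once along the diagonals, the partial result is then doubled, and the diagonal squares A[i] x A[i] are folded in last. A minimal C sketch of that scheme follows; the function name, fixed word count and local buffer are illustrative and not the wolfSSL API (the local temporary plays a role similar in spirit to the stack buffer the assembly switches to when the output overlaps the input).

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

/* Sketch only: square a 12-word value into 24 words by accumulating
 * each cross product once, doubling, then adding the squares. */
static void sqr_12_sketch(uint64_t r[24], const uint64_t a[12])
{
    enum { N = 12 };
    uint64_t t[2 * N];
    memset(t, 0, sizeof(t));

    /* off-diagonal products a[i]*a[j], i < j, each added once */
    for (int i = 0; i < N; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < N; j++) {
            u128 p = (u128)a[i] * a[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        t[i + N] = carry;            /* column i+N is still untouched here */
    }

    /* double the accumulated cross products */
    uint64_t c = 0;
    for (int k = 0; k < 2 * N; k++) {
        uint64_t top = t[k] >> 63;
        t[k] = (t[k] << 1) | c;
        c = top;
    }

    /* add the squares a[i]*a[i] on the even columns */
    uint64_t carry = 0;
    for (int i = 0; i < N; i++) {
        u128 p = (u128)a[i] * a[i] + t[2 * i] + carry;
        t[2 * i] = (uint64_t)p;
        u128 q = (u128)t[2 * i + 1] + (uint64_t)(p >> 64);
        t[2 * i + 1] = (uint64_t)q;
        carry = (uint64_t)(q >> 64);
    }

    memcpy(r, t, sizeof(t));         /* safe even if r aliases a */
}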
(r = a + a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_dbl_12 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov QWORD PTR [rcx+88], r9 - adc rax, 0 - ret -sp_3072_dbl_12 ENDP -_text ENDS -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_24 PROC - push r12 - sub rsp, 504 - mov QWORD PTR [rsp+480], rcx - mov QWORD PTR [rsp+488], rdx - lea r10, QWORD PTR [rsp+384] - lea r11, QWORD PTR [rdx+96] - ; Add - mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] - mov r8, QWORD PTR [rdx+8] - mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] - mov rax, QWORD PTR [rdx+16] - mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] - mov r8, QWORD PTR [rdx+24] - mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] - mov rax, QWORD PTR [rdx+32] - mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] - mov r8, QWORD PTR [rdx+40] - mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] - mov r8, QWORD PTR [rdx+56] - mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] - mov rax, QWORD PTR [rdx+64] - mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] - mov r8, QWORD PTR [rdx+72] - mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] - mov rax, QWORD PTR [rdx+80] - mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] - mov r8, QWORD PTR [rdx+88] - mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] - mov QWORD PTR [r10+88], r8 - adc r9, 0 - mov QWORD PTR [rsp+496], r9 - mov rdx, r10 - mov rcx, rsp - call sp_3072_sqr_12 - mov rdx, QWORD PTR [rsp+488] - lea rcx, QWORD PTR [rsp+192] - add rdx, 96 - call sp_3072_sqr_12 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] - call sp_3072_sqr_12 -IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] -ENDIF - mov r12, QWORD PTR [rsp+496] - mov r11, rcx - lea r10, QWORD PTR [rsp+384] - mov r9, r12 - neg r12 - add r11, 192 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11], rax - mov QWORD PTR [r11+8], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+16], rax - mov QWORD PTR [r11+24], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+32], rax - mov QWORD PTR [r11+40], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+48], rax - mov QWORD PTR [r11+56], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD 
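sp_3072_dbl_12, removed in the hunk above, is simply r = a + a over twelve 64-bit words with the final carry returned in rax. A hedged C equivalent (name and signature are illustrative, not the wolfSSL API):

#include <stdint.h>

/* Sketch of r = a + a over 12 words, returning the carry out of the
 * top word, matching the add/adc chain and the final 'adc rax, 0'. */
static uint64_t dbl_12_sketch(uint64_t r[12], const uint64_t a[12])
{
    uint64_t carry = 0;
    for (int i = 0; i < 12; i++) {
        uint64_t w = a[i];
        r[i]  = (w << 1) | carry;    /* bring in the bit from the word below */
        carry = w >> 63;             /* bit shifted out of this word         */
    }
    return carry;
}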
PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+64], rax - mov QWORD PTR [r11+72], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+80], rax - mov QWORD PTR [r11+88], r8 - mov rax, QWORD PTR [r11] - add rax, rax - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, r8 - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, rax - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, r8 - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, rax - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, r8 - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, rax - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, r8 - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, rax - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, r8 - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, rax - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, r8 - mov QWORD PTR [r11+88], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+192] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR 
[rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - sub r11, 96 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD 
PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov QWORD PTR [r11+184], r8 - adc r9, 0 - mov QWORD PTR [rcx+288], r9 - ; Add in place - mov rax, QWORD PTR [r11+96] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [rdx+96] - mov QWORD PTR [r11+192], rax - ; Add to zero - mov rax, QWORD PTR [rdx+104] - adc rax, 0 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [r11+200], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r11+208], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [r11+216], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+136] - mov QWORD PTR [r11+224], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [r11+232], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [r11+240], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [r11+248], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [r11+256], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [r11+264], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [r11+272], r8 - adc rax, 0 - mov QWORD PTR [r11+280], rax - add rsp, 504 - pop r12 - ret -sp_3072_sqr_24 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Multiply a and b into r. (r = a * b) ; * @@ -17798,433 +15990,6 @@ ENDIF sp_3072_mul_avx2_24 ENDP _text ENDS ENDIF -IFDEF HAVE_INTEL_AVX2 -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. 
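sp_3072_sqr_24, removed above, squares a 24-word value with one level of Karatsuba: writing a = a1*B^12 + a0, it performs three 12-word squarings and combines them as a^2 = a1^2*B^24 + ((a0 + a1)^2 - a1^2 - a0^2)*B^12 + a0^2. The assembly keeps the carry of a0 + a1 in a register and patches it in later with a masked, doubled copy of the sum; the C sketch below instead keeps the sum in h+1 words to stay short. Sizes, names and helpers are illustrative, the sketch assumes r does not overlap a, and it is not the wolfSSL implementation.

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

/* schoolbook squaring used as the base case: r[0..2n) = a^2 */
static void sqr_school(uint64_t* r, const uint64_t* a, int n)
{
    memset(r, 0, (size_t)(2 * n) * sizeof(uint64_t));
    for (int i = 0; i < n; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < n; j++) {
            u128 p = (u128)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        r[i + n] = carry;
    }
}

/* r[0..n) = a + b, returns the carry out */
static uint64_t add_words(uint64_t* r, const uint64_t* a,
                          const uint64_t* b, int n)
{
    uint64_t carry = 0;
    for (int i = 0; i < n; i++) {
        u128 t = (u128)a[i] + b[i] + carry;
        r[i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    return carry;
}

/* r[0..nr) -= b[0..nb), nb <= nr; the overall value stays non-negative */
static void sub_words(uint64_t* r, int nr, const uint64_t* b, int nb)
{
    uint64_t borrow = 0;
    for (int i = 0; i < nr; i++) {
        u128 t = (u128)r[i] - (i < nb ? b[i] : 0) - borrow;
        r[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
}

/* One Karatsuba level: n even, r holds 2n words; buffers sized for n <= 48.
 * sp_3072_sqr_24 corresponds to n == 24 with the 12-word base squaring. */
static void sqr_karatsuba(uint64_t* r, const uint64_t* a, int n)
{
    int h = n / 2;
    uint64_t s[25];                      /* a0 + a1 in h+1 words          */
    uint64_t m[50];                      /* (a0 + a1)^2 in 2*(h+1) words  */

    s[h] = add_words(s, a, a + h, h);    /* keep the top carry as a word  */
    sqr_school(m, s, h + 1);             /* (a0 + a1)^2                   */
    sqr_school(r + 2 * h, a + h, h);     /* z2 = a1^2 into r[2h .. 4h)    */
    sqr_school(r, a, h);                 /* z0 = a0^2 into r[0 .. 2h)     */

    sub_words(m, 2 * h + 2, r + 2 * h, 2 * h);   /* m -= z2 */
    sub_words(m, 2 * h + 2, r, 2 * h);           /* m -= z0 */

    /* add the middle term m * B^h into the result */
    uint64_t carry = 0;
    int i;
    for (i = 0; i < 2 * h + 2; i++) {
        u128 t = (u128)r[h + i] + m[i] + carry;
        r[h + i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    for (; carry != 0 && h + i < 2 * n; i++) {   /* ripple any last carry */
        u128 t = (u128)r[h + i] + carry;
        r[h + i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
}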
-; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_avx2_24 PROC - push r12 - sub rsp, 504 - mov QWORD PTR [rsp+480], rcx - mov QWORD PTR [rsp+488], rdx - lea r10, QWORD PTR [rsp+384] - lea r11, QWORD PTR [rdx+96] - ; Add - mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] - mov r8, QWORD PTR [rdx+8] - mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] - mov rax, QWORD PTR [rdx+16] - mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] - mov r8, QWORD PTR [rdx+24] - mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] - mov rax, QWORD PTR [rdx+32] - mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] - mov r8, QWORD PTR [rdx+40] - mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] - mov r8, QWORD PTR [rdx+56] - mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] - mov rax, QWORD PTR [rdx+64] - mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] - mov r8, QWORD PTR [rdx+72] - mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] - mov rax, QWORD PTR [rdx+80] - mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] - mov r8, QWORD PTR [rdx+88] - mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] - mov QWORD PTR [r10+88], r8 - adc r9, 0 - mov QWORD PTR [rsp+496], r9 - mov rdx, r10 - mov rcx, rsp - call sp_3072_sqr_avx2_12 - mov rdx, QWORD PTR [rsp+488] - lea rcx, QWORD PTR [rsp+192] - add rdx, 96 - call sp_3072_sqr_avx2_12 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] - call sp_3072_sqr_avx2_12 -IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] -ENDIF - mov r12, QWORD PTR [rsp+496] - mov r11, rcx - lea r10, QWORD PTR [rsp+384] - mov r9, r12 - neg r12 - add r11, 192 - mov rax, QWORD PTR [r10] - pext rax, rax, r12 - add rax, rax - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r11], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r11+8], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r11+16], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r11+24], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r11+32], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r11+40], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r11+48], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r11+56], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r11+64], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r11+72], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r11+80], rax - pext r8, r8, r12 - adc r8, r8 - mov QWORD PTR [r11+88], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+192] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov 
QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD 
PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - sub r11, 96 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov QWORD PTR [r11+184], r8 - adc r9, 0 - mov QWORD PTR [rcx+288], r9 - ; Add in place - mov rax, QWORD PTR [r11+96] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [rdx+96] - mov QWORD PTR [r11+192], rax 
- ; Add to zero - mov rax, QWORD PTR [rdx+104] - adc rax, 0 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [r11+200], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r11+208], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [r11+216], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+136] - mov QWORD PTR [r11+224], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [r11+232], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [r11+240], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [r11+248], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [r11+256], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [r11+264], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [r11+272], r8 - adc rax, 0 - mov QWORD PTR [r11+280], rax - add rsp, 504 - pop r12 - ret -sp_3072_sqr_avx2_24 ENDP -_text ENDS -ENDIF ; /* Sub b from a into a. (a -= b) ; * ; * a A single precision integer and result. @@ -19556,935 +17321,6 @@ ENDIF ret sp_3072_mul_48 ENDP _text ENDS -; /* Add a to a into r. (r = a + a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_dbl_24 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r8 - adc r9, r9 - mov QWORD PTR [rcx+184], r9 - adc rax, 0 - ret -sp_3072_dbl_24 ENDP -_text ENDS -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. 
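sp_3072_sqr_avx2_24 above follows the same Karatsuba shape but handles the carry c of a0 + a1 without a branch: c is stretched into a 0/all-ones mask, and pext with that mask (identity for an all-ones mask, zero for a zero mask, so it acts as the AND used in the non-AVX2 version) selects the reduced sum, which the adc chain then doubles so the 2*c*s*B^12 and c*B^24 correction terms can be added in. A short, hedged C rendering of just that masked-double step (names are illustrative, not the wolfSSL API):

#include <stdint.h>

/* Sketch: given the reduced 12-word sum s and the carry c (0 or 1) of
 * a0 + a1, produce the masked, doubled copy that is added at an offset
 * of 12 words, and return the word that lands at offset 24 (carry + c). */
static uint64_t masked_double_12(uint64_t out[12], const uint64_t s[12],
                                 uint64_t c)
{
    uint64_t mask = (uint64_t)0 - c;   /* 'neg' in the assembly            */
    uint64_t carry = 0;
    for (int i = 0; i < 12; i++) {
        uint64_t w = s[i] & mask;      /* 'and' / 'pext' keeps s only if c */
        out[i] = (w << 1) | carry;     /* doubled via the adc chain        */
        carry  = w >> 63;
    }
    return carry + c;                  /* the c*B^24 term rides on top     */
}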
-; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_48 PROC - push r12 - sub rsp, 984 - mov QWORD PTR [rsp+960], rcx - mov QWORD PTR [rsp+968], rdx - lea r10, QWORD PTR [rsp+768] - lea r11, QWORD PTR [rdx+192] - ; Add - mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] - mov r8, QWORD PTR [rdx+8] - mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] - mov rax, QWORD PTR [rdx+16] - mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] - mov r8, QWORD PTR [rdx+24] - mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] - mov rax, QWORD PTR [rdx+32] - mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] - mov r8, QWORD PTR [rdx+40] - mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] - mov r8, QWORD PTR [rdx+56] - mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] - mov rax, QWORD PTR [rdx+64] - mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] - mov r8, QWORD PTR [rdx+72] - mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] - mov rax, QWORD PTR [rdx+80] - mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] - mov r8, QWORD PTR [rdx+88] - mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] - mov rax, QWORD PTR [rdx+96] - mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] - mov r8, QWORD PTR [rdx+104] - mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] - mov rax, QWORD PTR [rdx+112] - mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] - mov r8, QWORD PTR [rdx+120] - mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] - mov rax, QWORD PTR [rdx+128] - mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] - mov r8, QWORD PTR [rdx+136] - mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] - mov rax, QWORD PTR [rdx+144] - mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] - mov r8, QWORD PTR [rdx+152] - mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] - mov rax, QWORD PTR [rdx+160] - mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] - mov r8, QWORD PTR [rdx+168] - mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] - mov rax, QWORD PTR [rdx+176] - mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] - mov r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] - mov QWORD PTR [r10+184], r8 - adc r9, 0 - mov QWORD PTR [rsp+976], r9 - mov rdx, r10 - mov rcx, rsp - call sp_3072_sqr_24 - mov rdx, QWORD PTR [rsp+968] - lea rcx, QWORD PTR [rsp+384] - add rdx, 192 - call sp_3072_sqr_24 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] - call sp_3072_sqr_24 -IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] -ENDIF - mov r12, QWORD PTR [rsp+976] - mov r11, rcx - lea r10, QWORD PTR [rsp+768] - mov r9, r12 - neg r12 - add r11, 384 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11], rax - mov QWORD PTR [r11+8], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+16], rax - mov QWORD PTR [r11+24], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+32], rax - mov QWORD PTR [r11+40], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+48], rax - mov QWORD PTR [r11+56], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+64], rax - mov 
QWORD PTR [r11+72], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+80], rax - mov QWORD PTR [r11+88], r8 - mov rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r10+104] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+96], rax - mov QWORD PTR [r11+104], r8 - mov rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r10+120] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+112], rax - mov QWORD PTR [r11+120], r8 - mov rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r10+136] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+128], rax - mov QWORD PTR [r11+136], r8 - mov rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r10+152] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+144], rax - mov QWORD PTR [r11+152], r8 - mov rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r10+168] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+160], rax - mov QWORD PTR [r11+168], r8 - mov rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r10+184] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+176], rax - mov QWORD PTR [r11+184], r8 - mov rax, QWORD PTR [r11] - add rax, rax - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, r8 - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, rax - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, r8 - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, rax - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, r8 - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, rax - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, r8 - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, rax - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, r8 - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, rax - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, r8 - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, rax - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, r8 - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, rax - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, r8 - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, rax - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, r8 - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, rax - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, r8 - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, rax - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, r8 - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, rax - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, r8 - mov QWORD PTR [r11+184], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+384] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD 
PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD 
PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov QWORD PTR [r10+376], r8 - sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, 
QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov QWORD PTR [r10+376], r8 - sbb r9, 0 - sub r11, 192 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [r11+208] - mov QWORD PTR [r11+200], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [r10+224] - 
mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [r11+344] - mov QWORD PTR [r11+336], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [r10+376] - mov QWORD PTR [r11+376], r8 - adc r9, 0 - mov QWORD PTR [rcx+576], r9 - ; Add in place - mov rax, QWORD PTR [r11+192] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+208] - mov QWORD PTR [r11+200], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r11+344] - mov 
QWORD PTR [r11+336], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r11+384] - mov QWORD PTR [r11+376], r8 - adc rax, QWORD PTR [rdx+192] - mov QWORD PTR [r11+384], rax - ; Add to zero - mov rax, QWORD PTR [rdx+200] - adc rax, 0 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [r11+392], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+216] - mov QWORD PTR [r11+400], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [r11+408], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+232] - mov QWORD PTR [r11+416], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [r11+424], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+248] - mov QWORD PTR [r11+432], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+256] - mov QWORD PTR [r11+440], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+264] - mov QWORD PTR [r11+448], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [r11+456], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR [r11+464], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [r11+472], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [r11+480], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [r11+488], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [r11+496], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [r11+504], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [r11+512], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [r11+520], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [r11+528], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [r11+536], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [r11+544], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [r11+552], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [r11+560], r8 - adc rax, 0 - mov QWORD PTR [r11+568], rax - add rsp, 984 - pop r12 - ret -sp_3072_sqr_48 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Multiply a and b into r. (r = a * b) ; * @@ -21432,6 +18268,3170 @@ ENDIF sp_3072_mul_avx2_48 ENDP _text ENDS ENDIF +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. 
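The sp_3072_sqr_12 introduced here squares column by column: each off-diagonal product A[i]*A[j] (i < j) is formed once and accumulated twice, each diagonal A[i]*A[i] once, so the 12-word square needs 78 widening multiplies instead of the 144 a general 12x12 multiply would use. A portable C sketch of the same column ordering, assuming a 64-bit sp_digit; the typedefs and the function name are illustrative, not part of the patch:

#include <stdint.h>

typedef uint64_t sp_digit;
typedef unsigned __int128 sp_uint128;

/* r[0..2n-1] = a[0..n-1]^2, column by column.  lo/hi/ov mirror the
 * three accumulator registers the assembly rotates after storing each
 * column; cross products are added twice, diagonals once. */
static void sp_sqr_sketch(sp_digit* r, const sp_digit* a, int n)
{
    sp_digit lo = 0, hi = 0, ov = 0;
    int i, k;

    for (k = 0; k <= 2 * (n - 1); k++) {
        for (i = (k < n) ? 0 : (k - n + 1); i <= k / 2; i++) {
            sp_uint128 p = (sp_uint128)a[i] * a[k - i];
            int reps = (i != k - i) ? 2 : 1;     /* cross terms count twice */
            while (reps-- > 0) {
                sp_uint128 t = (sp_uint128)lo + (sp_digit)p;
                lo  = (sp_digit)t;
                t   = (sp_uint128)hi + (sp_digit)(p >> 64) + (sp_digit)(t >> 64);
                hi  = (sp_digit)t;
                ov += (sp_digit)(t >> 64);
            }
        }
        r[k] = lo;                                /* store column k         */
        lo = hi; hi = ov; ov = 0;                 /* rotate the accumulator */
    }
    r[2 * n - 1] = lo;
}

The hand-written routine goes one step further for the wider columns: the cross products of a column are first collected in r12/r13/r14 and that partial sum is doubled once before the diagonal term is folded in, so each cross product only has to be added once.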
+; */ +_text SEGMENT READONLY PARA +sp_3072_sqr_12 PROC + push r12 + push r13 + push r14 + mov r8, rdx + sub rsp, 96 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + xor r11, r11 + mov QWORD PTR [rsp], rax + mov r10, rdx + ; A[0] * A[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD PTR [rsp+8], r10 + ; A[0] * A[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8] + xor r10, r10 + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r11, rax + adc r9, rdx + adc r10, 0 + mov QWORD PTR [rsp+16], r11 + ; A[0] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8] + xor r11, r11 + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + ; A[1] * A[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+8] + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + mov QWORD PTR [rsp+24], r9 + ; A[0] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8+8] + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD PTR [rsp+32], r10 + ; A[0] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+40], r11 + ; A[0] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+48], r9 + ; A[0] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[4] + mov rax, QWORD PTR [r8+32] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rsp+56], r10 + ; A[0] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx 
+ adc r14, 0 + ; A[4] * A[4] + mov rax, QWORD PTR [r8+32] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+64], r11 + ; A[0] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[5] + mov rax, QWORD PTR [r8+40] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rsp+72], r9 + ; A[0] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[5] + mov rax, QWORD PTR [r8+40] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rsp+80], r10 + ; A[0] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[1] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[6] + mov rax, QWORD PTR [r8+48] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rsp+88], r11 + ; A[1] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+8] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[2] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[4] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[6] + mov rax, QWORD PTR [r8+48] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rcx+96], r9 + ; A[2] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+16] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[3] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; 
A[4] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[7] + mov rax, QWORD PTR [r8+56] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rcx+104], r10 + ; A[3] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+24] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[4] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+32] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[5] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[7] + mov rax, QWORD PTR [r8+56] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rcx+112], r11 + ; A[4] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+32] + xor r11, r11 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[5] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[6] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[8] + mov rax, QWORD PTR [r8+64] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r9, r12 + adc r10, r13 + adc r11, r14 + mov QWORD PTR [rcx+120], r9 + ; A[5] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+40] + xor r9, r9 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[6] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[7] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[8] + mov rax, QWORD PTR [r8+64] + mul rax + add r12, r12 + adc r13, r13 + adc r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + add r10, r12 + adc r11, r13 + adc r9, r14 + mov QWORD PTR [rcx+128], r10 + ; A[6] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+48] + xor r10, r10 + xor r14, r14 + mov r12, rax + mov r13, rdx + ; A[7] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[8] * A[9] + mov rax, QWORD PTR [r8+72] + mul QWORD PTR [r8+64] + add r12, rax + adc r13, rdx + adc r14, 0 + add r12, r12 + adc r13, r13 + adc r14, r14 + add r11, r12 + adc r9, r13 + adc r10, r14 + mov QWORD PTR [rcx+136], r11 + ; A[7] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+56] + xor r11, r11 + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + ; A[8] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+64] + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + ; A[9] * A[9] + mov rax, QWORD PTR [r8+72] + mul rax + add r9, rax + adc r10, rdx + adc r11, 0 + mov QWORD PTR [rcx+144], r9 + ; A[8] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+64] + xor r9, r9 + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + ; A[9] * A[10] + mov rax, QWORD PTR [r8+80] + mul QWORD PTR [r8+72] + add r10, rax + adc r11, rdx + adc r9, 0 + add r10, rax + adc r11, rdx + adc r9, 0 + mov QWORD 
PTR [rcx+152], r10 + ; A[9] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+72] + xor r10, r10 + add r11, rax + adc r9, rdx + adc r10, 0 + add r11, rax + adc r9, rdx + adc r10, 0 + ; A[10] * A[10] + mov rax, QWORD PTR [r8+80] + mul rax + add r11, rax + adc r9, rdx + adc r10, 0 + mov QWORD PTR [rcx+160], r11 + ; A[10] * A[11] + mov rax, QWORD PTR [r8+88] + mul QWORD PTR [r8+80] + xor r11, r11 + add r9, rax + adc r10, rdx + adc r11, 0 + add r9, rax + adc r10, rdx + adc r11, 0 + mov QWORD PTR [rcx+168], r9 + ; A[11] * A[11] + mov rax, QWORD PTR [r8+88] + mul rax + add r10, rax + adc r11, rdx + mov QWORD PTR [rcx+176], r10 + mov QWORD PTR [rcx+184], r11 + mov rax, QWORD PTR [rsp] + mov rdx, QWORD PTR [rsp+8] + mov r12, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp+24] + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], rdx + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+24], r13 + mov rax, QWORD PTR [rsp+32] + mov rdx, QWORD PTR [rsp+40] + mov r12, QWORD PTR [rsp+48] + mov r13, QWORD PTR [rsp+56] + mov QWORD PTR [rcx+32], rax + mov QWORD PTR [rcx+40], rdx + mov QWORD PTR [rcx+48], r12 + mov QWORD PTR [rcx+56], r13 + mov rax, QWORD PTR [rsp+64] + mov rdx, QWORD PTR [rsp+72] + mov r12, QWORD PTR [rsp+80] + mov r13, QWORD PTR [rsp+88] + mov QWORD PTR [rcx+64], rax + mov QWORD PTR [rcx+72], rdx + mov QWORD PTR [rcx+80], r12 + mov QWORD PTR [rcx+88], r13 + add rsp, 96 + pop r14 + pop r13 + pop r12 + ret +sp_3072_sqr_12 ENDP +_text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_3072_sqr_avx2_12 PROC + push rbp + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r8, rcx + mov r9, rdx + sub rsp, 96 + cmp r9, r8 + mov rbp, rsp + cmovne rbp, r8 + add r8, 96 + xor r12, r12 + ; Diagonal 1 + ; Zero into %r9 + ; A[1] x A[0] + mov rdx, QWORD PTR [r9] + mulx r11, r10, QWORD PTR [r9+8] + mov QWORD PTR [rbp+8], r10 + ; Zero into %r8 + ; A[2] x A[0] + mulx r10, rax, QWORD PTR [r9+16] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [rbp+16], r11 + ; Zero into %r9 + ; A[3] x A[0] + mulx r11, rax, QWORD PTR [r9+24] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [rbp+24], r10 + ; Zero into %r8 + ; A[4] x A[0] + mulx r10, rax, QWORD PTR [r9+32] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [rbp+32], r11 + ; Zero into %r9 + ; A[5] x A[0] + mulx r11, rax, QWORD PTR [r9+40] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [rbp+40], r10 + ; No load %r12 - %r8 + ; A[6] x A[0] + mulx r14, rax, QWORD PTR [r9+48] + adcx r11, rax + adox r14, r12 + mov QWORD PTR [rbp+48], r11 + ; No load %r13 - %r9 + ; A[7] x A[0] + mulx r15, rax, QWORD PTR [r9+56] + adcx r14, rax + adox r15, r12 + ; No store %r12 - %r8 + ; No load %r14 - %r8 + ; A[8] x A[0] + mulx rdi, rax, QWORD PTR [r9+64] + adcx r15, rax + adox rdi, r12 + ; No store %r13 - %r9 + ; No load %r15 - %r9 + ; A[9] x A[0] + mulx rsi, rax, QWORD PTR [r9+72] + adcx rdi, rax + adox rsi, r12 + ; No store %r14 - %r8 + ; No load %rbx - %r8 + ; A[10] x A[0] + mulx rbx, rax, QWORD PTR [r9+80] + adcx rsi, rax + adox rbx, r12 + ; No store %r15 - %r9 + ; Zero into %r9 + ; A[11] x A[0] + mulx r11, rax, QWORD PTR [r9+88] + adcx rbx, rax + adox r11, r12 + ; No store %rbx - %r8 + ; Carry + adcx r11, r12 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8], r11 + ; Diagonal 2 + mov r11, QWORD PTR [rbp+24] + mov r10, QWORD PTR [rbp+32] + ; A[2] x A[1] + mov rdx, QWORD PTR [r9+8] + mulx rcx, rax, 
QWORD PTR [r9+16] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [rbp+24], r11 + mov r11, QWORD PTR [rbp+40] + ; A[3] x A[1] + mulx rcx, rax, QWORD PTR [r9+24] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [rbp+32], r10 + mov r10, QWORD PTR [rbp+48] + ; A[4] x A[1] + mulx rcx, rax, QWORD PTR [r9+32] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [rbp+40], r11 + ; No load %r12 - %r9 + ; A[5] x A[1] + mulx rcx, rax, QWORD PTR [r9+40] + adcx r10, rax + adox r14, rcx + mov QWORD PTR [rbp+48], r10 + ; No load %r13 - %r8 + ; A[6] x A[1] + mulx rcx, rax, QWORD PTR [r9+48] + adcx r14, rax + adox r15, rcx + ; No store %r12 - %r9 + ; No load %r14 - %r9 + ; A[7] x A[1] + mulx rcx, rax, QWORD PTR [r9+56] + adcx r15, rax + adox rdi, rcx + ; No store %r13 - %r8 + ; No load %r15 - %r8 + ; A[8] x A[1] + mulx rcx, rax, QWORD PTR [r9+64] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r9 + ; No load %rbx - %r9 + ; A[9] x A[1] + mulx rcx, rax, QWORD PTR [r9+72] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r8 + mov r10, QWORD PTR [r8] + ; A[10] x A[1] + mulx rcx, rax, QWORD PTR [r9+80] + adcx rbx, rax + adox r10, rcx + ; No store %rbx - %r9 + ; Zero into %r9 + ; A[11] x A[1] + mulx r11, rax, QWORD PTR [r9+88] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8], r10 + ; Zero into %r8 + ; A[11] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx r10, rax, QWORD PTR [r9+88] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+8], r11 + ; Carry + adcx r10, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+16], r10 + ; Diagonal 3 + mov r10, QWORD PTR [rbp+40] + mov r11, QWORD PTR [rbp+48] + ; A[3] x A[2] + mulx rcx, rax, QWORD PTR [r9+24] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [rbp+40], r10 + ; No load %r12 - %r8 + ; A[4] x A[2] + mulx rcx, rax, QWORD PTR [r9+32] + adcx r11, rax + adox r14, rcx + mov QWORD PTR [rbp+48], r11 + ; No load %r13 - %r9 + ; A[5] x A[2] + mulx rcx, rax, QWORD PTR [r9+40] + adcx r14, rax + adox r15, rcx + ; No store %r12 - %r8 + ; No load %r14 - %r8 + ; A[6] x A[2] + mulx rcx, rax, QWORD PTR [r9+48] + adcx r15, rax + adox rdi, rcx + ; No store %r13 - %r9 + ; No load %r15 - %r9 + ; A[7] x A[2] + mulx rcx, rax, QWORD PTR [r9+56] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r8 + ; No load %rbx - %r8 + ; A[8] x A[2] + mulx rcx, rax, QWORD PTR [r9+64] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r9 + mov r11, QWORD PTR [r8] + ; A[9] x A[2] + mulx rcx, rax, QWORD PTR [r9+72] + adcx rbx, rax + adox r11, rcx + ; No store %rbx - %r8 + mov r10, QWORD PTR [r8+8] + ; A[10] x A[2] + mulx rcx, rax, QWORD PTR [r9+80] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8], r11 + mov r11, QWORD PTR [r8+16] + ; A[10] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+80] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+8], r10 + ; Zero into %r8 + ; A[10] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx r10, rax, QWORD PTR [r9+80] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+16], r11 + ; Zero into %r9 + ; A[10] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx r11, rax, QWORD PTR [r9+80] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+24], r10 + ; Carry + adcx r11, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+32], r11 + ; Diagonal 4 + ; No load %r13 - %r8 + ; A[4] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+32] + adcx r14, rax + adox r15, rcx + ; No store %r12 - %r9 + ; No load %r14 - %r9 + ; A[5] x A[3] + mulx rcx, rax, QWORD PTR [r9+40] + adcx r15, rax + adox rdi, rcx + ; No store %r13 - 
%r8 + ; No load %r15 - %r8 + ; A[6] x A[3] + mulx rcx, rax, QWORD PTR [r9+48] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r9 + ; No load %rbx - %r9 + ; A[7] x A[3] + mulx rcx, rax, QWORD PTR [r9+56] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r8 + mov r10, QWORD PTR [r8] + ; A[8] x A[3] + mulx rcx, rax, QWORD PTR [r9+64] + adcx rbx, rax + adox r10, rcx + ; No store %rbx - %r9 + mov r11, QWORD PTR [r8+8] + ; A[9] x A[3] + mulx rcx, rax, QWORD PTR [r9+72] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8], r10 + mov r10, QWORD PTR [r8+16] + ; A[9] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, QWORD PTR [r9+72] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+8], r11 + mov r11, QWORD PTR [r8+24] + ; A[9] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+72] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+16], r10 + mov r10, QWORD PTR [r8+32] + ; A[9] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+72] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+24], r11 + ; Zero into %r9 + ; A[9] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx r11, rax, QWORD PTR [r9+72] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+32], r10 + ; Zero into %r8 + ; A[9] x A[8] + mov rdx, QWORD PTR [r9+64] + mulx r10, rax, QWORD PTR [r9+72] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+40], r11 + ; Carry + adcx r10, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+48], r10 + ; Diagonal 5 + ; No load %r15 - %r9 + ; A[5] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, QWORD PTR [r9+40] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r8 + ; No load %rbx - %r8 + ; A[6] x A[4] + mulx rcx, rax, QWORD PTR [r9+48] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r9 + mov r11, QWORD PTR [r8] + ; A[7] x A[4] + mulx rcx, rax, QWORD PTR [r9+56] + adcx rbx, rax + adox r11, rcx + ; No store %rbx - %r8 + mov r10, QWORD PTR [r8+8] + ; A[8] x A[4] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8], r11 + mov r11, QWORD PTR [r8+16] + ; A[8] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+8], r10 + mov r10, QWORD PTR [r8+24] + ; A[8] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+16], r11 + mov r11, QWORD PTR [r8+32] + ; A[8] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+24], r10 + mov r10, QWORD PTR [r8+40] + ; A[10] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+80] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+32], r11 + mov r11, QWORD PTR [r8+48] + ; A[10] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx rcx, rax, QWORD PTR [r9+80] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+40], r10 + ; Zero into %r8 + ; A[10] x A[8] + mov rdx, QWORD PTR [r9+64] + mulx r10, rax, QWORD PTR [r9+80] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+48], r11 + ; Zero into %r9 + ; A[10] x A[9] + mov rdx, QWORD PTR [r9+72] + mulx r11, rax, QWORD PTR [r9+80] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+56], r10 + ; Carry + adcx r11, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+64], r11 + ; Diagonal 6 + mov r10, QWORD PTR [r8] + ; A[6] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+48] + adcx rbx, rax + adox r10, rcx + ; No store %rbx - %r9 + mov r11, QWORD PTR [r8+8] + ; A[7] x A[5] + mulx rcx, rax, QWORD 
PTR [r9+56] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8], r10 + mov r10, QWORD PTR [r8+16] + ; A[7] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+56] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+8], r11 + mov r11, QWORD PTR [r8+24] + ; A[11] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+16], r10 + mov r10, QWORD PTR [r8+32] + ; A[11] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+24], r11 + mov r11, QWORD PTR [r8+40] + ; A[11] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+32], r10 + mov r10, QWORD PTR [r8+48] + ; A[11] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+40], r11 + mov r11, QWORD PTR [r8+56] + ; A[11] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+48], r10 + mov r10, QWORD PTR [r8+64] + ; A[11] x A[8] + mov rdx, QWORD PTR [r9+64] + mulx rcx, rax, QWORD PTR [r9+88] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+56], r11 + ; Zero into %r9 + ; A[11] x A[9] + mov rdx, QWORD PTR [r9+72] + mulx r11, rax, QWORD PTR [r9+88] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+64], r10 + ; Zero into %r8 + ; A[11] x A[10] + mov rdx, QWORD PTR [r9+80] + mulx r10, rax, QWORD PTR [r9+88] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+72], r11 + ; Carry + adcx r10, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+80], r10 + mov QWORD PTR [r8+88], r13 + ; Double and Add in A[i] x A[i] + mov r11, QWORD PTR [rbp+8] + ; A[0] x A[0] + mov rdx, QWORD PTR [r9] + mulx rcx, rax, rdx + mov QWORD PTR [rbp], rax + adox r11, r11 + adcx r11, rcx + mov QWORD PTR [rbp+8], r11 + mov r10, QWORD PTR [rbp+16] + mov r11, QWORD PTR [rbp+24] + ; A[1] x A[1] + mov rdx, QWORD PTR [r9+8] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [rbp+16], r10 + mov QWORD PTR [rbp+24], r11 + mov r10, QWORD PTR [rbp+32] + mov r11, QWORD PTR [rbp+40] + ; A[2] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [rbp+32], r10 + mov QWORD PTR [rbp+40], r11 + mov r10, QWORD PTR [rbp+48] + ; A[3] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, rdx + adox r10, r10 + adox r14, r14 + adcx r10, rax + adcx r14, rcx + mov QWORD PTR [rbp+48], r10 + ; A[4] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, rdx + adox r15, r15 + adox rdi, rdi + adcx r15, rax + adcx rdi, rcx + ; A[5] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, rdx + adox rsi, rsi + adox rbx, rbx + adcx rsi, rax + adcx rbx, rcx + mov r10, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + ; A[6] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov r10, QWORD PTR [r8+16] + mov r11, QWORD PTR [r8+24] + ; A[7] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+16], r10 + mov QWORD PTR [r8+24], r11 + mov r10, QWORD PTR [r8+32] + mov r11, QWORD PTR [r8+40] + ; A[8] x A[8] + mov rdx, QWORD PTR [r9+64] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + 
mov QWORD PTR [r8+32], r10 + mov QWORD PTR [r8+40], r11 + mov r10, QWORD PTR [r8+48] + mov r11, QWORD PTR [r8+56] + ; A[9] x A[9] + mov rdx, QWORD PTR [r9+72] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+48], r10 + mov QWORD PTR [r8+56], r11 + mov r10, QWORD PTR [r8+64] + mov r11, QWORD PTR [r8+72] + ; A[10] x A[10] + mov rdx, QWORD PTR [r9+80] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+64], r10 + mov QWORD PTR [r8+72], r11 + mov r10, QWORD PTR [r8+80] + mov r11, QWORD PTR [r8+88] + ; A[11] x A[11] + mov rdx, QWORD PTR [r9+88] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+80], r10 + mov QWORD PTR [r8+88], r11 + mov QWORD PTR [r8+-40], r14 + mov QWORD PTR [r8+-32], r15 + mov QWORD PTR [r8+-24], rdi + mov QWORD PTR [r8+-16], rsi + mov QWORD PTR [r8+-8], rbx + sub r8, 96 + cmp r9, r8 + jne L_end_3072_sqr_avx2_12 + vmovdqu xmm0, OWORD PTR [rbp] + vmovups OWORD PTR [r8], xmm0 + vmovdqu xmm0, OWORD PTR [rbp+16] + vmovups OWORD PTR [r8+16], xmm0 + vmovdqu xmm0, OWORD PTR [rbp+32] + vmovups OWORD PTR [r8+32], xmm0 + mov rax, QWORD PTR [rbp+48] + mov QWORD PTR [r8+48], rax +L_end_3072_sqr_avx2_12: + add rsp, 96 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + ret +sp_3072_sqr_avx2_12 ENDP +_text ENDS +ENDIF +; /* Add a to a into r. (r = a + a) +; * +; * r A single precision integer. +; * a A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_3072_dbl_12 PROC + mov r8, QWORD PTR [rdx] + xor rax, rax + add r8, r8 + mov r9, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+24] + mov QWORD PTR [rcx+16], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+32] + mov QWORD PTR [rcx+24], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+72] + mov QWORD PTR [rcx+64], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+80] + mov QWORD PTR [rcx+72], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r8 + adc r9, r9 + mov QWORD PTR [rcx+88], r9 + adc rax, 0 + ret +sp_3072_dbl_12 ENDP +_text ENDS +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. 
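sp_3072_sqr_avx2_12 above swaps mul/adc for mulx with the ADX pair adcx/adox: mulx leaves the flags untouched, adcx carries through CF and adox through OF, so the low and high halves of a diagonal's products ride on two independent carry chains and can be interleaved. The intrinsics below (<immintrin.h>, BMI2/ADX, e.g. built with -mbmi2 -madx) show the same building blocks for one multiply-accumulate row; this is an illustration rather than code from the patch, and a compiler is not obliged to keep the two chains on separate flags.

#include <immintrin.h>   /* _mulx_u64, _addcarryx_u64 */

/* r[0..n] += a[0..n-1] * b.  c_lo chains the product low halves (the
 * assembly's adcx/CF chain), c_hi the high halves (the adox/OF chain);
 * each high half lands one limb further up. */
static void mul_add_row_adx(unsigned long long* r,
                            const unsigned long long* a,
                            unsigned long long b, int n)
{
    unsigned char c_lo = 0;
    unsigned char c_hi = 0;
    unsigned long long hi_prev = 0;
    int i;

    for (i = 0; i < n; i++) {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64(a[i], b, &hi);
        c_lo = _addcarryx_u64(c_lo, r[i], lo, &r[i]);
        c_hi = _addcarryx_u64(c_hi, r[i], hi_prev, &r[i]);
        hi_prev = hi;
    }
    c_lo = _addcarryx_u64(c_lo, r[n], hi_prev, &r[n]);
    (void)_addcarryx_u64(c_hi, r[n], 0, &r[n]);
    /* any carry out of r[n] is propagated by the caller in the full
     * routines */
}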
+; */ +_text SEGMENT READONLY PARA +sp_3072_sqr_24 PROC + push r12 + sub rsp, 504 + mov QWORD PTR [rsp+480], rcx + mov QWORD PTR [rsp+488], rdx + lea r10, QWORD PTR [rsp+384] + lea r11, QWORD PTR [rdx+96] + ; Add + mov rax, QWORD PTR [rdx] + xor r9, r9 + add rax, QWORD PTR [r11] + mov r8, QWORD PTR [rdx+8] + mov QWORD PTR [r10], rax + adc r8, QWORD PTR [r11+8] + mov rax, QWORD PTR [rdx+16] + mov QWORD PTR [r10+8], r8 + adc rax, QWORD PTR [r11+16] + mov r8, QWORD PTR [rdx+24] + mov QWORD PTR [r10+16], rax + adc r8, QWORD PTR [r11+24] + mov rax, QWORD PTR [rdx+32] + mov QWORD PTR [r10+24], r8 + adc rax, QWORD PTR [r11+32] + mov r8, QWORD PTR [rdx+40] + mov QWORD PTR [r10+32], rax + adc r8, QWORD PTR [r11+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r10+40], r8 + adc rax, QWORD PTR [r11+48] + mov r8, QWORD PTR [rdx+56] + mov QWORD PTR [r10+48], rax + adc r8, QWORD PTR [r11+56] + mov rax, QWORD PTR [rdx+64] + mov QWORD PTR [r10+56], r8 + adc rax, QWORD PTR [r11+64] + mov r8, QWORD PTR [rdx+72] + mov QWORD PTR [r10+64], rax + adc r8, QWORD PTR [r11+72] + mov rax, QWORD PTR [rdx+80] + mov QWORD PTR [r10+72], r8 + adc rax, QWORD PTR [r11+80] + mov r8, QWORD PTR [rdx+88] + mov QWORD PTR [r10+80], rax + adc r8, QWORD PTR [r11+88] + mov QWORD PTR [r10+88], r8 + adc r9, 0 + mov QWORD PTR [rsp+496], r9 + mov rdx, r10 + mov rcx, rsp + call sp_3072_sqr_12 + mov rdx, QWORD PTR [rsp+488] + lea rcx, QWORD PTR [rsp+192] + add rdx, 96 + call sp_3072_sqr_12 + mov rdx, QWORD PTR [rsp+488] + mov rcx, QWORD PTR [rsp+480] + call sp_3072_sqr_12 +IFDEF _WIN64 + mov rdx, QWORD PTR [rsp+488] + mov rcx, QWORD PTR [rsp+480] +ENDIF + mov r12, QWORD PTR [rsp+496] + mov r11, rcx + lea r10, QWORD PTR [rsp+384] + mov r9, r12 + neg r12 + add r11, 192 + mov rax, QWORD PTR [r10] + mov r8, QWORD PTR [r10+8] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11], rax + mov QWORD PTR [r11+8], r8 + mov rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r10+24] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+16], rax + mov QWORD PTR [r11+24], r8 + mov rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r10+40] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+32], rax + mov QWORD PTR [r11+40], r8 + mov rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r10+56] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+48], rax + mov QWORD PTR [r11+56], r8 + mov rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r10+72] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+64], rax + mov QWORD PTR [r11+72], r8 + mov rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r10+88] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+80], rax + mov QWORD PTR [r11+88], r8 + mov rax, QWORD PTR [r11] + add rax, rax + mov r8, QWORD PTR [r11+8] + mov QWORD PTR [r11], rax + adc r8, r8 + mov rax, QWORD PTR [r11+16] + mov QWORD PTR [r11+8], r8 + adc rax, rax + mov r8, QWORD PTR [r11+24] + mov QWORD PTR [r11+16], rax + adc r8, r8 + mov rax, QWORD PTR [r11+32] + mov QWORD PTR [r11+24], r8 + adc rax, rax + mov r8, QWORD PTR [r11+40] + mov QWORD PTR [r11+32], rax + adc r8, r8 + mov rax, QWORD PTR [r11+48] + mov QWORD PTR [r11+40], r8 + adc rax, rax + mov r8, QWORD PTR [r11+56] + mov QWORD PTR [r11+48], rax + adc r8, r8 + mov rax, QWORD PTR [r11+64] + mov QWORD PTR [r11+56], r8 + adc rax, rax + mov r8, QWORD PTR [r11+72] + mov QWORD PTR [r11+64], rax + adc r8, r8 + mov rax, QWORD PTR [r11+80] + mov QWORD PTR [r11+72], r8 + adc rax, rax + mov r8, QWORD PTR [r11+88] + mov QWORD PTR [r11+80], rax + adc r8, r8 + mov QWORD PTR [r11+88], r8 + adc r9, 0 + lea rdx, QWORD PTR [rsp+192] + mov r10, rsp + 
mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rcx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rcx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rcx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rcx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rcx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rcx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rcx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rcx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rcx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rcx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rcx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rcx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rcx+120] + mov rax, QWORD 
PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rcx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rcx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rcx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rcx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rcx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rcx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rcx+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + sub r11, 96 + ; Add in place + mov rax, QWORD PTR [r11] + add rax, QWORD PTR [r10] + mov r8, QWORD PTR [r11+8] + mov QWORD PTR [r11], rax + adc r8, QWORD PTR [r10+8] + mov rax, QWORD PTR [r11+16] + mov QWORD PTR [r11+8], r8 + adc rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r11+24] + mov QWORD PTR [r11+16], rax + adc r8, QWORD PTR [r10+24] + mov rax, QWORD PTR [r11+32] + mov QWORD PTR [r11+24], r8 + adc rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r11+40] + mov QWORD PTR [r11+32], rax + adc r8, QWORD PTR [r10+40] + mov rax, QWORD PTR [r11+48] + mov QWORD PTR [r11+40], r8 + adc rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r11+56] + mov QWORD PTR [r11+48], rax + adc r8, QWORD PTR [r10+56] + mov rax, QWORD PTR [r11+64] + mov QWORD PTR [r11+56], r8 + adc rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r11+72] + mov QWORD PTR [r11+64], rax + adc r8, QWORD PTR [r10+72] + mov rax, QWORD PTR [r11+80] + mov QWORD PTR [r11+72], r8 + adc rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r11+88] + mov QWORD PTR [r11+80], rax + adc r8, QWORD PTR [r10+88] + mov rax, QWORD PTR [r11+96] + mov QWORD PTR [r11+88], r8 + adc rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, QWORD PTR [r10+104] + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, QWORD PTR [r10+120] + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, QWORD PTR [r10+136] + mov rax, QWORD PTR [r11+144] + mov QWORD PTR [r11+136], r8 + adc rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc r8, QWORD PTR [r10+152] + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, QWORD PTR [r10+168] + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, QWORD PTR [r10+184] + mov QWORD PTR [r11+184], r8 + adc r9, 0 + mov QWORD PTR [rcx+288], r9 + ; Add in place + mov rax, QWORD PTR [r11+96] + add rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r11+144] + mov QWORD PTR 
[r11+136], r8 + adc rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r11+192] + mov QWORD PTR [r11+184], r8 + adc rax, QWORD PTR [rdx+96] + mov QWORD PTR [r11+192], rax + ; Add to zero + mov rax, QWORD PTR [rdx+104] + adc rax, 0 + mov r8, QWORD PTR [rdx+112] + mov QWORD PTR [r11+200], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+120] + mov QWORD PTR [r11+208], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+128] + mov QWORD PTR [r11+216], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+136] + mov QWORD PTR [r11+224], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+144] + mov QWORD PTR [r11+232], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+152] + mov QWORD PTR [r11+240], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+160] + mov QWORD PTR [r11+248], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+168] + mov QWORD PTR [r11+256], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+176] + mov QWORD PTR [r11+264], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+184] + mov QWORD PTR [r11+272], r8 + adc rax, 0 + mov QWORD PTR [r11+280], rax + add rsp, 504 + pop r12 + ret +sp_3072_sqr_24 ENDP +_text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_3072_sqr_avx2_24 PROC + push r12 + sub rsp, 504 + mov QWORD PTR [rsp+480], rcx + mov QWORD PTR [rsp+488], rdx + lea r10, QWORD PTR [rsp+384] + lea r11, QWORD PTR [rdx+96] + ; Add + mov rax, QWORD PTR [rdx] + xor r9, r9 + add rax, QWORD PTR [r11] + mov r8, QWORD PTR [rdx+8] + mov QWORD PTR [r10], rax + adc r8, QWORD PTR [r11+8] + mov rax, QWORD PTR [rdx+16] + mov QWORD PTR [r10+8], r8 + adc rax, QWORD PTR [r11+16] + mov r8, QWORD PTR [rdx+24] + mov QWORD PTR [r10+16], rax + adc r8, QWORD PTR [r11+24] + mov rax, QWORD PTR [rdx+32] + mov QWORD PTR [r10+24], r8 + adc rax, QWORD PTR [r11+32] + mov r8, QWORD PTR [rdx+40] + mov QWORD PTR [r10+32], rax + adc r8, QWORD PTR [r11+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r10+40], r8 + adc rax, QWORD PTR [r11+48] + mov r8, QWORD PTR [rdx+56] + mov QWORD PTR [r10+48], rax + adc r8, QWORD PTR [r11+56] + mov rax, QWORD PTR [rdx+64] + mov QWORD PTR [r10+56], r8 + adc rax, QWORD PTR [r11+64] + mov r8, QWORD PTR [rdx+72] + mov QWORD PTR [r10+64], rax + adc r8, QWORD PTR [r11+72] + mov rax, QWORD PTR [rdx+80] + mov QWORD PTR [r10+72], r8 + adc rax, QWORD PTR [r11+80] + mov r8, QWORD PTR [rdx+88] + mov QWORD PTR [r10+80], rax + adc r8, QWORD PTR [r11+88] + mov QWORD PTR [r10+88], r8 + adc r9, 0 + mov QWORD PTR [rsp+496], r9 + mov rdx, r10 + mov rcx, rsp + call sp_3072_sqr_avx2_12 + mov rdx, QWORD PTR [rsp+488] + lea rcx, QWORD PTR [rsp+192] + add rdx, 96 + call sp_3072_sqr_avx2_12 + mov rdx, QWORD PTR [rsp+488] + mov rcx, QWORD PTR [rsp+480] + call sp_3072_sqr_avx2_12 +IFDEF _WIN64 + mov rdx, QWORD PTR [rsp+488] + mov rcx, QWORD PTR [rsp+480] +ENDIF + mov r12, QWORD PTR [rsp+496] + mov r11, rcx + lea r10, QWORD PTR [rsp+384] + mov r9, r12 + neg r12 + add r11, 192 + mov rax, QWORD PTR [r10] + pext rax, rax, r12 + add rax, rax + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r11], rax + pext r8, 
r8, r12 + adc r8, r8 + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r11+8], r8 + pext rax, rax, r12 + adc rax, rax + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r11+16], rax + pext r8, r8, r12 + adc r8, r8 + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r11+24], r8 + pext rax, rax, r12 + adc rax, rax + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r11+32], rax + pext r8, r8, r12 + adc r8, r8 + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r11+40], r8 + pext rax, rax, r12 + adc rax, rax + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r11+48], rax + pext r8, r8, r12 + adc r8, r8 + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r11+56], r8 + pext rax, rax, r12 + adc rax, rax + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r11+64], rax + pext r8, r8, r12 + adc r8, r8 + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r11+72], r8 + pext rax, rax, r12 + adc rax, rax + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r11+80], rax + pext r8, r8, r12 + adc r8, r8 + mov QWORD PTR [r11+88], r8 + adc r9, 0 + lea rdx, QWORD PTR [rsp+192] + mov r10, rsp + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rcx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rcx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rcx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR 
[r10+24], r8 + sbb rax, QWORD PTR [rcx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rcx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rcx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rcx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rcx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rcx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rcx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rcx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rcx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rcx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rcx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rcx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rcx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rcx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rcx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rcx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rcx+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + sub r11, 96 + ; Add in place + mov rax, QWORD PTR [r11] + add rax, QWORD PTR [r10] + mov r8, QWORD PTR [r11+8] + mov QWORD PTR [r11], rax + adc r8, QWORD PTR [r10+8] + mov rax, QWORD PTR [r11+16] + mov QWORD PTR [r11+8], r8 + adc rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r11+24] + mov QWORD PTR [r11+16], rax + adc r8, QWORD PTR [r10+24] + mov rax, QWORD PTR [r11+32] + mov QWORD PTR [r11+24], r8 + adc rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r11+40] + mov QWORD PTR [r11+32], rax + adc r8, QWORD PTR [r10+40] + mov rax, QWORD PTR [r11+48] + mov QWORD PTR [r11+40], r8 + adc rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r11+56] + mov QWORD PTR [r11+48], rax + adc r8, QWORD PTR [r10+56] + mov rax, QWORD PTR [r11+64] + mov QWORD PTR [r11+56], r8 + adc rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r11+72] + mov QWORD PTR [r11+64], rax + adc r8, QWORD PTR [r10+72] + mov rax, QWORD PTR [r11+80] + mov QWORD PTR [r11+72], r8 + adc rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r11+88] + mov QWORD PTR [r11+80], rax + adc r8, QWORD PTR [r10+88] + mov rax, QWORD PTR [r11+96] + mov QWORD PTR [r11+88], r8 + adc rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, QWORD PTR [r10+104] + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, QWORD PTR [r10+120] + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, QWORD PTR [r10+136] + mov rax, QWORD PTR [r11+144] + mov QWORD PTR [r11+136], r8 + adc rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc 
r8, QWORD PTR [r10+152] + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, QWORD PTR [r10+168] + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, QWORD PTR [r10+184] + mov QWORD PTR [r11+184], r8 + adc r9, 0 + mov QWORD PTR [rcx+288], r9 + ; Add in place + mov rax, QWORD PTR [r11+96] + add rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r11+144] + mov QWORD PTR [r11+136], r8 + adc rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r11+192] + mov QWORD PTR [r11+184], r8 + adc rax, QWORD PTR [rdx+96] + mov QWORD PTR [r11+192], rax + ; Add to zero + mov rax, QWORD PTR [rdx+104] + adc rax, 0 + mov r8, QWORD PTR [rdx+112] + mov QWORD PTR [r11+200], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+120] + mov QWORD PTR [r11+208], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+128] + mov QWORD PTR [r11+216], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+136] + mov QWORD PTR [r11+224], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+144] + mov QWORD PTR [r11+232], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+152] + mov QWORD PTR [r11+240], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+160] + mov QWORD PTR [r11+248], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+168] + mov QWORD PTR [r11+256], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+176] + mov QWORD PTR [r11+264], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+184] + mov QWORD PTR [r11+272], r8 + adc rax, 0 + mov QWORD PTR [r11+280], rax + add rsp, 504 + pop r12 + ret +sp_3072_sqr_avx2_24 ENDP +_text ENDS +ENDIF +; /* Add a to a into r. (r = a + a) +; * +; * r A single precision integer. +; * a A single precision integer. 
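sp_3072_sqr_24 and sp_3072_sqr_avx2_24 above apply one level of Karatsuba: the 24-word input is split into 12-word halves a0 and a1, the half-size squaring is called on a0 + a1, on a1 and on a0, and the cross term 2*a0*a1 is recovered as (a0+a1)^2 - a1^2 - a0^2, so it comes from squarings alone rather than a general half-size multiply. The single carry bit of a0 + a1 is parked on the stack and folded back in with the branch-free and/pext mask-and-double block seen above. Scaled down to 32-bit halves so the identity can be checked directly; u128 and the function name are illustrative:

#include <stdint.h>

typedef unsigned __int128 u128;

/* One Karatsuba squaring level with B = 2^32:
 *   a = a1*B + a0
 *   a^2 = a1^2*B^2 + ((a0 + a1)^2 - a1^2 - a0^2)*B + a0^2
 * The shifted additions are the "Add in place" / "Add to zero" carry
 * chains in the assembly. */
static u128 sqr_karatsuba(uint32_t a0, uint32_t a1)
{
    u128 z0 = (u128)a0 * a0;
    u128 z2 = (u128)a1 * a1;
    u128 z1 = (u128)((uint64_t)a0 + a1) * ((uint64_t)a0 + a1) - z2 - z0;

    /* equal to the square of ((uint64_t)a1 << 32) | a0 */
    return (z2 << 64) + (z1 << 32) + z0;
}

sp_3072_sqr_48 further below repeats the same split on top of sp_3072_sqr_24, giving two Karatsuba levels over the 12-word base case.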
+; */ +_text SEGMENT READONLY PARA +sp_3072_dbl_24 PROC + mov r8, QWORD PTR [rdx] + xor rax, rax + add r8, r8 + mov r9, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+24] + mov QWORD PTR [rcx+16], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+32] + mov QWORD PTR [rcx+24], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+72] + mov QWORD PTR [rcx+64], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+80] + mov QWORD PTR [rcx+72], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+120] + mov QWORD PTR [rcx+112], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+128] + mov QWORD PTR [rcx+120], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+168] + mov QWORD PTR [rcx+160], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+176] + mov QWORD PTR [rcx+168], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r8 + adc r9, r9 + mov QWORD PTR [rcx+184], r9 + adc rax, 0 + ret +sp_3072_dbl_24 ENDP +_text ENDS +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. 
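sp_3072_dbl_12 and sp_3072_dbl_24 are plain r = a + a chains: the unrolled adc sequence keeps the carry in the flags for the whole run and hands the final carry back in rax. A portable equivalent, with sp_digit and the function name again illustrative:

#include <stdint.h>

typedef uint64_t sp_digit;

/* r = a + a over n 64-bit words; returns the carry out of the top
 * word (the assembly returns it in rax).  Doubling a word is a shift
 * left by one with the previous word's top bit as carry-in. */
static sp_digit sp_dbl_sketch(sp_digit* r, const sp_digit* a, int n)
{
    sp_digit carry = 0;
    int i;

    for (i = 0; i < n; i++) {
        sp_digit w = a[i];             /* read first so r may alias a */
        r[i] = (w << 1) | carry;
        carry = w >> 63;
    }
    return carry;
}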
+; */ +_text SEGMENT READONLY PARA +sp_3072_sqr_48 PROC + push r12 + sub rsp, 984 + mov QWORD PTR [rsp+960], rcx + mov QWORD PTR [rsp+968], rdx + lea r10, QWORD PTR [rsp+768] + lea r11, QWORD PTR [rdx+192] + ; Add + mov rax, QWORD PTR [rdx] + xor r9, r9 + add rax, QWORD PTR [r11] + mov r8, QWORD PTR [rdx+8] + mov QWORD PTR [r10], rax + adc r8, QWORD PTR [r11+8] + mov rax, QWORD PTR [rdx+16] + mov QWORD PTR [r10+8], r8 + adc rax, QWORD PTR [r11+16] + mov r8, QWORD PTR [rdx+24] + mov QWORD PTR [r10+16], rax + adc r8, QWORD PTR [r11+24] + mov rax, QWORD PTR [rdx+32] + mov QWORD PTR [r10+24], r8 + adc rax, QWORD PTR [r11+32] + mov r8, QWORD PTR [rdx+40] + mov QWORD PTR [r10+32], rax + adc r8, QWORD PTR [r11+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r10+40], r8 + adc rax, QWORD PTR [r11+48] + mov r8, QWORD PTR [rdx+56] + mov QWORD PTR [r10+48], rax + adc r8, QWORD PTR [r11+56] + mov rax, QWORD PTR [rdx+64] + mov QWORD PTR [r10+56], r8 + adc rax, QWORD PTR [r11+64] + mov r8, QWORD PTR [rdx+72] + mov QWORD PTR [r10+64], rax + adc r8, QWORD PTR [r11+72] + mov rax, QWORD PTR [rdx+80] + mov QWORD PTR [r10+72], r8 + adc rax, QWORD PTR [r11+80] + mov r8, QWORD PTR [rdx+88] + mov QWORD PTR [r10+80], rax + adc r8, QWORD PTR [r11+88] + mov rax, QWORD PTR [rdx+96] + mov QWORD PTR [r10+88], r8 + adc rax, QWORD PTR [r11+96] + mov r8, QWORD PTR [rdx+104] + mov QWORD PTR [r10+96], rax + adc r8, QWORD PTR [r11+104] + mov rax, QWORD PTR [rdx+112] + mov QWORD PTR [r10+104], r8 + adc rax, QWORD PTR [r11+112] + mov r8, QWORD PTR [rdx+120] + mov QWORD PTR [r10+112], rax + adc r8, QWORD PTR [r11+120] + mov rax, QWORD PTR [rdx+128] + mov QWORD PTR [r10+120], r8 + adc rax, QWORD PTR [r11+128] + mov r8, QWORD PTR [rdx+136] + mov QWORD PTR [r10+128], rax + adc r8, QWORD PTR [r11+136] + mov rax, QWORD PTR [rdx+144] + mov QWORD PTR [r10+136], r8 + adc rax, QWORD PTR [r11+144] + mov r8, QWORD PTR [rdx+152] + mov QWORD PTR [r10+144], rax + adc r8, QWORD PTR [r11+152] + mov rax, QWORD PTR [rdx+160] + mov QWORD PTR [r10+152], r8 + adc rax, QWORD PTR [r11+160] + mov r8, QWORD PTR [rdx+168] + mov QWORD PTR [r10+160], rax + adc r8, QWORD PTR [r11+168] + mov rax, QWORD PTR [rdx+176] + mov QWORD PTR [r10+168], r8 + adc rax, QWORD PTR [r11+176] + mov r8, QWORD PTR [rdx+184] + mov QWORD PTR [r10+176], rax + adc r8, QWORD PTR [r11+184] + mov QWORD PTR [r10+184], r8 + adc r9, 0 + mov QWORD PTR [rsp+976], r9 + mov rdx, r10 + mov rcx, rsp + call sp_3072_sqr_24 + mov rdx, QWORD PTR [rsp+968] + lea rcx, QWORD PTR [rsp+384] + add rdx, 192 + call sp_3072_sqr_24 + mov rdx, QWORD PTR [rsp+968] + mov rcx, QWORD PTR [rsp+960] + call sp_3072_sqr_24 +IFDEF _WIN64 + mov rdx, QWORD PTR [rsp+968] + mov rcx, QWORD PTR [rsp+960] +ENDIF + mov r12, QWORD PTR [rsp+976] + mov r11, rcx + lea r10, QWORD PTR [rsp+768] + mov r9, r12 + neg r12 + add r11, 384 + mov rax, QWORD PTR [r10] + mov r8, QWORD PTR [r10+8] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11], rax + mov QWORD PTR [r11+8], r8 + mov rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r10+24] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+16], rax + mov QWORD PTR [r11+24], r8 + mov rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r10+40] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+32], rax + mov QWORD PTR [r11+40], r8 + mov rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r10+56] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+48], rax + mov QWORD PTR [r11+56], r8 + mov rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r10+72] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+64], rax + mov 
QWORD PTR [r11+72], r8 + mov rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r10+88] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+80], rax + mov QWORD PTR [r11+88], r8 + mov rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [r10+104] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+96], rax + mov QWORD PTR [r11+104], r8 + mov rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [r10+120] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+112], rax + mov QWORD PTR [r11+120], r8 + mov rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [r10+136] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+128], rax + mov QWORD PTR [r11+136], r8 + mov rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [r10+152] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+144], rax + mov QWORD PTR [r11+152], r8 + mov rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [r10+168] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+160], rax + mov QWORD PTR [r11+168], r8 + mov rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [r10+184] + and rax, r12 + and r8, r12 + mov QWORD PTR [r11+176], rax + mov QWORD PTR [r11+184], r8 + mov rax, QWORD PTR [r11] + add rax, rax + mov r8, QWORD PTR [r11+8] + mov QWORD PTR [r11], rax + adc r8, r8 + mov rax, QWORD PTR [r11+16] + mov QWORD PTR [r11+8], r8 + adc rax, rax + mov r8, QWORD PTR [r11+24] + mov QWORD PTR [r11+16], rax + adc r8, r8 + mov rax, QWORD PTR [r11+32] + mov QWORD PTR [r11+24], r8 + adc rax, rax + mov r8, QWORD PTR [r11+40] + mov QWORD PTR [r11+32], rax + adc r8, r8 + mov rax, QWORD PTR [r11+48] + mov QWORD PTR [r11+40], r8 + adc rax, rax + mov r8, QWORD PTR [r11+56] + mov QWORD PTR [r11+48], rax + adc r8, r8 + mov rax, QWORD PTR [r11+64] + mov QWORD PTR [r11+56], r8 + adc rax, rax + mov r8, QWORD PTR [r11+72] + mov QWORD PTR [r11+64], rax + adc r8, r8 + mov rax, QWORD PTR [r11+80] + mov QWORD PTR [r11+72], r8 + adc rax, rax + mov r8, QWORD PTR [r11+88] + mov QWORD PTR [r11+80], rax + adc r8, r8 + mov rax, QWORD PTR [r11+96] + mov QWORD PTR [r11+88], r8 + adc rax, rax + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, r8 + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, rax + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, r8 + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, rax + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, r8 + mov rax, QWORD PTR [r11+144] + mov QWORD PTR [r11+136], r8 + adc rax, rax + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc r8, r8 + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, rax + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, r8 + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, rax + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, r8 + mov QWORD PTR [r11+184], r8 + adc r9, 0 + lea rdx, QWORD PTR [rsp+384] + mov r10, rsp + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD 
PTR [rdx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rdx+184] + mov rax, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], r8 + sbb rax, QWORD PTR [rdx+192] + mov r8, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], rax + sbb r8, QWORD PTR [rdx+200] + mov rax, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], r8 + sbb rax, QWORD PTR [rdx+208] + mov r8, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], rax + sbb r8, QWORD PTR [rdx+216] + mov rax, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], r8 + sbb rax, QWORD PTR [rdx+224] + mov r8, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], rax + sbb r8, QWORD PTR [rdx+232] + mov rax, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], r8 + sbb rax, QWORD PTR [rdx+240] + mov r8, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], rax + sbb r8, QWORD PTR [rdx+248] + mov rax, QWORD PTR [r10+256] + mov QWORD PTR [r10+248], r8 + sbb rax, QWORD PTR [rdx+256] + mov r8, QWORD PTR [r10+264] + mov QWORD PTR [r10+256], rax + sbb r8, QWORD PTR [rdx+264] + mov rax, QWORD PTR [r10+272] + mov QWORD PTR [r10+264], r8 + sbb rax, QWORD PTR [rdx+272] + mov r8, QWORD PTR [r10+280] + mov QWORD PTR [r10+272], rax + sbb r8, QWORD PTR [rdx+280] + mov rax, QWORD PTR [r10+288] + mov QWORD PTR [r10+280], r8 + sbb rax, QWORD PTR [rdx+288] + mov r8, QWORD PTR [r10+296] + mov QWORD PTR [r10+288], rax + sbb r8, QWORD PTR [rdx+296] + mov rax, QWORD PTR [r10+304] + mov QWORD PTR [r10+296], r8 + sbb rax, QWORD PTR [rdx+304] + mov r8, QWORD PTR [r10+312] + mov QWORD PTR [r10+304], rax + sbb r8, QWORD PTR [rdx+312] + mov rax, QWORD PTR [r10+320] + mov QWORD PTR [r10+312], r8 + sbb rax, QWORD PTR [rdx+320] + mov r8, QWORD PTR [r10+328] + mov QWORD PTR [r10+320], rax + sbb r8, QWORD PTR [rdx+328] + mov rax, QWORD PTR [r10+336] + mov QWORD PTR [r10+328], r8 + sbb rax, QWORD PTR [rdx+336] + mov r8, QWORD PTR [r10+344] + mov QWORD PTR [r10+336], rax + sbb r8, QWORD PTR [rdx+344] + mov rax, QWORD PTR [r10+352] + mov QWORD PTR [r10+344], r8 + sbb rax, QWORD PTR [rdx+352] + mov r8, QWORD PTR [r10+360] + mov QWORD PTR [r10+352], rax + sbb r8, QWORD PTR [rdx+360] + mov rax, QWORD PTR [r10+368] + mov QWORD PTR [r10+360], r8 + sbb rax, QWORD 
PTR [rdx+368] + mov r8, QWORD PTR [r10+376] + mov QWORD PTR [r10+368], rax + sbb r8, QWORD PTR [rdx+376] + mov QWORD PTR [r10+376], r8 + sbb r9, 0 + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rcx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rcx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rcx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rcx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rcx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rcx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rcx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rcx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rcx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rcx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rcx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rcx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rcx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rcx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rcx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rcx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rcx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rcx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rcx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rcx+184] + mov rax, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], r8 + sbb rax, QWORD PTR [rcx+192] + mov r8, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], rax + sbb r8, QWORD PTR [rcx+200] + mov rax, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], r8 + sbb rax, QWORD PTR [rcx+208] + mov r8, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], rax + sbb r8, QWORD PTR [rcx+216] + mov rax, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], r8 + sbb rax, QWORD PTR [rcx+224] + mov r8, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], rax + sbb r8, QWORD PTR [rcx+232] + mov rax, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], r8 + sbb rax, QWORD PTR [rcx+240] + mov r8, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], rax + sbb r8, QWORD PTR [rcx+248] + mov rax, QWORD PTR [r10+256] + mov QWORD PTR [r10+248], r8 + sbb rax, QWORD PTR [rcx+256] + mov r8, QWORD PTR [r10+264] + mov QWORD PTR [r10+256], rax + sbb r8, QWORD PTR [rcx+264] + mov rax, QWORD PTR [r10+272] + mov QWORD PTR [r10+264], r8 + sbb rax, QWORD PTR [rcx+272] + mov r8, QWORD PTR [r10+280] + mov QWORD PTR [r10+272], rax + sbb r8, QWORD PTR [rcx+280] + mov rax, QWORD PTR [r10+288] + mov QWORD PTR [r10+280], r8 + sbb rax, QWORD PTR [rcx+288] + mov r8, QWORD PTR [r10+296] + mov QWORD PTR [r10+288], rax + sbb r8, QWORD PTR [rcx+296] + mov rax, 
QWORD PTR [r10+304] + mov QWORD PTR [r10+296], r8 + sbb rax, QWORD PTR [rcx+304] + mov r8, QWORD PTR [r10+312] + mov QWORD PTR [r10+304], rax + sbb r8, QWORD PTR [rcx+312] + mov rax, QWORD PTR [r10+320] + mov QWORD PTR [r10+312], r8 + sbb rax, QWORD PTR [rcx+320] + mov r8, QWORD PTR [r10+328] + mov QWORD PTR [r10+320], rax + sbb r8, QWORD PTR [rcx+328] + mov rax, QWORD PTR [r10+336] + mov QWORD PTR [r10+328], r8 + sbb rax, QWORD PTR [rcx+336] + mov r8, QWORD PTR [r10+344] + mov QWORD PTR [r10+336], rax + sbb r8, QWORD PTR [rcx+344] + mov rax, QWORD PTR [r10+352] + mov QWORD PTR [r10+344], r8 + sbb rax, QWORD PTR [rcx+352] + mov r8, QWORD PTR [r10+360] + mov QWORD PTR [r10+352], rax + sbb r8, QWORD PTR [rcx+360] + mov rax, QWORD PTR [r10+368] + mov QWORD PTR [r10+360], r8 + sbb rax, QWORD PTR [rcx+368] + mov r8, QWORD PTR [r10+376] + mov QWORD PTR [r10+368], rax + sbb r8, QWORD PTR [rcx+376] + mov QWORD PTR [r10+376], r8 + sbb r9, 0 + sub r11, 192 + ; Add in place + mov rax, QWORD PTR [r11] + add rax, QWORD PTR [r10] + mov r8, QWORD PTR [r11+8] + mov QWORD PTR [r11], rax + adc r8, QWORD PTR [r10+8] + mov rax, QWORD PTR [r11+16] + mov QWORD PTR [r11+8], r8 + adc rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r11+24] + mov QWORD PTR [r11+16], rax + adc r8, QWORD PTR [r10+24] + mov rax, QWORD PTR [r11+32] + mov QWORD PTR [r11+24], r8 + adc rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r11+40] + mov QWORD PTR [r11+32], rax + adc r8, QWORD PTR [r10+40] + mov rax, QWORD PTR [r11+48] + mov QWORD PTR [r11+40], r8 + adc rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r11+56] + mov QWORD PTR [r11+48], rax + adc r8, QWORD PTR [r10+56] + mov rax, QWORD PTR [r11+64] + mov QWORD PTR [r11+56], r8 + adc rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r11+72] + mov QWORD PTR [r11+64], rax + adc r8, QWORD PTR [r10+72] + mov rax, QWORD PTR [r11+80] + mov QWORD PTR [r11+72], r8 + adc rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r11+88] + mov QWORD PTR [r11+80], rax + adc r8, QWORD PTR [r10+88] + mov rax, QWORD PTR [r11+96] + mov QWORD PTR [r11+88], r8 + adc rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [r11+104] + mov QWORD PTR [r11+96], rax + adc r8, QWORD PTR [r10+104] + mov rax, QWORD PTR [r11+112] + mov QWORD PTR [r11+104], r8 + adc rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [r11+120] + mov QWORD PTR [r11+112], rax + adc r8, QWORD PTR [r10+120] + mov rax, QWORD PTR [r11+128] + mov QWORD PTR [r11+120], r8 + adc rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [r11+136] + mov QWORD PTR [r11+128], rax + adc r8, QWORD PTR [r10+136] + mov rax, QWORD PTR [r11+144] + mov QWORD PTR [r11+136], r8 + adc rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [r11+152] + mov QWORD PTR [r11+144], rax + adc r8, QWORD PTR [r10+152] + mov rax, QWORD PTR [r11+160] + mov QWORD PTR [r11+152], r8 + adc rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [r11+168] + mov QWORD PTR [r11+160], rax + adc r8, QWORD PTR [r10+168] + mov rax, QWORD PTR [r11+176] + mov QWORD PTR [r11+168], r8 + adc rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [r11+184] + mov QWORD PTR [r11+176], rax + adc r8, QWORD PTR [r10+184] + mov rax, QWORD PTR [r11+192] + mov QWORD PTR [r11+184], r8 + adc rax, QWORD PTR [r10+192] + mov r8, QWORD PTR [r11+200] + mov QWORD PTR [r11+192], rax + adc r8, QWORD PTR [r10+200] + mov rax, QWORD PTR [r11+208] + mov QWORD PTR [r11+200], r8 + adc rax, QWORD PTR [r10+208] + mov r8, QWORD PTR [r11+216] + mov QWORD PTR [r11+208], rax + adc r8, QWORD PTR [r10+216] + mov rax, QWORD PTR [r11+224] + mov QWORD PTR [r11+216], r8 + adc rax, QWORD PTR [r10+224] + 
mov r8, QWORD PTR [r11+232] + mov QWORD PTR [r11+224], rax + adc r8, QWORD PTR [r10+232] + mov rax, QWORD PTR [r11+240] + mov QWORD PTR [r11+232], r8 + adc rax, QWORD PTR [r10+240] + mov r8, QWORD PTR [r11+248] + mov QWORD PTR [r11+240], rax + adc r8, QWORD PTR [r10+248] + mov rax, QWORD PTR [r11+256] + mov QWORD PTR [r11+248], r8 + adc rax, QWORD PTR [r10+256] + mov r8, QWORD PTR [r11+264] + mov QWORD PTR [r11+256], rax + adc r8, QWORD PTR [r10+264] + mov rax, QWORD PTR [r11+272] + mov QWORD PTR [r11+264], r8 + adc rax, QWORD PTR [r10+272] + mov r8, QWORD PTR [r11+280] + mov QWORD PTR [r11+272], rax + adc r8, QWORD PTR [r10+280] + mov rax, QWORD PTR [r11+288] + mov QWORD PTR [r11+280], r8 + adc rax, QWORD PTR [r10+288] + mov r8, QWORD PTR [r11+296] + mov QWORD PTR [r11+288], rax + adc r8, QWORD PTR [r10+296] + mov rax, QWORD PTR [r11+304] + mov QWORD PTR [r11+296], r8 + adc rax, QWORD PTR [r10+304] + mov r8, QWORD PTR [r11+312] + mov QWORD PTR [r11+304], rax + adc r8, QWORD PTR [r10+312] + mov rax, QWORD PTR [r11+320] + mov QWORD PTR [r11+312], r8 + adc rax, QWORD PTR [r10+320] + mov r8, QWORD PTR [r11+328] + mov QWORD PTR [r11+320], rax + adc r8, QWORD PTR [r10+328] + mov rax, QWORD PTR [r11+336] + mov QWORD PTR [r11+328], r8 + adc rax, QWORD PTR [r10+336] + mov r8, QWORD PTR [r11+344] + mov QWORD PTR [r11+336], rax + adc r8, QWORD PTR [r10+344] + mov rax, QWORD PTR [r11+352] + mov QWORD PTR [r11+344], r8 + adc rax, QWORD PTR [r10+352] + mov r8, QWORD PTR [r11+360] + mov QWORD PTR [r11+352], rax + adc r8, QWORD PTR [r10+360] + mov rax, QWORD PTR [r11+368] + mov QWORD PTR [r11+360], r8 + adc rax, QWORD PTR [r10+368] + mov r8, QWORD PTR [r11+376] + mov QWORD PTR [r11+368], rax + adc r8, QWORD PTR [r10+376] + mov QWORD PTR [r11+376], r8 + adc r9, 0 + mov QWORD PTR [rcx+576], r9 + ; Add in place + mov rax, QWORD PTR [r11+192] + add rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r11+200] + mov QWORD PTR [r11+192], rax + adc r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r11+208] + mov QWORD PTR [r11+200], r8 + adc rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r11+216] + mov QWORD PTR [r11+208], rax + adc r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r11+224] + mov QWORD PTR [r11+216], r8 + adc rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r11+232] + mov QWORD PTR [r11+224], rax + adc r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r11+240] + mov QWORD PTR [r11+232], r8 + adc rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r11+248] + mov QWORD PTR [r11+240], rax + adc r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r11+256] + mov QWORD PTR [r11+248], r8 + adc rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r11+264] + mov QWORD PTR [r11+256], rax + adc r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r11+272] + mov QWORD PTR [r11+264], r8 + adc rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r11+280] + mov QWORD PTR [r11+272], rax + adc r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r11+288] + mov QWORD PTR [r11+280], r8 + adc rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [r11+296] + mov QWORD PTR [r11+288], rax + adc r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [r11+304] + mov QWORD PTR [r11+296], r8 + adc rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [r11+312] + mov QWORD PTR [r11+304], rax + adc r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [r11+320] + mov QWORD PTR [r11+312], r8 + adc rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [r11+328] + mov QWORD PTR [r11+320], rax + adc r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [r11+336] + mov QWORD PTR [r11+328], r8 + adc rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [r11+344] + mov 
QWORD PTR [r11+336], rax + adc r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [r11+352] + mov QWORD PTR [r11+344], r8 + adc rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [r11+360] + mov QWORD PTR [r11+352], rax + adc r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [r11+368] + mov QWORD PTR [r11+360], r8 + adc rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [r11+376] + mov QWORD PTR [r11+368], rax + adc r8, QWORD PTR [rdx+184] + mov rax, QWORD PTR [r11+384] + mov QWORD PTR [r11+376], r8 + adc rax, QWORD PTR [rdx+192] + mov QWORD PTR [r11+384], rax + ; Add to zero + mov rax, QWORD PTR [rdx+200] + adc rax, 0 + mov r8, QWORD PTR [rdx+208] + mov QWORD PTR [r11+392], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+216] + mov QWORD PTR [r11+400], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+224] + mov QWORD PTR [r11+408], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+232] + mov QWORD PTR [r11+416], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+240] + mov QWORD PTR [r11+424], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+248] + mov QWORD PTR [r11+432], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+256] + mov QWORD PTR [r11+440], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+264] + mov QWORD PTR [r11+448], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+272] + mov QWORD PTR [r11+456], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+280] + mov QWORD PTR [r11+464], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+288] + mov QWORD PTR [r11+472], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+296] + mov QWORD PTR [r11+480], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+304] + mov QWORD PTR [r11+488], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+312] + mov QWORD PTR [r11+496], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+320] + mov QWORD PTR [r11+504], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+328] + mov QWORD PTR [r11+512], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+336] + mov QWORD PTR [r11+520], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+344] + mov QWORD PTR [r11+528], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+352] + mov QWORD PTR [r11+536], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+360] + mov QWORD PTR [r11+544], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+368] + mov QWORD PTR [r11+552], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+376] + mov QWORD PTR [r11+560], r8 + adc rax, 0 + mov QWORD PTR [r11+568], rax + add rsp, 984 + pop r12 + ret +sp_3072_sqr_48 ENDP +_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * @@ -29503,1222 +29503,6 @@ ENDIF ret sp_4096_mul_64 ENDP _text ENDS -; /* Add a to a into r. (r = a + a) -; * -; * r A single precision integer. -; * a A single precision integer. 
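sp_3072_sqr_48 above is a one-level Karatsuba square: the 48-word input is split into 24-word halves a0 and a1, three half-size squarings are done via the calls to sp_3072_sqr_24 (on a0 + a1, on a1 and on a0), and the cross term is recovered from the identity (a1*B + a0)^2 = a1^2*B^2 + ((a0 + a1)^2 - a1^2 - a0^2)*B + a0^2, where B is 2^1536; the masked copy and in-place doubling after the three calls account for the carry out of a0 + a1. sp_4096_sqr_64 later in this file follows the same pattern with 32-word halves (three sp_2048_sqr_32 calls). A toy C illustration of the identity on 64-bit halves (not part of the patch; assumes a gcc/clang-style unsigned __int128 and halves whose sum does not overflow, so the carry fix-up the assembly performs is not needed) is:

    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
        /* Square the 128-bit value a1:a0 with three half-size squarings. */
        uint64_t a0 = 0x0123456789abcdefULL;                  /* low half  */
        uint64_t a1 = 0xfedcba9876543210ULL;                  /* high half */

        unsigned __int128 z0 = (unsigned __int128)a0 * a0;    /* a0^2            */
        unsigned __int128 z2 = (unsigned __int128)a1 * a1;    /* a1^2            */
        unsigned __int128 s  = (unsigned __int128)a0 + a1;    /* a0 + a1, fits   */
        unsigned __int128 z1 = s * s - z0 - z2;               /* 2*a0*a1         */

        /* Accumulate z2*2^128 + z1*2^64 + z0 into four result words. */
        uint64_t r[4];
        r[0] = (uint64_t)z0;
        unsigned __int128 acc = (unsigned __int128)(uint64_t)(z0 >> 64) + (uint64_t)z1;
        r[1] = (uint64_t)acc;
        acc = (acc >> 64) + (uint64_t)(z1 >> 64) + (uint64_t)z2;
        r[2] = (uint64_t)acc;
        acc = (acc >> 64) + (uint64_t)(z2 >> 64);
        r[3] = (uint64_t)acc;

        printf("a^2 = 0x%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "\n",
               r[3], r[2], r[1], r[0]);
        return 0;
    }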
-; */ -_text SEGMENT READONLY PARA -sp_2048_dbl_32 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r8 - adc r9, r9 - mov QWORD PTR [rcx+248], r9 - adc rax, 0 - ret -sp_2048_dbl_32 ENDP -_text ENDS -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. 
-; */ -_text SEGMENT READONLY PARA -sp_4096_sqr_64 PROC - push r12 - sub rsp, 1304 - mov QWORD PTR [rsp+1280], rcx - mov QWORD PTR [rsp+1288], rdx - lea r10, QWORD PTR [rsp+1024] - lea r11, QWORD PTR [rdx+256] - ; Add - mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] - mov r8, QWORD PTR [rdx+8] - mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] - mov rax, QWORD PTR [rdx+16] - mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] - mov r8, QWORD PTR [rdx+24] - mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] - mov rax, QWORD PTR [rdx+32] - mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] - mov r8, QWORD PTR [rdx+40] - mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] - mov r8, QWORD PTR [rdx+56] - mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] - mov rax, QWORD PTR [rdx+64] - mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] - mov r8, QWORD PTR [rdx+72] - mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] - mov rax, QWORD PTR [rdx+80] - mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] - mov r8, QWORD PTR [rdx+88] - mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] - mov rax, QWORD PTR [rdx+96] - mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] - mov r8, QWORD PTR [rdx+104] - mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] - mov rax, QWORD PTR [rdx+112] - mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] - mov r8, QWORD PTR [rdx+120] - mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] - mov rax, QWORD PTR [rdx+128] - mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] - mov r8, QWORD PTR [rdx+136] - mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] - mov rax, QWORD PTR [rdx+144] - mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] - mov r8, QWORD PTR [rdx+152] - mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] - mov rax, QWORD PTR [rdx+160] - mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] - mov r8, QWORD PTR [rdx+168] - mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] - mov rax, QWORD PTR [rdx+176] - mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] - mov r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] - mov rax, QWORD PTR [rdx+192] - mov QWORD PTR [r10+184], r8 - adc rax, QWORD PTR [r11+192] - mov r8, QWORD PTR [rdx+200] - mov QWORD PTR [r10+192], rax - adc r8, QWORD PTR [r11+200] - mov rax, QWORD PTR [rdx+208] - mov QWORD PTR [r10+200], r8 - adc rax, QWORD PTR [r11+208] - mov r8, QWORD PTR [rdx+216] - mov QWORD PTR [r10+208], rax - adc r8, QWORD PTR [r11+216] - mov rax, QWORD PTR [rdx+224] - mov QWORD PTR [r10+216], r8 - adc rax, QWORD PTR [r11+224] - mov r8, QWORD PTR [rdx+232] - mov QWORD PTR [r10+224], rax - adc r8, QWORD PTR [r11+232] - mov rax, QWORD PTR [rdx+240] - mov QWORD PTR [r10+232], r8 - adc rax, QWORD PTR [r11+240] - mov r8, QWORD PTR [rdx+248] - mov QWORD PTR [r10+240], rax - adc r8, QWORD PTR [r11+248] - mov QWORD PTR [r10+248], r8 - adc r9, 0 - mov QWORD PTR [rsp+1296], r9 - mov rdx, r10 - mov rcx, rsp - call sp_2048_sqr_32 - mov rdx, QWORD PTR [rsp+1288] - lea rcx, QWORD PTR [rsp+512] - add rdx, 256 - call sp_2048_sqr_32 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] - call sp_2048_sqr_32 -IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] -ENDIF - mov r12, QWORD PTR [rsp+1296] - lea r10, QWORD PTR [rsp+1024] - mov r9, r12 - 
neg r12 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+512], rax - mov QWORD PTR [rcx+520], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+528], rax - mov QWORD PTR [rcx+536], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+544], rax - mov QWORD PTR [rcx+552], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+560], rax - mov QWORD PTR [rcx+568], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+576], rax - mov QWORD PTR [rcx+584], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+592], rax - mov QWORD PTR [rcx+600], r8 - mov rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r10+104] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+608], rax - mov QWORD PTR [rcx+616], r8 - mov rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r10+120] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+624], rax - mov QWORD PTR [rcx+632], r8 - mov rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r10+136] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+640], rax - mov QWORD PTR [rcx+648], r8 - mov rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r10+152] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+656], rax - mov QWORD PTR [rcx+664], r8 - mov rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r10+168] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+672], rax - mov QWORD PTR [rcx+680], r8 - mov rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r10+184] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+688], rax - mov QWORD PTR [rcx+696], r8 - mov rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [r10+200] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+704], rax - mov QWORD PTR [rcx+712], r8 - mov rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [r10+216] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+720], rax - mov QWORD PTR [rcx+728], r8 - mov rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [r10+232] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+736], rax - mov QWORD PTR [rcx+744], r8 - mov rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [r10+248] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+752], rax - mov QWORD PTR [rcx+760], r8 - mov rax, QWORD PTR [rcx+512] - add rax, rax - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR 
[rcx+624], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, r8 - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+512] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - 
mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rdx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rdx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rdx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rdx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rdx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rdx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rdx+432] - mov r8, QWORD PTR [r10+440] - mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rdx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rdx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rdx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rdx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rdx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rdx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rdx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR [r10+488], r8 - sbb rax, QWORD PTR [rdx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rdx+504] - mov QWORD PTR 
[r10+504], r8 - sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov 
QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rcx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rcx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rcx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rcx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rcx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rcx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rcx+432] - mov r8, QWORD PTR [r10+440] - mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rcx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rcx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rcx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rcx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rcx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rcx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rcx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR [r10+488], r8 - sbb rax, QWORD PTR [rcx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rcx+504] - mov QWORD PTR [r10+504], r8 - sbb r9, 0 - ; Add in place - mov rax, QWORD PTR [rcx+256] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [r10+104] - mov rax, 
QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [rcx+384] - mov QWORD PTR [rcx+376], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [rcx+392] - mov QWORD PTR [rcx+384], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [rcx+400] - mov QWORD PTR [rcx+392], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [rcx+408] - mov QWORD PTR [rcx+400], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [rcx+416] - mov QWORD PTR [rcx+408], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [rcx+424] - mov QWORD PTR [rcx+416], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [rcx+432] - mov QWORD PTR [rcx+424], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [rcx+440] - mov QWORD PTR [rcx+432], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [rcx+448] - mov QWORD PTR [rcx+440], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [rcx+456] - mov QWORD PTR [rcx+448], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [rcx+464] - mov QWORD PTR [rcx+456], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [rcx+472] - mov QWORD PTR [rcx+464], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [rcx+480] - mov QWORD PTR [rcx+472], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [rcx+488] - mov QWORD PTR [rcx+480], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [rcx+496] - mov QWORD PTR [rcx+488], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [rcx+504] - mov QWORD PTR [rcx+496], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [rcx+512] - mov QWORD PTR [rcx+504], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [r10+376] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [r10+384] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [r10+392] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, QWORD PTR [r10+400] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [r10+408] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR 
[r10+416] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [r10+424] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [r10+432] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [r10+440] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD PTR [r10+448] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [r10+456] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [r10+464] - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [r10+472] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [r10+480] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [r10+488] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [r10+496] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [r10+504] - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - mov QWORD PTR [rcx+768], r9 - ; Add in place - mov rax, QWORD PTR [rcx+512] - xor r9, r9 - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [rdx+208] - mov r8, 
QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [rcx+768] - mov QWORD PTR [rcx+760], r8 - adc rax, QWORD PTR [rdx+256] - mov QWORD PTR [rcx+768], rax - adc r9, 0 - ; Add to zero - mov rax, QWORD PTR [rdx+264] - adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [rcx+776], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR [rcx+784], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [rcx+792], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [rcx+800], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [rcx+808], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [rcx+816], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [rcx+824], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [rcx+832], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [rcx+840], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [rcx+848], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [rcx+856], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [rcx+864], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [rcx+872], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [rcx+880], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+384] - mov QWORD PTR [rcx+888], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+392] - mov QWORD PTR [rcx+896], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+400] - mov QWORD PTR [rcx+904], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+408] - mov QWORD PTR [rcx+912], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+416] - mov QWORD PTR [rcx+920], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+424] - mov QWORD PTR [rcx+928], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+432] - mov QWORD PTR [rcx+936], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+440] - mov QWORD PTR [rcx+944], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+448] - mov QWORD PTR [rcx+952], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+456] - mov QWORD PTR [rcx+960], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+464] - mov QWORD PTR [rcx+968], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+472] - mov QWORD PTR [rcx+976], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+480] - mov QWORD PTR [rcx+984], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+488] - mov QWORD PTR [rcx+992], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+496] - mov QWORD PTR [rcx+1000], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+504] - mov QWORD PTR [rcx+1008], r8 - adc rax, 0 - mov QWORD PTR [rcx+1016], rax - add rsp, 1304 - pop r12 - ret -sp_4096_sqr_64 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Multiply a and b into r. (r = a * b) ; * @@ -31956,6 +30740,1222 @@ ENDIF sp_4096_mul_avx2_64 ENDP _text ENDS ENDIF +; /* Add a to a into r. (r = a + a) +; * +; * r A single precision integer. +; * a A single precision integer. 
+; */ +_text SEGMENT READONLY PARA +sp_2048_dbl_32 PROC + mov r8, QWORD PTR [rdx] + xor rax, rax + add r8, r8 + mov r9, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+24] + mov QWORD PTR [rcx+16], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+32] + mov QWORD PTR [rcx+24], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+72] + mov QWORD PTR [rcx+64], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+80] + mov QWORD PTR [rcx+72], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+120] + mov QWORD PTR [rcx+112], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+128] + mov QWORD PTR [rcx+120], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+168] + mov QWORD PTR [rcx+160], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+176] + mov QWORD PTR [rcx+168], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+192] + mov QWORD PTR [rcx+184], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+200] + mov QWORD PTR [rcx+192], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+208] + mov QWORD PTR [rcx+200], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+216] + mov QWORD PTR [rcx+208], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+224] + mov QWORD PTR [rcx+216], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+232] + mov QWORD PTR [rcx+224], r8 + adc r9, r9 + mov r8, QWORD PTR [rdx+240] + mov QWORD PTR [rcx+232], r9 + adc r8, r8 + mov r9, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+240], r8 + adc r9, r9 + mov QWORD PTR [rcx+248], r9 + adc rax, 0 + ret +sp_2048_dbl_32 ENDP +_text ENDS +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. 
+; */ +_text SEGMENT READONLY PARA +sp_4096_sqr_64 PROC + push r12 + sub rsp, 1304 + mov QWORD PTR [rsp+1280], rcx + mov QWORD PTR [rsp+1288], rdx + lea r10, QWORD PTR [rsp+1024] + lea r11, QWORD PTR [rdx+256] + ; Add + mov rax, QWORD PTR [rdx] + xor r9, r9 + add rax, QWORD PTR [r11] + mov r8, QWORD PTR [rdx+8] + mov QWORD PTR [r10], rax + adc r8, QWORD PTR [r11+8] + mov rax, QWORD PTR [rdx+16] + mov QWORD PTR [r10+8], r8 + adc rax, QWORD PTR [r11+16] + mov r8, QWORD PTR [rdx+24] + mov QWORD PTR [r10+16], rax + adc r8, QWORD PTR [r11+24] + mov rax, QWORD PTR [rdx+32] + mov QWORD PTR [r10+24], r8 + adc rax, QWORD PTR [r11+32] + mov r8, QWORD PTR [rdx+40] + mov QWORD PTR [r10+32], rax + adc r8, QWORD PTR [r11+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r10+40], r8 + adc rax, QWORD PTR [r11+48] + mov r8, QWORD PTR [rdx+56] + mov QWORD PTR [r10+48], rax + adc r8, QWORD PTR [r11+56] + mov rax, QWORD PTR [rdx+64] + mov QWORD PTR [r10+56], r8 + adc rax, QWORD PTR [r11+64] + mov r8, QWORD PTR [rdx+72] + mov QWORD PTR [r10+64], rax + adc r8, QWORD PTR [r11+72] + mov rax, QWORD PTR [rdx+80] + mov QWORD PTR [r10+72], r8 + adc rax, QWORD PTR [r11+80] + mov r8, QWORD PTR [rdx+88] + mov QWORD PTR [r10+80], rax + adc r8, QWORD PTR [r11+88] + mov rax, QWORD PTR [rdx+96] + mov QWORD PTR [r10+88], r8 + adc rax, QWORD PTR [r11+96] + mov r8, QWORD PTR [rdx+104] + mov QWORD PTR [r10+96], rax + adc r8, QWORD PTR [r11+104] + mov rax, QWORD PTR [rdx+112] + mov QWORD PTR [r10+104], r8 + adc rax, QWORD PTR [r11+112] + mov r8, QWORD PTR [rdx+120] + mov QWORD PTR [r10+112], rax + adc r8, QWORD PTR [r11+120] + mov rax, QWORD PTR [rdx+128] + mov QWORD PTR [r10+120], r8 + adc rax, QWORD PTR [r11+128] + mov r8, QWORD PTR [rdx+136] + mov QWORD PTR [r10+128], rax + adc r8, QWORD PTR [r11+136] + mov rax, QWORD PTR [rdx+144] + mov QWORD PTR [r10+136], r8 + adc rax, QWORD PTR [r11+144] + mov r8, QWORD PTR [rdx+152] + mov QWORD PTR [r10+144], rax + adc r8, QWORD PTR [r11+152] + mov rax, QWORD PTR [rdx+160] + mov QWORD PTR [r10+152], r8 + adc rax, QWORD PTR [r11+160] + mov r8, QWORD PTR [rdx+168] + mov QWORD PTR [r10+160], rax + adc r8, QWORD PTR [r11+168] + mov rax, QWORD PTR [rdx+176] + mov QWORD PTR [r10+168], r8 + adc rax, QWORD PTR [r11+176] + mov r8, QWORD PTR [rdx+184] + mov QWORD PTR [r10+176], rax + adc r8, QWORD PTR [r11+184] + mov rax, QWORD PTR [rdx+192] + mov QWORD PTR [r10+184], r8 + adc rax, QWORD PTR [r11+192] + mov r8, QWORD PTR [rdx+200] + mov QWORD PTR [r10+192], rax + adc r8, QWORD PTR [r11+200] + mov rax, QWORD PTR [rdx+208] + mov QWORD PTR [r10+200], r8 + adc rax, QWORD PTR [r11+208] + mov r8, QWORD PTR [rdx+216] + mov QWORD PTR [r10+208], rax + adc r8, QWORD PTR [r11+216] + mov rax, QWORD PTR [rdx+224] + mov QWORD PTR [r10+216], r8 + adc rax, QWORD PTR [r11+224] + mov r8, QWORD PTR [rdx+232] + mov QWORD PTR [r10+224], rax + adc r8, QWORD PTR [r11+232] + mov rax, QWORD PTR [rdx+240] + mov QWORD PTR [r10+232], r8 + adc rax, QWORD PTR [r11+240] + mov r8, QWORD PTR [rdx+248] + mov QWORD PTR [r10+240], rax + adc r8, QWORD PTR [r11+248] + mov QWORD PTR [r10+248], r8 + adc r9, 0 + mov QWORD PTR [rsp+1296], r9 + mov rdx, r10 + mov rcx, rsp + call sp_2048_sqr_32 + mov rdx, QWORD PTR [rsp+1288] + lea rcx, QWORD PTR [rsp+512] + add rdx, 256 + call sp_2048_sqr_32 + mov rdx, QWORD PTR [rsp+1288] + mov rcx, QWORD PTR [rsp+1280] + call sp_2048_sqr_32 +IFDEF _WIN64 + mov rdx, QWORD PTR [rsp+1288] + mov rcx, QWORD PTR [rsp+1280] +ENDIF + mov r12, QWORD PTR [rsp+1296] + lea r10, QWORD PTR [rsp+1024] + mov r9, r12 + 
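+ ; r12 (copied to r9) is the carry out of aL + aH; negating it gives an
+ ; all-ones mask so that, only when that addition overflowed, the low
+ ; 2048 bits of the sum are copied into r[64..95] and doubled below,
+ ; folding the implicit 33rd word of the sum back into (aL + aH)^2
+ ; without a branch.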
neg r12 + mov rax, QWORD PTR [r10] + mov r8, QWORD PTR [r10+8] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+512], rax + mov QWORD PTR [rcx+520], r8 + mov rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [r10+24] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+528], rax + mov QWORD PTR [rcx+536], r8 + mov rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [r10+40] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+544], rax + mov QWORD PTR [rcx+552], r8 + mov rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [r10+56] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+560], rax + mov QWORD PTR [rcx+568], r8 + mov rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [r10+72] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+576], rax + mov QWORD PTR [rcx+584], r8 + mov rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [r10+88] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+592], rax + mov QWORD PTR [rcx+600], r8 + mov rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [r10+104] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+608], rax + mov QWORD PTR [rcx+616], r8 + mov rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [r10+120] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+624], rax + mov QWORD PTR [rcx+632], r8 + mov rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [r10+136] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+640], rax + mov QWORD PTR [rcx+648], r8 + mov rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [r10+152] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+656], rax + mov QWORD PTR [rcx+664], r8 + mov rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [r10+168] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+672], rax + mov QWORD PTR [rcx+680], r8 + mov rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [r10+184] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+688], rax + mov QWORD PTR [rcx+696], r8 + mov rax, QWORD PTR [r10+192] + mov r8, QWORD PTR [r10+200] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+704], rax + mov QWORD PTR [rcx+712], r8 + mov rax, QWORD PTR [r10+208] + mov r8, QWORD PTR [r10+216] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+720], rax + mov QWORD PTR [rcx+728], r8 + mov rax, QWORD PTR [r10+224] + mov r8, QWORD PTR [r10+232] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+736], rax + mov QWORD PTR [rcx+744], r8 + mov rax, QWORD PTR [r10+240] + mov r8, QWORD PTR [r10+248] + and rax, r12 + and r8, r12 + mov QWORD PTR [rcx+752], rax + mov QWORD PTR [rcx+760], r8 + mov rax, QWORD PTR [rcx+512] + add rax, rax + mov r8, QWORD PTR [rcx+520] + mov QWORD PTR [rcx+512], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+528] + mov QWORD PTR [rcx+520], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+536] + mov QWORD PTR [rcx+528], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+544] + mov QWORD PTR [rcx+536], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+552] + mov QWORD PTR [rcx+544], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+560] + mov QWORD PTR [rcx+552], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+568] + mov QWORD PTR [rcx+560], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+576] + mov QWORD PTR [rcx+568], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+584] + mov QWORD PTR [rcx+576], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+592] + mov QWORD PTR [rcx+584], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+600] + mov QWORD PTR [rcx+592], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+608] + mov QWORD PTR [rcx+600], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+616] + mov QWORD PTR [rcx+608], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+624] + mov QWORD PTR [rcx+616], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+632] + mov QWORD PTR 
[rcx+624], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+640] + mov QWORD PTR [rcx+632], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+648] + mov QWORD PTR [rcx+640], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+656] + mov QWORD PTR [rcx+648], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+664] + mov QWORD PTR [rcx+656], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+672] + mov QWORD PTR [rcx+664], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+680] + mov QWORD PTR [rcx+672], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+688] + mov QWORD PTR [rcx+680], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+696] + mov QWORD PTR [rcx+688], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+704] + mov QWORD PTR [rcx+696], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+712] + mov QWORD PTR [rcx+704], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+720] + mov QWORD PTR [rcx+712], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+728] + mov QWORD PTR [rcx+720], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+736] + mov QWORD PTR [rcx+728], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+744] + mov QWORD PTR [rcx+736], rax + adc r8, r8 + mov rax, QWORD PTR [rcx+752] + mov QWORD PTR [rcx+744], r8 + adc rax, rax + mov r8, QWORD PTR [rcx+760] + mov QWORD PTR [rcx+752], rax + adc r8, r8 + mov QWORD PTR [rcx+760], r8 + adc r9, 0 + lea rdx, QWORD PTR [rsp+512] + mov r10, rsp + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rdx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rdx+184] + mov rax, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], r8 + sbb rax, QWORD PTR [rdx+192] + mov r8, QWORD PTR [r10+200] + 
mov QWORD PTR [r10+192], rax + sbb r8, QWORD PTR [rdx+200] + mov rax, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], r8 + sbb rax, QWORD PTR [rdx+208] + mov r8, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], rax + sbb r8, QWORD PTR [rdx+216] + mov rax, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], r8 + sbb rax, QWORD PTR [rdx+224] + mov r8, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], rax + sbb r8, QWORD PTR [rdx+232] + mov rax, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], r8 + sbb rax, QWORD PTR [rdx+240] + mov r8, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], rax + sbb r8, QWORD PTR [rdx+248] + mov rax, QWORD PTR [r10+256] + mov QWORD PTR [r10+248], r8 + sbb rax, QWORD PTR [rdx+256] + mov r8, QWORD PTR [r10+264] + mov QWORD PTR [r10+256], rax + sbb r8, QWORD PTR [rdx+264] + mov rax, QWORD PTR [r10+272] + mov QWORD PTR [r10+264], r8 + sbb rax, QWORD PTR [rdx+272] + mov r8, QWORD PTR [r10+280] + mov QWORD PTR [r10+272], rax + sbb r8, QWORD PTR [rdx+280] + mov rax, QWORD PTR [r10+288] + mov QWORD PTR [r10+280], r8 + sbb rax, QWORD PTR [rdx+288] + mov r8, QWORD PTR [r10+296] + mov QWORD PTR [r10+288], rax + sbb r8, QWORD PTR [rdx+296] + mov rax, QWORD PTR [r10+304] + mov QWORD PTR [r10+296], r8 + sbb rax, QWORD PTR [rdx+304] + mov r8, QWORD PTR [r10+312] + mov QWORD PTR [r10+304], rax + sbb r8, QWORD PTR [rdx+312] + mov rax, QWORD PTR [r10+320] + mov QWORD PTR [r10+312], r8 + sbb rax, QWORD PTR [rdx+320] + mov r8, QWORD PTR [r10+328] + mov QWORD PTR [r10+320], rax + sbb r8, QWORD PTR [rdx+328] + mov rax, QWORD PTR [r10+336] + mov QWORD PTR [r10+328], r8 + sbb rax, QWORD PTR [rdx+336] + mov r8, QWORD PTR [r10+344] + mov QWORD PTR [r10+336], rax + sbb r8, QWORD PTR [rdx+344] + mov rax, QWORD PTR [r10+352] + mov QWORD PTR [r10+344], r8 + sbb rax, QWORD PTR [rdx+352] + mov r8, QWORD PTR [r10+360] + mov QWORD PTR [r10+352], rax + sbb r8, QWORD PTR [rdx+360] + mov rax, QWORD PTR [r10+368] + mov QWORD PTR [r10+360], r8 + sbb rax, QWORD PTR [rdx+368] + mov r8, QWORD PTR [r10+376] + mov QWORD PTR [r10+368], rax + sbb r8, QWORD PTR [rdx+376] + mov rax, QWORD PTR [r10+384] + mov QWORD PTR [r10+376], r8 + sbb rax, QWORD PTR [rdx+384] + mov r8, QWORD PTR [r10+392] + mov QWORD PTR [r10+384], rax + sbb r8, QWORD PTR [rdx+392] + mov rax, QWORD PTR [r10+400] + mov QWORD PTR [r10+392], r8 + sbb rax, QWORD PTR [rdx+400] + mov r8, QWORD PTR [r10+408] + mov QWORD PTR [r10+400], rax + sbb r8, QWORD PTR [rdx+408] + mov rax, QWORD PTR [r10+416] + mov QWORD PTR [r10+408], r8 + sbb rax, QWORD PTR [rdx+416] + mov r8, QWORD PTR [r10+424] + mov QWORD PTR [r10+416], rax + sbb r8, QWORD PTR [rdx+424] + mov rax, QWORD PTR [r10+432] + mov QWORD PTR [r10+424], r8 + sbb rax, QWORD PTR [rdx+432] + mov r8, QWORD PTR [r10+440] + mov QWORD PTR [r10+432], rax + sbb r8, QWORD PTR [rdx+440] + mov rax, QWORD PTR [r10+448] + mov QWORD PTR [r10+440], r8 + sbb rax, QWORD PTR [rdx+448] + mov r8, QWORD PTR [r10+456] + mov QWORD PTR [r10+448], rax + sbb r8, QWORD PTR [rdx+456] + mov rax, QWORD PTR [r10+464] + mov QWORD PTR [r10+456], r8 + sbb rax, QWORD PTR [rdx+464] + mov r8, QWORD PTR [r10+472] + mov QWORD PTR [r10+464], rax + sbb r8, QWORD PTR [rdx+472] + mov rax, QWORD PTR [r10+480] + mov QWORD PTR [r10+472], r8 + sbb rax, QWORD PTR [rdx+480] + mov r8, QWORD PTR [r10+488] + mov QWORD PTR [r10+480], rax + sbb r8, QWORD PTR [rdx+488] + mov rax, QWORD PTR [r10+496] + mov QWORD PTR [r10+488], r8 + sbb rax, QWORD PTR [rdx+496] + mov r8, QWORD PTR [r10+504] + mov QWORD PTR [r10+496], rax + sbb r8, QWORD PTR [rdx+504] + mov QWORD PTR 
[r10+504], r8 + sbb r9, 0 + mov rax, QWORD PTR [r10] + sub rax, QWORD PTR [rcx] + mov r8, QWORD PTR [r10+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [rcx+8] + mov rax, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [rcx+24] + mov rax, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [rcx+32] + mov r8, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [rcx+40] + mov rax, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [rcx+48] + mov r8, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [rcx+56] + mov rax, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [rcx+64] + mov r8, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [rcx+72] + mov rax, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [rcx+80] + mov r8, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [rcx+88] + mov rax, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], r8 + sbb rax, QWORD PTR [rcx+96] + mov r8, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], rax + sbb r8, QWORD PTR [rcx+104] + mov rax, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], r8 + sbb rax, QWORD PTR [rcx+112] + mov r8, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], rax + sbb r8, QWORD PTR [rcx+120] + mov rax, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], r8 + sbb rax, QWORD PTR [rcx+128] + mov r8, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], rax + sbb r8, QWORD PTR [rcx+136] + mov rax, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], r8 + sbb rax, QWORD PTR [rcx+144] + mov r8, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], rax + sbb r8, QWORD PTR [rcx+152] + mov rax, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], r8 + sbb rax, QWORD PTR [rcx+160] + mov r8, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], rax + sbb r8, QWORD PTR [rcx+168] + mov rax, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], r8 + sbb rax, QWORD PTR [rcx+176] + mov r8, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], rax + sbb r8, QWORD PTR [rcx+184] + mov rax, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], r8 + sbb rax, QWORD PTR [rcx+192] + mov r8, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], rax + sbb r8, QWORD PTR [rcx+200] + mov rax, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], r8 + sbb rax, QWORD PTR [rcx+208] + mov r8, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], rax + sbb r8, QWORD PTR [rcx+216] + mov rax, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], r8 + sbb rax, QWORD PTR [rcx+224] + mov r8, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], rax + sbb r8, QWORD PTR [rcx+232] + mov rax, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], r8 + sbb rax, QWORD PTR [rcx+240] + mov r8, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], rax + sbb r8, QWORD PTR [rcx+248] + mov rax, QWORD PTR [r10+256] + mov QWORD PTR [r10+248], r8 + sbb rax, QWORD PTR [rcx+256] + mov r8, QWORD PTR [r10+264] + mov QWORD PTR [r10+256], rax + sbb r8, QWORD PTR [rcx+264] + mov rax, QWORD PTR [r10+272] + mov QWORD PTR [r10+264], r8 + sbb rax, QWORD PTR [rcx+272] + mov r8, QWORD PTR [r10+280] + mov QWORD PTR [r10+272], rax + sbb r8, QWORD PTR [rcx+280] + mov rax, QWORD PTR [r10+288] + mov QWORD PTR [r10+280], r8 + sbb rax, QWORD PTR [rcx+288] + mov r8, QWORD PTR [r10+296] + mov QWORD PTR [r10+288], rax + sbb r8, QWORD PTR [rcx+296] + mov rax, QWORD PTR [r10+304] + mov QWORD PTR [r10+296], r8 + sbb rax, QWORD PTR [rcx+304] + mov r8, QWORD PTR [r10+312] + mov 
QWORD PTR [r10+304], rax + sbb r8, QWORD PTR [rcx+312] + mov rax, QWORD PTR [r10+320] + mov QWORD PTR [r10+312], r8 + sbb rax, QWORD PTR [rcx+320] + mov r8, QWORD PTR [r10+328] + mov QWORD PTR [r10+320], rax + sbb r8, QWORD PTR [rcx+328] + mov rax, QWORD PTR [r10+336] + mov QWORD PTR [r10+328], r8 + sbb rax, QWORD PTR [rcx+336] + mov r8, QWORD PTR [r10+344] + mov QWORD PTR [r10+336], rax + sbb r8, QWORD PTR [rcx+344] + mov rax, QWORD PTR [r10+352] + mov QWORD PTR [r10+344], r8 + sbb rax, QWORD PTR [rcx+352] + mov r8, QWORD PTR [r10+360] + mov QWORD PTR [r10+352], rax + sbb r8, QWORD PTR [rcx+360] + mov rax, QWORD PTR [r10+368] + mov QWORD PTR [r10+360], r8 + sbb rax, QWORD PTR [rcx+368] + mov r8, QWORD PTR [r10+376] + mov QWORD PTR [r10+368], rax + sbb r8, QWORD PTR [rcx+376] + mov rax, QWORD PTR [r10+384] + mov QWORD PTR [r10+376], r8 + sbb rax, QWORD PTR [rcx+384] + mov r8, QWORD PTR [r10+392] + mov QWORD PTR [r10+384], rax + sbb r8, QWORD PTR [rcx+392] + mov rax, QWORD PTR [r10+400] + mov QWORD PTR [r10+392], r8 + sbb rax, QWORD PTR [rcx+400] + mov r8, QWORD PTR [r10+408] + mov QWORD PTR [r10+400], rax + sbb r8, QWORD PTR [rcx+408] + mov rax, QWORD PTR [r10+416] + mov QWORD PTR [r10+408], r8 + sbb rax, QWORD PTR [rcx+416] + mov r8, QWORD PTR [r10+424] + mov QWORD PTR [r10+416], rax + sbb r8, QWORD PTR [rcx+424] + mov rax, QWORD PTR [r10+432] + mov QWORD PTR [r10+424], r8 + sbb rax, QWORD PTR [rcx+432] + mov r8, QWORD PTR [r10+440] + mov QWORD PTR [r10+432], rax + sbb r8, QWORD PTR [rcx+440] + mov rax, QWORD PTR [r10+448] + mov QWORD PTR [r10+440], r8 + sbb rax, QWORD PTR [rcx+448] + mov r8, QWORD PTR [r10+456] + mov QWORD PTR [r10+448], rax + sbb r8, QWORD PTR [rcx+456] + mov rax, QWORD PTR [r10+464] + mov QWORD PTR [r10+456], r8 + sbb rax, QWORD PTR [rcx+464] + mov r8, QWORD PTR [r10+472] + mov QWORD PTR [r10+464], rax + sbb r8, QWORD PTR [rcx+472] + mov rax, QWORD PTR [r10+480] + mov QWORD PTR [r10+472], r8 + sbb rax, QWORD PTR [rcx+480] + mov r8, QWORD PTR [r10+488] + mov QWORD PTR [r10+480], rax + sbb r8, QWORD PTR [rcx+488] + mov rax, QWORD PTR [r10+496] + mov QWORD PTR [r10+488], r8 + sbb rax, QWORD PTR [rcx+496] + mov r8, QWORD PTR [r10+504] + mov QWORD PTR [r10+496], rax + sbb r8, QWORD PTR [rcx+504] + mov QWORD PTR [r10+504], r8 + sbb r9, 0 + ; Add in place + mov rax, QWORD PTR [rcx+256] + add rax, QWORD PTR [r10] + mov r8, QWORD PTR [rcx+264] + mov QWORD PTR [rcx+256], rax + adc r8, QWORD PTR [r10+8] + mov rax, QWORD PTR [rcx+272] + mov QWORD PTR [rcx+264], r8 + adc rax, QWORD PTR [r10+16] + mov r8, QWORD PTR [rcx+280] + mov QWORD PTR [rcx+272], rax + adc r8, QWORD PTR [r10+24] + mov rax, QWORD PTR [rcx+288] + mov QWORD PTR [rcx+280], r8 + adc rax, QWORD PTR [r10+32] + mov r8, QWORD PTR [rcx+296] + mov QWORD PTR [rcx+288], rax + adc r8, QWORD PTR [r10+40] + mov rax, QWORD PTR [rcx+304] + mov QWORD PTR [rcx+296], r8 + adc rax, QWORD PTR [r10+48] + mov r8, QWORD PTR [rcx+312] + mov QWORD PTR [rcx+304], rax + adc r8, QWORD PTR [r10+56] + mov rax, QWORD PTR [rcx+320] + mov QWORD PTR [rcx+312], r8 + adc rax, QWORD PTR [r10+64] + mov r8, QWORD PTR [rcx+328] + mov QWORD PTR [rcx+320], rax + adc r8, QWORD PTR [r10+72] + mov rax, QWORD PTR [rcx+336] + mov QWORD PTR [rcx+328], r8 + adc rax, QWORD PTR [r10+80] + mov r8, QWORD PTR [rcx+344] + mov QWORD PTR [rcx+336], rax + adc r8, QWORD PTR [r10+88] + mov rax, QWORD PTR [rcx+352] + mov QWORD PTR [rcx+344], r8 + adc rax, QWORD PTR [r10+96] + mov r8, QWORD PTR [rcx+360] + mov QWORD PTR [rcx+352], rax + adc r8, QWORD PTR [r10+104] + mov rax, 
QWORD PTR [rcx+368] + mov QWORD PTR [rcx+360], r8 + adc rax, QWORD PTR [r10+112] + mov r8, QWORD PTR [rcx+376] + mov QWORD PTR [rcx+368], rax + adc r8, QWORD PTR [r10+120] + mov rax, QWORD PTR [rcx+384] + mov QWORD PTR [rcx+376], r8 + adc rax, QWORD PTR [r10+128] + mov r8, QWORD PTR [rcx+392] + mov QWORD PTR [rcx+384], rax + adc r8, QWORD PTR [r10+136] + mov rax, QWORD PTR [rcx+400] + mov QWORD PTR [rcx+392], r8 + adc rax, QWORD PTR [r10+144] + mov r8, QWORD PTR [rcx+408] + mov QWORD PTR [rcx+400], rax + adc r8, QWORD PTR [r10+152] + mov rax, QWORD PTR [rcx+416] + mov QWORD PTR [rcx+408], r8 + adc rax, QWORD PTR [r10+160] + mov r8, QWORD PTR [rcx+424] + mov QWORD PTR [rcx+416], rax + adc r8, QWORD PTR [r10+168] + mov rax, QWORD PTR [rcx+432] + mov QWORD PTR [rcx+424], r8 + adc rax, QWORD PTR [r10+176] + mov r8, QWORD PTR [rcx+440] + mov QWORD PTR [rcx+432], rax + adc r8, QWORD PTR [r10+184] + mov rax, QWORD PTR [rcx+448] + mov QWORD PTR [rcx+440], r8 + adc rax, QWORD PTR [r10+192] + mov r8, QWORD PTR [rcx+456] + mov QWORD PTR [rcx+448], rax + adc r8, QWORD PTR [r10+200] + mov rax, QWORD PTR [rcx+464] + mov QWORD PTR [rcx+456], r8 + adc rax, QWORD PTR [r10+208] + mov r8, QWORD PTR [rcx+472] + mov QWORD PTR [rcx+464], rax + adc r8, QWORD PTR [r10+216] + mov rax, QWORD PTR [rcx+480] + mov QWORD PTR [rcx+472], r8 + adc rax, QWORD PTR [r10+224] + mov r8, QWORD PTR [rcx+488] + mov QWORD PTR [rcx+480], rax + adc r8, QWORD PTR [r10+232] + mov rax, QWORD PTR [rcx+496] + mov QWORD PTR [rcx+488], r8 + adc rax, QWORD PTR [r10+240] + mov r8, QWORD PTR [rcx+504] + mov QWORD PTR [rcx+496], rax + adc r8, QWORD PTR [r10+248] + mov rax, QWORD PTR [rcx+512] + mov QWORD PTR [rcx+504], r8 + adc rax, QWORD PTR [r10+256] + mov r8, QWORD PTR [rcx+520] + mov QWORD PTR [rcx+512], rax + adc r8, QWORD PTR [r10+264] + mov rax, QWORD PTR [rcx+528] + mov QWORD PTR [rcx+520], r8 + adc rax, QWORD PTR [r10+272] + mov r8, QWORD PTR [rcx+536] + mov QWORD PTR [rcx+528], rax + adc r8, QWORD PTR [r10+280] + mov rax, QWORD PTR [rcx+544] + mov QWORD PTR [rcx+536], r8 + adc rax, QWORD PTR [r10+288] + mov r8, QWORD PTR [rcx+552] + mov QWORD PTR [rcx+544], rax + adc r8, QWORD PTR [r10+296] + mov rax, QWORD PTR [rcx+560] + mov QWORD PTR [rcx+552], r8 + adc rax, QWORD PTR [r10+304] + mov r8, QWORD PTR [rcx+568] + mov QWORD PTR [rcx+560], rax + adc r8, QWORD PTR [r10+312] + mov rax, QWORD PTR [rcx+576] + mov QWORD PTR [rcx+568], r8 + adc rax, QWORD PTR [r10+320] + mov r8, QWORD PTR [rcx+584] + mov QWORD PTR [rcx+576], rax + adc r8, QWORD PTR [r10+328] + mov rax, QWORD PTR [rcx+592] + mov QWORD PTR [rcx+584], r8 + adc rax, QWORD PTR [r10+336] + mov r8, QWORD PTR [rcx+600] + mov QWORD PTR [rcx+592], rax + adc r8, QWORD PTR [r10+344] + mov rax, QWORD PTR [rcx+608] + mov QWORD PTR [rcx+600], r8 + adc rax, QWORD PTR [r10+352] + mov r8, QWORD PTR [rcx+616] + mov QWORD PTR [rcx+608], rax + adc r8, QWORD PTR [r10+360] + mov rax, QWORD PTR [rcx+624] + mov QWORD PTR [rcx+616], r8 + adc rax, QWORD PTR [r10+368] + mov r8, QWORD PTR [rcx+632] + mov QWORD PTR [rcx+624], rax + adc r8, QWORD PTR [r10+376] + mov rax, QWORD PTR [rcx+640] + mov QWORD PTR [rcx+632], r8 + adc rax, QWORD PTR [r10+384] + mov r8, QWORD PTR [rcx+648] + mov QWORD PTR [rcx+640], rax + adc r8, QWORD PTR [r10+392] + mov rax, QWORD PTR [rcx+656] + mov QWORD PTR [rcx+648], r8 + adc rax, QWORD PTR [r10+400] + mov r8, QWORD PTR [rcx+664] + mov QWORD PTR [rcx+656], rax + adc r8, QWORD PTR [r10+408] + mov rax, QWORD PTR [rcx+672] + mov QWORD PTR [rcx+664], r8 + adc rax, QWORD PTR 
[r10+416] + mov r8, QWORD PTR [rcx+680] + mov QWORD PTR [rcx+672], rax + adc r8, QWORD PTR [r10+424] + mov rax, QWORD PTR [rcx+688] + mov QWORD PTR [rcx+680], r8 + adc rax, QWORD PTR [r10+432] + mov r8, QWORD PTR [rcx+696] + mov QWORD PTR [rcx+688], rax + adc r8, QWORD PTR [r10+440] + mov rax, QWORD PTR [rcx+704] + mov QWORD PTR [rcx+696], r8 + adc rax, QWORD PTR [r10+448] + mov r8, QWORD PTR [rcx+712] + mov QWORD PTR [rcx+704], rax + adc r8, QWORD PTR [r10+456] + mov rax, QWORD PTR [rcx+720] + mov QWORD PTR [rcx+712], r8 + adc rax, QWORD PTR [r10+464] + mov r8, QWORD PTR [rcx+728] + mov QWORD PTR [rcx+720], rax + adc r8, QWORD PTR [r10+472] + mov rax, QWORD PTR [rcx+736] + mov QWORD PTR [rcx+728], r8 + adc rax, QWORD PTR [r10+480] + mov r8, QWORD PTR [rcx+744] + mov QWORD PTR [rcx+736], rax + adc r8, QWORD PTR [r10+488] + mov rax, QWORD PTR [rcx+752] + mov QWORD PTR [rcx+744], r8 + adc rax, QWORD PTR [r10+496] + mov r8, QWORD PTR [rcx+760] + mov QWORD PTR [rcx+752], rax + adc r8, QWORD PTR [r10+504] + mov QWORD PTR [rcx+760], r8 + adc r9, 0 + mov QWORD PTR [rcx+768], r9 + ; Add in place + mov rax, QWORD PTR [rcx+512] + xor r9, r9 + add rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rcx+520] + mov QWORD PTR [rcx+512], rax + adc r8, QWORD PTR [rdx+8] + mov rax, QWORD PTR [rcx+528] + mov QWORD PTR [rcx+520], r8 + adc rax, QWORD PTR [rdx+16] + mov r8, QWORD PTR [rcx+536] + mov QWORD PTR [rcx+528], rax + adc r8, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rcx+544] + mov QWORD PTR [rcx+536], r8 + adc rax, QWORD PTR [rdx+32] + mov r8, QWORD PTR [rcx+552] + mov QWORD PTR [rcx+544], rax + adc r8, QWORD PTR [rdx+40] + mov rax, QWORD PTR [rcx+560] + mov QWORD PTR [rcx+552], r8 + adc rax, QWORD PTR [rdx+48] + mov r8, QWORD PTR [rcx+568] + mov QWORD PTR [rcx+560], rax + adc r8, QWORD PTR [rdx+56] + mov rax, QWORD PTR [rcx+576] + mov QWORD PTR [rcx+568], r8 + adc rax, QWORD PTR [rdx+64] + mov r8, QWORD PTR [rcx+584] + mov QWORD PTR [rcx+576], rax + adc r8, QWORD PTR [rdx+72] + mov rax, QWORD PTR [rcx+592] + mov QWORD PTR [rcx+584], r8 + adc rax, QWORD PTR [rdx+80] + mov r8, QWORD PTR [rcx+600] + mov QWORD PTR [rcx+592], rax + adc r8, QWORD PTR [rdx+88] + mov rax, QWORD PTR [rcx+608] + mov QWORD PTR [rcx+600], r8 + adc rax, QWORD PTR [rdx+96] + mov r8, QWORD PTR [rcx+616] + mov QWORD PTR [rcx+608], rax + adc r8, QWORD PTR [rdx+104] + mov rax, QWORD PTR [rcx+624] + mov QWORD PTR [rcx+616], r8 + adc rax, QWORD PTR [rdx+112] + mov r8, QWORD PTR [rcx+632] + mov QWORD PTR [rcx+624], rax + adc r8, QWORD PTR [rdx+120] + mov rax, QWORD PTR [rcx+640] + mov QWORD PTR [rcx+632], r8 + adc rax, QWORD PTR [rdx+128] + mov r8, QWORD PTR [rcx+648] + mov QWORD PTR [rcx+640], rax + adc r8, QWORD PTR [rdx+136] + mov rax, QWORD PTR [rcx+656] + mov QWORD PTR [rcx+648], r8 + adc rax, QWORD PTR [rdx+144] + mov r8, QWORD PTR [rcx+664] + mov QWORD PTR [rcx+656], rax + adc r8, QWORD PTR [rdx+152] + mov rax, QWORD PTR [rcx+672] + mov QWORD PTR [rcx+664], r8 + adc rax, QWORD PTR [rdx+160] + mov r8, QWORD PTR [rcx+680] + mov QWORD PTR [rcx+672], rax + adc r8, QWORD PTR [rdx+168] + mov rax, QWORD PTR [rcx+688] + mov QWORD PTR [rcx+680], r8 + adc rax, QWORD PTR [rdx+176] + mov r8, QWORD PTR [rcx+696] + mov QWORD PTR [rcx+688], rax + adc r8, QWORD PTR [rdx+184] + mov rax, QWORD PTR [rcx+704] + mov QWORD PTR [rcx+696], r8 + adc rax, QWORD PTR [rdx+192] + mov r8, QWORD PTR [rcx+712] + mov QWORD PTR [rcx+704], rax + adc r8, QWORD PTR [rdx+200] + mov rax, QWORD PTR [rcx+720] + mov QWORD PTR [rcx+712], r8 + adc rax, QWORD PTR [rdx+208] + mov r8, 
QWORD PTR [rcx+728] + mov QWORD PTR [rcx+720], rax + adc r8, QWORD PTR [rdx+216] + mov rax, QWORD PTR [rcx+736] + mov QWORD PTR [rcx+728], r8 + adc rax, QWORD PTR [rdx+224] + mov r8, QWORD PTR [rcx+744] + mov QWORD PTR [rcx+736], rax + adc r8, QWORD PTR [rdx+232] + mov rax, QWORD PTR [rcx+752] + mov QWORD PTR [rcx+744], r8 + adc rax, QWORD PTR [rdx+240] + mov r8, QWORD PTR [rcx+760] + mov QWORD PTR [rcx+752], rax + adc r8, QWORD PTR [rdx+248] + mov rax, QWORD PTR [rcx+768] + mov QWORD PTR [rcx+760], r8 + adc rax, QWORD PTR [rdx+256] + mov QWORD PTR [rcx+768], rax + adc r9, 0 + ; Add to zero + mov rax, QWORD PTR [rdx+264] + adc rax, 0 + mov r8, QWORD PTR [rdx+272] + mov QWORD PTR [rcx+776], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+280] + mov QWORD PTR [rcx+784], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+288] + mov QWORD PTR [rcx+792], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+296] + mov QWORD PTR [rcx+800], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+304] + mov QWORD PTR [rcx+808], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+312] + mov QWORD PTR [rcx+816], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+320] + mov QWORD PTR [rcx+824], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+328] + mov QWORD PTR [rcx+832], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+336] + mov QWORD PTR [rcx+840], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+344] + mov QWORD PTR [rcx+848], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+352] + mov QWORD PTR [rcx+856], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+360] + mov QWORD PTR [rcx+864], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+368] + mov QWORD PTR [rcx+872], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+376] + mov QWORD PTR [rcx+880], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+384] + mov QWORD PTR [rcx+888], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+392] + mov QWORD PTR [rcx+896], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+400] + mov QWORD PTR [rcx+904], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+408] + mov QWORD PTR [rcx+912], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+416] + mov QWORD PTR [rcx+920], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+424] + mov QWORD PTR [rcx+928], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+432] + mov QWORD PTR [rcx+936], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+440] + mov QWORD PTR [rcx+944], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+448] + mov QWORD PTR [rcx+952], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+456] + mov QWORD PTR [rcx+960], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+464] + mov QWORD PTR [rcx+968], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+472] + mov QWORD PTR [rcx+976], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+480] + mov QWORD PTR [rcx+984], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+488] + mov QWORD PTR [rcx+992], r8 + adc rax, 0 + mov r8, QWORD PTR [rdx+496] + mov QWORD PTR [rcx+1000], rax + adc r8, 0 + mov rax, QWORD PTR [rdx+504] + mov QWORD PTR [rcx+1008], r8 + adc rax, 0 + mov QWORD PTR [rcx+1016], rax + add rsp, 1304 + pop r12 + ret +sp_4096_sqr_64 ENDP +_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; *