From 23af4e92f368570ce4e5144360d0e09063e5335c Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 29 Jul 2019 11:02:29 +1000 Subject: [PATCH] Special implementation of mod exp when base is 2 in SP --- wolfcrypt/benchmark/benchmark.c | 86 +- wolfcrypt/src/sp_arm32.c | 4339 +++++++++++++++++++++++++------ wolfcrypt/src/sp_arm64.c | 888 ++++++- wolfcrypt/src/sp_armthumb.c | 2553 ++++++++++++------ wolfcrypt/src/sp_c32.c | 498 +++- wolfcrypt/src/sp_c64.c | 366 ++- wolfcrypt/src/sp_cortexm.c | 3551 +++++++++++++------------ wolfcrypt/src/sp_x86_64.c | 602 ++++- wolfcrypt/src/sp_x86_64_asm.S | 282 ++ wolfcrypt/test/test.c | 84 + 10 files changed, 9802 insertions(+), 3447 deletions(-) diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 1797231e2..cd6c3c033 100755 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -499,7 +499,7 @@ static int lng_index = 0; #ifndef NO_MAIN_DRIVER #ifndef MAIN_NO_ARGS -static const char* bench_Usage_msg1[][12] = { +static const char* bench_Usage_msg1[][14] = { /* 0 English */ { "-? Help, print this usage\n 0: English, 1: Japanese\n", "-csv Print terminal output in csv format\n", @@ -508,6 +508,8 @@ static const char* bench_Usage_msg1[][12] = { "-dgst_full Full digest operation performed.\n", "-rsa_sign Measure RSA sign/verify instead of encrypt/decrypt.\n", " -rsa-sz\n Measure RSA performance.\n", + "-ffdhe2048 Measure DH using FFDHE 2048-bit parameters.\n", + "-ffdhe3072 Measure DH using FFDHE 3072-bit parameters.\n", "- Algorithm to benchmark. Available algorithms include:\n", "-lng Display benchmark result by specified language.\n 0: English, 1: Japanese\n", " Size of block in bytes\n", @@ -523,6 +525,8 @@ static const char* bench_Usage_msg1[][12] = { "-dgst_full フルの digest 暗号操作を実施します。\n", "-rsa_sign 暗号/復号化の代わりに RSA の署名/検証を測定します。\n", " -rsa-sz\n RSA の性能を測定します。\n", + "-ffdhe2048 Measure DH using FFDHE 2048-bit parameters.\n", + "-ffdhe3072 Measure DH using FFDHE 3072-bit parameters.\n", "- アルゴリズムのベンチマークを実施します。\n 利用可能なアルゴリズムは下記を含みます:\n", "-lng 指定された言語でベンチマーク結果を表示します。\n 0: 英語、 1: 日本語\n", " ブロックサイズをバイト単位で指定します。\n", @@ -858,6 +862,10 @@ static int digest_stream = 1; /* Don't measure RSA sign/verify by default */ static int rsa_sign_verify = 0; #endif +#ifndef NO_DH +/* Use the FFDHE parameters */ +static int use_ffdhe = 0; +#endif /* Don't print out in CSV format by default */ static int csv_format = 0; @@ -4647,7 +4655,7 @@ void bench_dh(int doAsync) int dhKeySz = BENCH_DH_KEY_SIZE * 8; /* used in printf */ const char**desc = bench_desc_words[lng_index]; #ifndef NO_ASN - size_t bytes; + size_t bytes = 0; word32 idx; #endif word32 pubSz[BENCH_MAX_PENDING]; word32 privSz[BENCH_MAX_PENDING]; word32 pubSz2; word32 privSz2; word32 agreeSz[BENCH_MAX_PENDING]; +#ifdef HAVE_FFDHE_2048 + const DhParams *params = NULL; +#endif DECLARE_ARRAY(pub, byte, BENCH_MAX_PENDING, BENCH_DH_KEY_SIZE, HEAP_HINT); DECLARE_VAR(pub2, byte, BENCH_DH_KEY_SIZE, HEAP_HINT); @@ -4664,24 +4675,38 @@ (void)tmp; + if (!use_ffdhe) { #if defined(NO_ASN) - dhKeySz = 1024; - /* do nothing, but don't use default FILE */ + dhKeySz = 1024; + /* do nothing, but don't use default FILE */ #elif defined(USE_CERT_BUFFERS_1024) - tmp = dh_key_der_1024; - bytes = (size_t)sizeof_dh_key_der_1024; - dhKeySz = 1024; + tmp = dh_key_der_1024; + bytes = (size_t)sizeof_dh_key_der_1024; + dhKeySz = 1024; #elif defined(USE_CERT_BUFFERS_2048) - tmp = dh_key_der_2048; - bytes = (size_t)sizeof_dh_key_der_2048; - 
dhKeySz = 2048; + tmp = dh_key_der_2048; + bytes = (size_t)sizeof_dh_key_der_2048; + dhKeySz = 2048; #elif defined(USE_CERT_BUFFERS_3072) - tmp = dh_key_der_3072; - bytes = (size_t)sizeof_dh_key_der_3072; - dhKeySz = 3072; + tmp = dh_key_der_3072; + bytes = (size_t)sizeof_dh_key_der_3072; + dhKeySz = 3072; #else #error "need to define a cert buffer size" #endif /* USE_CERT_BUFFERS */ + } +#ifdef HAVE_FFDHE_2048 + else if (use_ffdhe == 2048) { + params = wc_Dh_ffdhe2048_Get(); + dhKeySz = 2048; + } +#endif +#ifdef HAVE_FFDHE_3072 + else if (use_ffdhe == 3072) { + params = wc_Dh_ffdhe3072_Get(); + dhKeySz = 3072; + } +#endif /* clear for done cleanup */ XMEMSET(dhKey, 0, sizeof(dhKey)); @@ -4695,11 +4720,20 @@ void bench_dh(int doAsync) goto exit; /* setup key */ + if (!use_ffdhe) { #ifdef NO_ASN - ret = wc_DhSetKey(&dhKey[i], dh_p, sizeof(dh_p), dh_g, sizeof(dh_g)); + ret = wc_DhSetKey(&dhKey[i], dh_p, sizeof(dh_p), dh_g, + sizeof(dh_g)); #else - idx = 0; - ret = wc_DhKeyDecode(tmp, &idx, &dhKey[i], (word32)bytes); + idx = 0; + ret = wc_DhKeyDecode(tmp, &idx, &dhKey[i], (word32)bytes); + #endif + } + #if defined(HAVE_FFDHE_2048) || defined(HAVE_FFDHE_3072) + else if (params != NULL) { + ret = wc_DhSetKey(&dhKey[i], params->p, params->p_len, params->g, + params->g_len); + } #endif if (ret != 0) { printf("DhKeyDecode failed %d, can't benchmark\n", ret); @@ -5682,8 +5716,14 @@ static void Usage(void) printf("%s", bench_Usage_msg1[lng_index][6]); /* option -rsa-sz */ #endif #endif +#if !defined(NO_DH) && defined(HAVE_FFDHE_2048) + printf("%s", bench_Usage_msg1[lng_index][7]); /* option -ffdhe2048 */ +#endif +#if !defined(NO_DH) && defined(HAVE_FFDHE_3072) + printf("%s", bench_Usage_msg1[lng_index][8]); /* option -ffdhe3072 */ +#endif #ifndef WOLFSSL_BENCHMARK_ALL - printf("%s", bench_Usage_msg1[lng_index][7]); /* option - */ + printf("%s", bench_Usage_msg1[lng_index][9]); /* option - */ printf(" "); line = 13; for (i=0; bench_cipher_opt[i].str != NULL; i++) @@ -5706,8 +5746,8 @@ static void Usage(void) print_alg(bench_other_opt[i].str + 1, &line); printf("\n"); #endif - printf("%s", bench_Usage_msg1[lng_index][8]); /* option -lng */ - printf("%s", bench_Usage_msg1[lng_index][9]); /* option */ + printf("%s", bench_Usage_msg1[lng_index][10]); /* option -lng */ + printf("%s", bench_Usage_msg1[lng_index][11]); /* option */ #if defined(WOLFSSL_ASYNC_CRYPT) && !defined(WC_NO_ASYNC_THREADING) - printf("%s", bench_Usage_msg1[lng_index][10]); /* option -threads */ + printf("%s", bench_Usage_msg1[lng_index][12]); /* option -threads */ #endif @@ -5791,6 +5831,14 @@ int main(int argc, char** argv) else if (string_matches(argv[1], "-rsa_sign")) rsa_sign_verify = 1; #endif +#if !defined(NO_DH) && defined(HAVE_FFDHE_2048) + else if (string_matches(argv[1], "-ffdhe2048")) + use_ffdhe = 2048; +#endif +#if !defined(NO_DH) && defined(HAVE_FFDHE_3072) + else if (string_matches(argv[1], "-ffdhe3072")) + use_ffdhe = 3072; +#endif #ifdef BENCH_ASYM else if (string_matches(argv[1], "-csv")) { csv_format = 1; diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 47e82de99..8b28392a5 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -102,14 +102,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -1253,7 +1253,7 @@ static sp_digit
sp_2048_add_16(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1278,7 +1278,7 @@ static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -1310,7 +1310,7 @@ static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[16]; @@ -1635,7 +1635,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1664,7 +1664,7 @@ static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -1696,7 +1696,7 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[32]; @@ -2277,7 +2277,7 @@ static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2306,7 +2306,7 @@ static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -2338,7 +2338,7 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[64]; @@ -2592,7 +2592,7 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -4354,8 +4354,11 @@ static int32_t sp_2048_cmp_32(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "sub r6, r6, #4\n\t" "bcc 1b\n\t" @@ -4373,256 +4376,352 @@ static int32_t sp_2048_cmp_32(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #120]\n\t" "ldr r5, [%[b], #120]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #116]\n\t" "ldr r5, [%[b], #116]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #112]\n\t" "ldr r5, [%[b], #112]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #108]\n\t" "ldr r5, [%[b], #108]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #104]\n\t" "ldr r5, [%[b], #104]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #100]\n\t" "ldr r5, [%[b], #100]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #96]\n\t" "ldr r5, [%[b], #96]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #92]\n\t" "ldr r5, [%[b], #92]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #88]\n\t" "ldr r5, [%[b], #88]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #84]\n\t" "ldr r5, [%[b], #84]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #80]\n\t" "ldr r5, [%[b], #80]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #76]\n\t" "ldr r5, [%[b], #76]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #72]\n\t" "ldr r5, [%[b], #72]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo 
%[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #68]\n\t" "ldr r5, [%[b], #68]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #64]\n\t" "ldr r5, [%[b], #64]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #60]\n\t" "ldr r5, [%[b], #60]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #56]\n\t" "ldr r5, [%[b], #56]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #52]\n\t" "ldr r5, [%[b], #52]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #48]\n\t" "ldr r5, [%[b], #48]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #44]\n\t" "ldr r5, [%[b], #44]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #40]\n\t" "ldr r5, [%[b], #40]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #36]\n\t" "ldr r5, [%[b], #36]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #32]\n\t" "ldr r5, [%[b], #32]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #28]\n\t" "ldr r5, [%[b], #28]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[b], #24]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #20]\n\t" "ldr r5, [%[b], #20]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[b], #16]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[b], #12]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[b], #8]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it 
lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #4]\n\t" "ldr r5, [%[b], #4]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #0]\n\t" "ldr r5, [%[b], #0]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "eor %[r], %[r], r3\n\t" : [r] "+r" (r) @@ -4652,6 +4751,7 @@ static WC_INLINE int sp_2048_div_32(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[31]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); for (i=31; i>=0; i--) { @@ -4760,9 +4860,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -4792,10 +4895,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -4903,9 +5002,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -4936,10 +5038,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -6046,7 +6144,7 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_64(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -6093,8 +6191,11 @@ static int32_t sp_2048_cmp_64(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "sub r6, r6, #4\n\t" "bcc 1b\n\t" @@ -6112,512 +6213,704 @@ static int32_t sp_2048_cmp_64(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #248]\n\t" "ldr r5, [%[b], #248]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #244]\n\t" "ldr r5, [%[b], #244]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #240]\n\t" "ldr r5, [%[b], #240]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #236]\n\t" "ldr r5, [%[b], #236]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #232]\n\t" "ldr r5, [%[b], #232]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #228]\n\t" "ldr r5, [%[b], #228]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #224]\n\t" "ldr r5, [%[b], #224]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #220]\n\t" "ldr r5, [%[b], #220]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #216]\n\t" "ldr r5, [%[b], #216]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #212]\n\t" "ldr r5, [%[b], #212]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #208]\n\t" "ldr r5, [%[b], #208]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #204]\n\t" "ldr r5, [%[b], #204]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #200]\n\t" "ldr r5, [%[b], #200]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi 
%[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #196]\n\t" "ldr r5, [%[b], #196]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #192]\n\t" "ldr r5, [%[b], #192]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #188]\n\t" "ldr r5, [%[b], #188]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #184]\n\t" "ldr r5, [%[b], #184]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #180]\n\t" "ldr r5, [%[b], #180]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #176]\n\t" "ldr r5, [%[b], #176]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #172]\n\t" "ldr r5, [%[b], #172]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #168]\n\t" "ldr r5, [%[b], #168]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #164]\n\t" "ldr r5, [%[b], #164]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #160]\n\t" "ldr r5, [%[b], #160]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #156]\n\t" "ldr r5, [%[b], #156]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #152]\n\t" "ldr r5, [%[b], #152]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #148]\n\t" "ldr r5, [%[b], #148]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #144]\n\t" "ldr r5, [%[b], #144]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #140]\n\t" "ldr r5, [%[b], #140]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #136]\n\t" "ldr r5, [%[b], #136]\n\t" "and r4, r4, r3\n\t" "and r5, r5, 
r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #132]\n\t" "ldr r5, [%[b], #132]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #128]\n\t" "ldr r5, [%[b], #128]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #124]\n\t" "ldr r5, [%[b], #124]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #120]\n\t" "ldr r5, [%[b], #120]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #116]\n\t" "ldr r5, [%[b], #116]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #112]\n\t" "ldr r5, [%[b], #112]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #108]\n\t" "ldr r5, [%[b], #108]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #104]\n\t" "ldr r5, [%[b], #104]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #100]\n\t" "ldr r5, [%[b], #100]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #96]\n\t" "ldr r5, [%[b], #96]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #92]\n\t" "ldr r5, [%[b], #92]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #88]\n\t" "ldr r5, [%[b], #88]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #84]\n\t" "ldr r5, [%[b], #84]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #80]\n\t" "ldr r5, [%[b], #80]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #76]\n\t" "ldr r5, [%[b], #76]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #72]\n\t" "ldr r5, [%[b], #72]\n\t" 
"and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #68]\n\t" "ldr r5, [%[b], #68]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #64]\n\t" "ldr r5, [%[b], #64]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #60]\n\t" "ldr r5, [%[b], #60]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #56]\n\t" "ldr r5, [%[b], #56]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #52]\n\t" "ldr r5, [%[b], #52]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #48]\n\t" "ldr r5, [%[b], #48]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #44]\n\t" "ldr r5, [%[b], #44]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #40]\n\t" "ldr r5, [%[b], #40]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #36]\n\t" "ldr r5, [%[b], #36]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #32]\n\t" "ldr r5, [%[b], #32]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #28]\n\t" "ldr r5, [%[b], #28]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[b], #24]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #20]\n\t" "ldr r5, [%[b], #20]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[b], #16]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[b], #12]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #8]\n\t" "ldr r5, 
[%[b], #8]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #4]\n\t" "ldr r5, [%[b], #4]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #0]\n\t" "ldr r5, [%[b], #0]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "eor %[r], %[r], r3\n\t" : [r] "+r" (r) @@ -6647,6 +6940,7 @@ static WC_INLINE int sp_2048_div_64(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); for (i=63; i>=0; i--) { @@ -6698,6 +6992,7 @@ static WC_INLINE int sp_2048_div_64_cond(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[63]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); for (i=63; i>=0; i--) { @@ -6809,9 +7104,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -6841,10 +7139,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -6952,9 +7246,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -6985,10 +7282,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -7303,7 +7596,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 64; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -7360,6 +7653,507 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +static void sp_2048_lshift_64(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "ldr r3, [%[a], #252]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #248]\n\t" + "str r4, [%[r], #256]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #244]\n\t" + "str r3, [%[r], #252]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #240]\n\t" + "str r2, [%[r], #248]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #236]\n\t" + "str r4, [%[r], 
#244]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #232]\n\t" + "str r3, [%[r], #240]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #228]\n\t" + "str r2, [%[r], #236]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #224]\n\t" + "str r4, [%[r], #232]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #220]\n\t" + "str r3, [%[r], #228]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #216]\n\t" + "str r2, [%[r], #224]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #212]\n\t" + "str r4, [%[r], #220]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #208]\n\t" + "str r3, [%[r], #216]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #204]\n\t" + "str r2, [%[r], #212]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #200]\n\t" + "str r4, [%[r], #208]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #196]\n\t" + "str r3, [%[r], #204]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #192]\n\t" + "str r2, [%[r], #200]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #188]\n\t" + "str r4, [%[r], #196]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #184]\n\t" + "str r3, [%[r], #192]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #180]\n\t" + "str r2, [%[r], #188]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #176]\n\t" + "str r4, [%[r], #184]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #172]\n\t" + "str r3, [%[r], #180]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #168]\n\t" + "str r2, [%[r], #176]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #164]\n\t" + "str r4, [%[r], #172]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #160]\n\t" + "str r3, [%[r], #168]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #156]\n\t" + "str r2, [%[r], #164]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #152]\n\t" + "str r4, [%[r], #160]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #148]\n\t" + "str r3, [%[r], #156]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #144]\n\t" + "str r2, [%[r], #152]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" 
+ "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #140]\n\t" + "str r4, [%[r], #148]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #136]\n\t" + "str r3, [%[r], #144]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #132]\n\t" + "str r2, [%[r], #140]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #128]\n\t" + "str r4, [%[r], #136]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #124]\n\t" + "str r3, [%[r], #132]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #120]\n\t" + "str r2, [%[r], #128]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #116]\n\t" + "str r4, [%[r], #124]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #112]\n\t" + "str r3, [%[r], #120]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #108]\n\t" + "str r2, [%[r], #116]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #104]\n\t" + "str r4, [%[r], #112]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #100]\n\t" + "str r3, [%[r], #108]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #96]\n\t" + "str r2, [%[r], #104]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #92]\n\t" + "str r4, [%[r], #100]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #88]\n\t" + "str r3, [%[r], #96]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #84]\n\t" + "str r2, [%[r], #92]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #80]\n\t" + "str r4, [%[r], #88]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #76]\n\t" + "str r3, [%[r], #84]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #72]\n\t" + "str r2, [%[r], #80]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #68]\n\t" + "str r4, [%[r], #76]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #64]\n\t" + "str r3, [%[r], #72]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #60]\n\t" + "str r2, [%[r], #68]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str 
r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "str r3, [%[r]]\n\t" + "str r4, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_2_64(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[128]; + sp_digit td[65]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 193, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 128; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_2048_lshift_64(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_lshift_64(r, r, y); + sp_2048_mul_d_64(tmp, norm, r[64]); + r[64] = 0; + o = sp_2048_add_64(r, r, tmp); + sp_2048_cond_sub_64(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -7390,7 +8184,13 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 64, exp, expLen); sp_2048_from_mp(m, 64, mod); - err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && m[63] == (sp_digit)-1) + err = sp_2048_mod_exp_2_64(r, e, expLen * 8, m); + else + #endif + err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -7508,14 +8308,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -7590,10 +8390,10 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. 
*/ -static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) +static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( - "sub sp, sp, #32\n\t" + "sub sp, sp, #48\n\t" "mov r10, #0\n\t" "# A[0] * B[0]\n\t" "ldr r8, [%[a], #0]\n\t" @@ -7853,13 +8653,20 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adcs r5, r5, r7\n\t" "adc r3, r3, r10\n\t" "str r4, [sp, #28]\n\t" + "# A[0] * B[8]\n\t" + "ldr r8, [%[a], #0]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r10, r10\n\t" "# A[1] * B[7]\n\t" "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" - "adc r4, r10, r10\n\t" + "adc r4, r4, r10\n\t" "# A[2] * B[6]\n\t" "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -7902,14 +8709,35 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, r10\n\t" - "str r5, [%[r], #32]\n\t" + "# A[8] * B[0]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #0]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "str r5, [sp, #32]\n\t" + "# A[0] * B[9]\n\t" + "ldr r8, [%[a], #0]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r10, r10\n\t" + "# A[1] * B[8]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" "# A[2] * B[7]\n\t" "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" - "adc r5, r10, r10\n\t" + "adc r5, r5, r10\n\t" "# A[3] * B[6]\n\t" "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -7945,14 +8773,49 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, r10\n\t" - "str r3, [%[r], #36]\n\t" + "# A[8] * B[1]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #4]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[9] * B[0]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #0]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "str r3, [sp, #36]\n\t" + "# A[0] * B[10]\n\t" + "ldr r8, [%[a], #0]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r10, r10\n\t" + "# A[1] * B[9]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[2] * B[8]\n\t" + "ldr r8, [%[a], #8]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" "# A[3] * B[7]\n\t" "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" - "adc r3, r10, r10\n\t" + "adc r3, r3, r10\n\t" "# A[4] * B[6]\n\t" "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -7981,14 +8844,63 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, r10\n\t" - "str r4, [%[r], #40]\n\t" + "# A[8] * B[2]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr 
r9, [%[b], #8]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[9] * B[1]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #4]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[10] * B[0]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #0]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "str r4, [sp, #40]\n\t" + "# A[0] * B[11]\n\t" + "ldr r8, [%[a], #0]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r10, r10\n\t" + "# A[1] * B[10]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[2] * B[9]\n\t" + "ldr r8, [%[a], #8]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[3] * B[8]\n\t" + "ldr r8, [%[a], #12]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" "# A[4] * B[7]\n\t" "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" - "adc r4, r10, r10\n\t" + "adc r4, r4, r10\n\t" "# A[5] * B[6]\n\t" "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -8010,14 +8922,70 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, r10\n\t" - "str r5, [%[r], #44]\n\t" + "# A[8] * B[3]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #12]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[9] * B[2]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #8]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[10] * B[1]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #4]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[11] * B[0]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #0]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "str r5, [sp, #44]\n\t" + "# A[1] * B[11]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r10, r10\n\t" + "# A[2] * B[10]\n\t" + "ldr r8, [%[a], #8]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[3] * B[9]\n\t" + "ldr r8, [%[a], #12]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[4] * B[8]\n\t" + "ldr r8, [%[a], #16]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" "# A[5] * B[7]\n\t" "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" - "adc r5, r10, r10\n\t" + "adc r5, r5, r10\n\t" "# A[6] * B[6]\n\t" "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -8032,14 +9000,70 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, 
r5, r10\n\t" + "# A[8] * B[4]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #16]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[9] * B[3]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #12]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[10] * B[2]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #8]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[11] * B[1]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #4]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" "str r3, [%[r], #48]\n\t" + "# A[2] * B[11]\n\t" + "ldr r8, [%[a], #8]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r10, r10\n\t" + "# A[3] * B[10]\n\t" + "ldr r8, [%[a], #12]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[4] * B[9]\n\t" + "ldr r8, [%[a], #16]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[5] * B[8]\n\t" + "ldr r8, [%[a], #20]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" "# A[6] * B[7]\n\t" "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" - "adc r3, r10, r10\n\t" + "adc r3, r3, r10\n\t" "# A[7] * B[6]\n\t" "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #24]\n\t" @@ -8047,15 +9071,359 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, r10\n\t" + "# A[8] * B[5]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #20]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[9] * B[4]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #16]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[10] * B[3]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #12]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[11] * B[2]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #8]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" "str r4, [%[r], #52]\n\t" + "# A[3] * B[11]\n\t" + "ldr r8, [%[a], #12]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r10, r10\n\t" + "# A[4] * B[10]\n\t" + "ldr r8, [%[a], #16]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[5] * B[9]\n\t" + "ldr r8, [%[a], #20]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[6] * B[8]\n\t" + "ldr r8, [%[a], #24]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" "# A[7] * B[7]\n\t" "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #28]\n\t" "umull r6, r7, r8, r9\n\t" "adds r5, r5, r6\n\t" - "adc r3, r3, r7\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, 
r10\n\t" + "# A[8] * B[6]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #24]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[9] * B[5]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #20]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[10] * B[4]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #16]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[11] * B[3]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #12]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" "str r5, [%[r], #56]\n\t" + "# A[4] * B[11]\n\t" + "ldr r8, [%[a], #16]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r10, r10\n\t" + "# A[5] * B[10]\n\t" + "ldr r8, [%[a], #20]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[6] * B[9]\n\t" + "ldr r8, [%[a], #24]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[7] * B[8]\n\t" + "ldr r8, [%[a], #28]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[8] * B[7]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #28]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[9] * B[6]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #24]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[10] * B[5]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #20]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[11] * B[4]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #16]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" "str r3, [%[r], #60]\n\t" + "# A[5] * B[11]\n\t" + "ldr r8, [%[a], #20]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r10, r10\n\t" + "# A[6] * B[10]\n\t" + "ldr r8, [%[a], #24]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[7] * B[9]\n\t" + "ldr r8, [%[a], #28]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[8] * B[8]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[9] * B[7]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #28]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[10] * B[6]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #24]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[11] * B[5]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #20]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "str r4, [%[r], #64]\n\t" + "# A[6] * B[11]\n\t" + "ldr r8, [%[a], #24]\n\t" + 
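/* Note: throughout this schoolbook 12x12 multiply, r3/r4/r5 act as a rotating
 * 96-bit column accumulator. r10 stays zero, so "adc rX, r10, r10" opens a
 * fresh top word holding only the carry, and each finished column is stored
 * to the stack (low half) or straight to r[] (high half) before the
 * registers rotate. */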
"ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r10, r10\n\t" + "# A[7] * B[10]\n\t" + "ldr r8, [%[a], #28]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[8] * B[9]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[9] * B[8]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[10] * B[7]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #28]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[11] * B[6]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #24]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "str r5, [%[r], #68]\n\t" + "# A[7] * B[11]\n\t" + "ldr r8, [%[a], #28]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r10, r10\n\t" + "# A[8] * B[10]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[9] * B[9]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[10] * B[8]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "# A[11] * B[7]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #28]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "str r3, [%[r], #72]\n\t" + "# A[8] * B[11]\n\t" + "ldr r8, [%[a], #32]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r10, r10\n\t" + "# A[9] * B[10]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[10] * B[9]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "# A[11] * B[8]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #32]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r7\n\t" + "adc r3, r3, r10\n\t" + "str r4, [%[r], #76]\n\t" + "# A[9] * B[11]\n\t" + "ldr r8, [%[a], #36]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r10, r10\n\t" + "# A[10] * B[10]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "# A[11] * B[9]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #36]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r7\n\t" + "adc r4, r4, r10\n\t" + "str r5, [%[r], #80]\n\t" + "# A[10] * B[11]\n\t" + "ldr r8, [%[a], #40]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r10, r10\n\t" + "# A[11] * B[10]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], 
#40]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, r10\n\t" + "str r3, [%[r], #84]\n\t" + "# A[11] * B[11]\n\t" + "ldr r8, [%[a], #44]\n\t" + "ldr r9, [%[b], #44]\n\t" + "umull r6, r7, r8, r9\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" "ldr r3, [sp, #0]\n\t" "ldr r4, [sp, #4]\n\t" "ldr r5, [sp, #8]\n\t" @@ -8072,7 +9440,15 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "str r4, [%[r], #20]\n\t" "str r5, [%[r], #24]\n\t" "str r6, [%[r], #28]\n\t" - "add sp, sp, #32\n\t" + "ldr r3, [sp, #32]\n\t" + "ldr r4, [sp, #36]\n\t" + "ldr r5, [sp, #40]\n\t" + "ldr r6, [sp, #44]\n\t" + "str r3, [%[r], #32]\n\t" + "str r4, [%[r], #36]\n\t" + "str r5, [%[r], #40]\n\t" + "str r6, [%[r], #44]\n\t" + "add sp, sp, #48\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" @@ -8084,10 +9460,10 @@ static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) * r A single precision integer. * a A single precision integer. */ -static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) +static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( - "sub sp, sp, #32\n\t" + "sub sp, sp, #48\n\t" "mov r14, #0\n\t" "# A[0] * A[0]\n\t" "ldr r10, [%[a], #0]\n\t" @@ -8264,12 +9640,19 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "adcs r4, r4, r6\n\t" "adc r2, r2, r7\n\t" "str r3, [sp, #28]\n\t" - "# A[1] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #4]\n\t" + "# A[0] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #0]\n\t" "umull r5, r6, r10, r8\n\t" "mov r3, #0\n\t" "mov r7, #0\n\t" + "# A[1] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[2] * A[6]\n\t" "ldr r10, [%[a], #24]\n\t" "ldr r8, [%[a], #8]\n\t" @@ -8296,13 +9679,27 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "adds r4, r4, r5\n\t" "adcs r2, r2, r6\n\t" "adc r3, r3, r7\n\t" - "str r4, [%[r], #32]\n\t" - "# A[2] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #8]\n\t" + "str r4, [sp, #32]\n\t" + "# A[0] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #0]\n\t" "umull r5, r6, r10, r8\n\t" "mov r4, #0\n\t" "mov r7, #0\n\t" + "# A[1] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[2] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[3] * A[6]\n\t" "ldr r10, [%[a], #24]\n\t" "ldr r8, [%[a], #12]\n\t" @@ -8323,75 +9720,354 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "adds r2, r2, r5\n\t" "adcs r3, r3, r6\n\t" "adc r4, r4, r7\n\t" - "str r2, [%[r], #36]\n\t" + "str r2, [sp, #36]\n\t" + "# A[0] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r2, #0\n\t" + "mov r7, #0\n\t" + "# A[1] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[2] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, 
r7, r14\n\t" "# A[3] * A[7]\n\t" "ldr r10, [%[a], #28]\n\t" "ldr r8, [%[a], #12]\n\t" "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r14, r14\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r14\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[4] * A[6]\n\t" "ldr r10, [%[a], #24]\n\t" "ldr r8, [%[a], #16]\n\t" "umull r8, r9, r10, r8\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r14\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r14\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[5] * A[5]\n\t" "ldr r10, [%[a], #20]\n\t" "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, r14\n\t" - "str r3, [%[r], #40]\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc r2, r2, r7\n\t" + "str r3, [sp, #40]\n\t" + "# A[0] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #0]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r3, #0\n\t" + "mov r7, #0\n\t" + "# A[1] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[2] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[3] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[4] * A[7]\n\t" "ldr r10, [%[a], #28]\n\t" "ldr r8, [%[a], #16]\n\t" "umull r8, r9, r10, r8\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r14, r14\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r14\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[5] * A[6]\n\t" "ldr r10, [%[a], #24]\n\t" "ldr r8, [%[a], #20]\n\t" "umull r8, r9, r10, r8\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r14\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, r14\n\t" - "str r4, [%[r], #44]\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r4, r4, r5\n\t" + "adcs r2, r2, r6\n\t" + "adc r3, r3, r7\n\t" + "str r4, [sp, #44]\n\t" + "# A[1] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r4, #0\n\t" + "mov r7, #0\n\t" + "# A[2] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[3] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[4] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" "# A[5] * A[7]\n\t" "ldr r10, [%[a], #28]\n\t" "ldr r8, [%[a], #20]\n\t" "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[6] * A[6]\n\t" + "ldr r10, [%[a], #24]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r5, r5, r5\n\t" + 
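/* Squaring halves the multiply count: the off-diagonal products summed in
 * r5/r6/r7 each occur twice in a*a, so the partial sum is doubled here
 * (adds/adcs/adc) and only then is the diagonal term A[6]*A[6], held in
 * r8/r9, added in once. */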
"adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "str r2, [%[r], #48]\n\t" + "# A[2] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r2, #0\n\t" + "mov r7, #0\n\t" + "# A[3] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[4] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[5] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[6] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc r2, r2, r7\n\t" + "str r3, [%[r], #52]\n\t" + "# A[3] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r3, #0\n\t" + "mov r7, #0\n\t" + "# A[4] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[5] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[6] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[7] * A[7]\n\t" + "ldr r10, [%[a], #28]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r4, r4, r5\n\t" + "adcs r2, r2, r6\n\t" + "adc r3, r3, r7\n\t" + "str r4, [%[r], #56]\n\t" + "# A[4] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r4, #0\n\t" + "mov r7, #0\n\t" + "# A[5] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[6] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[7] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "str r2, [%[r], #60]\n\t" + "# A[5] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r2, #0\n\t" + "mov r7, #0\n\t" + "# A[6] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[7] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull 
r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[8] * A[8]\n\t" + "ldr r10, [%[a], #32]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc r2, r2, r7\n\t" + "str r3, [%[r], #64]\n\t" + "# A[6] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r5, r6, r10, r8\n\t" + "mov r3, #0\n\t" + "mov r7, #0\n\t" + "# A[7] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "# A[8] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" + "ldr r8, [%[a], #32]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r5, r5, r8\n\t" + "adcs r6, r6, r9\n\t" + "adc r7, r7, r14\n\t" + "adds r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + "adds r4, r4, r5\n\t" + "adcs r2, r2, r6\n\t" + "adc r3, r3, r7\n\t" + "str r4, [%[r], #68]\n\t" + "# A[7] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r8, r9, r10, r8\n\t" "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r14, r14\n\t" "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, r14\n\t" - "# A[6] * A[6]\n\t" - "ldr r10, [%[a], #24]\n\t" + "# A[8] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #32]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r14\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r14\n\t" + "# A[9] * A[9]\n\t" + "ldr r10, [%[a], #36]\n\t" "umull r8, r9, r10, r10\n\t" "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, r14\n\t" - "str r2, [%[r], #48]\n\t" - "# A[6] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" - "ldr r8, [%[a], #24]\n\t" + "str r2, [%[r], #72]\n\t" + "# A[8] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #32]\n\t" "umull r8, r9, r10, r8\n\t" "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" @@ -8399,14 +10075,52 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, r14\n\t" - "str r3, [%[r], #52]\n\t" - "# A[7] * A[7]\n\t" - "ldr r10, [%[a], #28]\n\t" + "# A[9] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" + "ldr r8, [%[a], #36]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r14\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r2, r2, r14\n\t" + "str r3, [%[r], #76]\n\t" + "# A[9] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #36]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r14, r14\n\t" + "adds r4, r4, r8\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r14\n\t" + "# A[10] * A[10]\n\t" + "ldr r10, [%[a], #40]\n\t" "umull r8, r9, r10, r10\n\t" "adds r4, r4, r8\n\t" - "adc r2, r2, r9\n\t" - "str r4, [%[r], #56]\n\t" - "str r2, [%[r], #60]\n\t" + "adcs r2, r2, r9\n\t" + "adc r3, r3, r14\n\t" + "str r4, [%[r], #80]\n\t" + "# A[10] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "ldr r8, [%[a], #40]\n\t" + "umull r8, r9, r10, r8\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r14, r14\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, r9\n\t" + "adc r4, r4, r14\n\t" + "str r2, [%[r], #84]\n\t" + "# A[11] * A[11]\n\t" + "ldr r10, [%[a], #44]\n\t" + "umull r8, r9, r10, r10\n\t" + "adds r3, r3, r8\n\t" + "adc r4, r4, r9\n\t" + "str r3, [%[r], #88]\n\t" + "str 
r4, [%[r], #92]\n\t" "ldr r2, [sp, #0]\n\t" "ldr r3, [sp, #4]\n\t" "ldr r4, [sp, #8]\n\t" @@ -8423,7 +10137,15 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "str r3, [%[r], #20]\n\t" "str r4, [%[r], #24]\n\t" "str r8, [%[r], #28]\n\t" - "add sp, sp, #32\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r3, [sp, #36]\n\t" + "ldr r4, [sp, #40]\n\t" + "ldr r8, [sp, #44]\n\t" + "str r2, [%[r], #32]\n\t" + "str r3, [%[r], #36]\n\t" + "str r4, [%[r], #40]\n\t" + "str r8, [%[r], #44]\n\t" + "add sp, sp, #48\n\t" : : [r] "r" (r), [a] "r" (a) : "memory", "r2", "r3", "r4", "r8", "r9", "r10", "r8", "r5", "r6", "r7", "r14" @@ -8436,7 +10158,7 @@ static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, +static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -8475,6 +10197,22 @@ static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, "str r5, [%[r], #20]\n\t" "str r6, [%[r], #24]\n\t" "str r7, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[a], #40]\n\t" + "ldr r7, [%[a], #44]\n\t" + "ldr r8, [%[b], #32]\n\t" + "ldr r9, [%[b], #36]\n\t" + "ldr r10, [%[b], #40]\n\t" + "ldr r14, [%[b], #44]\n\t" + "adcs r4, r4, r8\n\t" + "adcs r5, r5, r9\n\t" + "adcs r6, r6, r10\n\t" + "adcs r7, r7, r14\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "str r6, [%[r], #40]\n\t" + "str r7, [%[r], #44]\n\t" "adc %[c], r12, r12\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b) @@ -8489,7 +10227,7 @@ static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, * a A single precision integer and result. * b A single precision integer. */ -static sp_digit sp_3072_sub_in_place_16(sp_digit* a, const sp_digit* b) +static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -8558,6 +10296,38 @@ static sp_digit sp_3072_sub_in_place_16(sp_digit* a, const sp_digit* b) "str r3, [%[a], #52]\n\t" "str r4, [%[a], #56]\n\t" "str r5, [%[a], #60]\n\t" + "ldr r2, [%[a], #64]\n\t" + "ldr r3, [%[a], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" + "ldr r8, [%[b], #72]\n\t" + "ldr r9, [%[b], #76]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #64]\n\t" + "str r3, [%[a], #68]\n\t" + "str r4, [%[a], #72]\n\t" + "str r5, [%[a], #76]\n\t" + "ldr r2, [%[a], #80]\n\t" + "ldr r3, [%[a], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" + "ldr r8, [%[b], #88]\n\t" + "ldr r9, [%[b], #92]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #80]\n\t" + "str r3, [%[a], #84]\n\t" + "str r4, [%[a], #88]\n\t" + "str r5, [%[a], #92]\n\t" "sbc %[c], r9, r9\n\t" : [c] "+r" (c) : [a] "r" (a), [b] "r" (b) @@ -8573,327 +10343,7 @@ static sp_digit sp_3072_sub_in_place_16(sp_digit* a, const sp_digit* b) * a A single precision integer. * b A single precision integer. 
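 * The words are handled four at a time: the first group is combined with
 * ADDS, every later group chains the flag through ADCS, and the closing
 * "adc %[c], r12, r12" (r12 is zeroed on entry) returns the carry-out.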
*/ -static sp_digit sp_3072_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldr r4, [%[a], #0]\n\t" - "ldr r5, [%[a], #4]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" - "ldr r8, [%[b], #0]\n\t" - "ldr r9, [%[b], #4]\n\t" - "ldr r10, [%[b], #8]\n\t" - "ldr r14, [%[b], #12]\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #4]\n\t" - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" - "ldr r8, [%[b], #16]\n\t" - "ldr r9, [%[b], #20]\n\t" - "ldr r10, [%[b], #24]\n\t" - "ldr r14, [%[b], #28]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" - "str r6, [%[r], #24]\n\t" - "str r7, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[a], #36]\n\t" - "ldr r6, [%[a], #40]\n\t" - "ldr r7, [%[a], #44]\n\t" - "ldr r8, [%[b], #32]\n\t" - "ldr r9, [%[b], #36]\n\t" - "ldr r10, [%[b], #40]\n\t" - "ldr r14, [%[b], #44]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #32]\n\t" - "str r5, [%[r], #36]\n\t" - "str r6, [%[r], #40]\n\t" - "str r7, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[a], #52]\n\t" - "ldr r6, [%[a], #56]\n\t" - "ldr r7, [%[a], #60]\n\t" - "ldr r8, [%[b], #48]\n\t" - "ldr r9, [%[b], #52]\n\t" - "ldr r10, [%[b], #56]\n\t" - "ldr r14, [%[b], #60]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #48]\n\t" - "str r5, [%[r], #52]\n\t" - "str r6, [%[r], #56]\n\t" - "str r7, [%[r], #60]\n\t" - "adc %[c], r12, r12\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" - ); - - return c; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_8(sp_digit* r, sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) - r[i] = a[i] & m; -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static void sp_3072_mul_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u, ca, cb; - - ca = sp_3072_add_8(a1, a, &a[8]); - cb = sp_3072_add_8(b1, b, &b[8]); - u = ca & cb; - sp_3072_mul_8(z1, a1, b1); - sp_3072_mul_8(z2, &a[8], &b[8]); - sp_3072_mul_8(z0, a, b); - sp_3072_mask_8(r + 16, a1, 0 - cb); - sp_3072_mask_8(b1, b1, 0 - ca); - u += sp_3072_add_8(r + 16, r + 16, b1); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -static void sp_3072_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_3072_add_8(a1, a, &a[8]); - sp_3072_sqr_8(z1, a1); - sp_3072_sqr_8(z2, &a[8]); - sp_3072_sqr_8(z0, a); - sp_3072_mask_8(r + 16, a1, 0 - u); - u += sp_3072_add_8(r + 16, r + 16, r + 16); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[a], #8]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #0]\n\t" - "ldr r8, [%[b], #4]\n\t" - "ldr r9, [%[b], #8]\n\t" - "ldr r10, [%[b], #12]\n\t" - "subs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #24]\n\t" - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #16]\n\t" - "ldr r8, [%[b], #20]\n\t" - "ldr r9, [%[b], #24]\n\t" - "ldr r10, [%[b], #28]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" - "ldr r3, [%[a], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[a], #40]\n\t" - "ldr r6, [%[a], #44]\n\t" - "ldr r7, [%[b], #32]\n\t" - "ldr r8, [%[b], #36]\n\t" - "ldr r9, [%[b], #40]\n\t" - "ldr r10, [%[b], #44]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #32]\n\t" - "str r4, [%[r], #36]\n\t" - "str r5, [%[r], #40]\n\t" - "str r6, [%[r], #44]\n\t" - "ldr r3, [%[a], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[a], #56]\n\t" - "ldr r6, [%[a], #60]\n\t" - "ldr r7, [%[b], #48]\n\t" - "ldr r8, [%[b], #52]\n\t" - "ldr r9, [%[b], #56]\n\t" - "ldr r10, [%[b], #60]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #48]\n\t" - "str r4, [%[r], #52]\n\t" - "str r5, [%[r], #56]\n\t" - "str r6, [%[r], #60]\n\t" - "ldr r3, [%[a], #64]\n\t" - "ldr r4, [%[a], #68]\n\t" - "ldr r5, [%[a], #72]\n\t" - "ldr r6, [%[a], #76]\n\t" - "ldr r7, [%[b], #64]\n\t" - "ldr r8, [%[b], #68]\n\t" - "ldr r9, [%[b], #72]\n\t" - "ldr r10, [%[b], #76]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #64]\n\t" - "str r4, [%[r], #68]\n\t" - "str r5, [%[r], #72]\n\t" - "str r6, [%[r], #76]\n\t" - "ldr r3, [%[a], #80]\n\t" - "ldr r4, [%[a], #84]\n\t" - "ldr r5, [%[a], #88]\n\t" - "ldr r6, [%[a], #92]\n\t" - "ldr r7, [%[b], #80]\n\t" - "ldr r8, [%[b], #84]\n\t" - "ldr r9, [%[b], #88]\n\t" - "ldr r10, [%[b], #92]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #80]\n\t" - "str r4, [%[r], #84]\n\t" - "str r5, [%[r], #88]\n\t" - "str r6, [%[r], #92]\n\t" - "ldr r3, [%[a], #96]\n\t" - "ldr r4, [%[a], #100]\n\t" - "ldr r5, [%[a], #104]\n\t" - "ldr 
r6, [%[a], #108]\n\t" - "ldr r7, [%[b], #96]\n\t" - "ldr r8, [%[b], #100]\n\t" - "ldr r9, [%[b], #104]\n\t" - "ldr r10, [%[b], #108]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #96]\n\t" - "str r4, [%[r], #100]\n\t" - "str r5, [%[r], #104]\n\t" - "str r6, [%[r], #108]\n\t" - "ldr r3, [%[a], #112]\n\t" - "ldr r4, [%[a], #116]\n\t" - "ldr r5, [%[a], #120]\n\t" - "ldr r6, [%[a], #124]\n\t" - "ldr r7, [%[b], #112]\n\t" - "ldr r8, [%[b], #116]\n\t" - "ldr r9, [%[b], #120]\n\t" - "ldr r10, [%[b], #124]\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[r], #112]\n\t" - "str r4, [%[r], #116]\n\t" - "str r5, [%[r], #120]\n\t" - "str r6, [%[r], #124]\n\t" - "sbc %[c], %[c], #0\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, +static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -8996,38 +10446,6 @@ static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, "str r5, [%[r], #84]\n\t" "str r6, [%[r], #88]\n\t" "str r7, [%[r], #92]\n\t" - "ldr r4, [%[a], #96]\n\t" - "ldr r5, [%[a], #100]\n\t" - "ldr r6, [%[a], #104]\n\t" - "ldr r7, [%[a], #108]\n\t" - "ldr r8, [%[b], #96]\n\t" - "ldr r9, [%[b], #100]\n\t" - "ldr r10, [%[b], #104]\n\t" - "ldr r14, [%[b], #108]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #96]\n\t" - "str r5, [%[r], #100]\n\t" - "str r6, [%[r], #104]\n\t" - "str r7, [%[r], #108]\n\t" - "ldr r4, [%[a], #112]\n\t" - "ldr r5, [%[a], #116]\n\t" - "ldr r6, [%[a], #120]\n\t" - "ldr r7, [%[a], #124]\n\t" - "ldr r8, [%[b], #112]\n\t" - "ldr r9, [%[b], #116]\n\t" - "ldr r10, [%[b], #120]\n\t" - "ldr r14, [%[b], #124]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #112]\n\t" - "str r5, [%[r], #116]\n\t" - "str r6, [%[r], #120]\n\t" - "str r7, [%[r], #124]\n\t" "adc %[c], r12, r12\n\t" : [c] "+r" (c) : [r] "r" (r), [a] "r" (a), [b] "r" (b) @@ -9037,96 +10455,304 @@ static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<12; i++) + r[i] = a[i] & m; +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; + r[8] = a[8] & m; + r[9] = a[9] & m; + r[10] = a[10] & m; + r[11] = a[11] & m; +#endif +} + /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
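 * Karatsuba: a and b are split into 12-digit (384-bit) halves, three
 * half-size multiplies give z0 = a0*b0, z2 = a1*b1 and z1 = (a0+a1)*(b0+b1),
 * and the result is recombined as z0 + (z1 - z0 - z2)*2^384 + z2*2^768.
 * The carry bits ca and cb out of a0+a1 and b0+b1 are folded back in by the
 * masked adds on r + 24.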
*/ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_digit b0[16]; - sp_digit b1[16]; - sp_digit b2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(b0, b, &b[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(b1, &b[16], &b[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_add_16(b2, b0, &b[32]); - sp_3072_mul_16(p0, a, b); - sp_3072_mul_16(p2, &a[16], &b[16]); - sp_3072_mul_16(p4, &a[32], &b[32]); - sp_3072_mul_16(p1, a0, b0); - sp_3072_mul_16(p3, a1, b1); - sp_3072_mul_16(p5, a2, b2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit z2[24]; + sp_digit u, ca, cb; + + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + sp_3072_mul_12(z1, a1, b1); + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mask_12(r + 24, a1, 0 - cb); + sp_3072_mask_12(b1, b1, 0 - ca); + u += sp_3072_add_12(r + 24, r + 24, b1); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); } -/* Square a into r. (r = a * a) +/* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
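 * Squaring needs only three half-size squares: z0 = a0^2, z2 = a1^2 and
 * z1 = (a0+a1)^2. Writing a0+a1 = u*2^384 + s, the square contributes
 * 2*u*s at digit 24 and u^2 at digit 36, which is why s (left in a1) is
 * masked by 0-u, doubled into r + 24, and the final carry u lands in r[36].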
*/ -SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_sqr_16(p0, a); - sp_3072_sqr_16(p2, &a[16]); - sp_3072_sqr_16(p4, &a[32]); - sp_3072_sqr_16(p1, a0); - sp_3072_sqr_16(p3, a1); - sp_3072_sqr_16(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z2[24]; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit u; + + u = sp_3072_add_12(a1, a, &a[12]); + sp_3072_sqr_12(z1, a1); + sp_3072_sqr_12(z2, &a[12]); + sp_3072_sqr_12(z0, a); + sp_3072_mask_12(r + 24, a1, 0 - u); + u += sp_3072_add_12(r + 24, r + 24, r + 24); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r2, [%[a], #0]\n\t" + "ldr r3, [%[a], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r7, [%[b], #4]\n\t" + "ldr r8, [%[b], #8]\n\t" + "ldr r9, [%[b], #12]\n\t" + "subs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #0]\n\t" + "str r3, [%[a], #4]\n\t" + "str r4, [%[a], #8]\n\t" + "str r5, [%[a], #12]\n\t" + "ldr r2, [%[a], #16]\n\t" + "ldr r3, [%[a], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" + "ldr r8, [%[b], #24]\n\t" + "ldr r9, [%[b], #28]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #16]\n\t" + "str r3, [%[a], #20]\n\t" + "str r4, [%[a], #24]\n\t" + "str r5, [%[a], #28]\n\t" + "ldr r2, [%[a], #32]\n\t" + "ldr r3, [%[a], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" + "ldr r8, [%[b], #40]\n\t" + "ldr r9, [%[b], #44]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #32]\n\t" + "str r3, [%[a], #36]\n\t" + "str r4, [%[a], #40]\n\t" + "str r5, [%[a], #44]\n\t" + "ldr r2, [%[a], #48]\n\t" + "ldr r3, [%[a], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" + "ldr r8, [%[b], #56]\n\t" + "ldr r9, [%[b], #60]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #48]\n\t" + "str r3, [%[a], #52]\n\t" + "str r4, [%[a], #56]\n\t" + "str r5, [%[a], #60]\n\t" + "ldr r2, [%[a], 
#64]\n\t" + "ldr r3, [%[a], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" + "ldr r8, [%[b], #72]\n\t" + "ldr r9, [%[b], #76]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #64]\n\t" + "str r3, [%[a], #68]\n\t" + "str r4, [%[a], #72]\n\t" + "str r5, [%[a], #76]\n\t" + "ldr r2, [%[a], #80]\n\t" + "ldr r3, [%[a], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" + "ldr r8, [%[b], #88]\n\t" + "ldr r9, [%[b], #92]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #80]\n\t" + "str r3, [%[a], #84]\n\t" + "str r4, [%[a], #88]\n\t" + "str r5, [%[a], #92]\n\t" + "ldr r2, [%[a], #96]\n\t" + "ldr r3, [%[a], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" + "ldr r8, [%[b], #104]\n\t" + "ldr r9, [%[b], #108]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #96]\n\t" + "str r3, [%[a], #100]\n\t" + "str r4, [%[a], #104]\n\t" + "str r5, [%[a], #108]\n\t" + "ldr r2, [%[a], #112]\n\t" + "ldr r3, [%[a], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" + "ldr r8, [%[b], #120]\n\t" + "ldr r9, [%[b], #124]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #112]\n\t" + "str r3, [%[a], #116]\n\t" + "str r4, [%[a], #120]\n\t" + "str r5, [%[a], #124]\n\t" + "ldr r2, [%[a], #128]\n\t" + "ldr r3, [%[a], #132]\n\t" + "ldr r4, [%[a], #136]\n\t" + "ldr r5, [%[a], #140]\n\t" + "ldr r6, [%[b], #128]\n\t" + "ldr r7, [%[b], #132]\n\t" + "ldr r8, [%[b], #136]\n\t" + "ldr r9, [%[b], #140]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #128]\n\t" + "str r3, [%[a], #132]\n\t" + "str r4, [%[a], #136]\n\t" + "str r5, [%[a], #140]\n\t" + "ldr r2, [%[a], #144]\n\t" + "ldr r3, [%[a], #148]\n\t" + "ldr r4, [%[a], #152]\n\t" + "ldr r5, [%[a], #156]\n\t" + "ldr r6, [%[b], #144]\n\t" + "ldr r7, [%[b], #148]\n\t" + "ldr r8, [%[b], #152]\n\t" + "ldr r9, [%[b], #156]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #144]\n\t" + "str r3, [%[a], #148]\n\t" + "str r4, [%[a], #152]\n\t" + "str r5, [%[a], #156]\n\t" + "ldr r2, [%[a], #160]\n\t" + "ldr r3, [%[a], #164]\n\t" + "ldr r4, [%[a], #168]\n\t" + "ldr r5, [%[a], #172]\n\t" + "ldr r6, [%[b], #160]\n\t" + "ldr r7, [%[b], #164]\n\t" + "ldr r8, [%[b], #168]\n\t" + "ldr r9, [%[b], #172]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #160]\n\t" + "str r3, [%[a], #164]\n\t" + "str r4, [%[a], #168]\n\t" + "str r5, [%[a], #172]\n\t" + "ldr r2, [%[a], #176]\n\t" + "ldr r3, [%[a], #180]\n\t" + "ldr r4, [%[a], #184]\n\t" + "ldr r5, [%[a], #188]\n\t" + "ldr r6, [%[b], #176]\n\t" + "ldr r7, [%[b], #180]\n\t" + "ldr r8, [%[b], #184]\n\t" + "ldr r9, [%[b], #188]\n\t" + "sbcs r2, r2, r6\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "str r2, [%[a], #176]\n\t" + "str r3, [%[a], #180]\n\t" + "str r4, [%[a], #184]\n\t" + "str r5, [%[a], #188]\n\t" + "sbc %[c], r9, r9\n\t" + : [c] "+r" (c) + : 
[a] "r" (a), [b] "r" (b) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); + + return c; } /* Add b to a into r. (r = a + b) @@ -9343,6 +10969,95 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) + r[i] = a[i] & m; +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit b1[24]; + sp_digit z2[48]; + sp_digit u, ca, cb; + + ca = sp_3072_add_24(a1, a, &a[24]); + cb = sp_3072_add_24(b1, b, &b[24]); + u = ca & cb; + sp_3072_mul_24(z1, a1, b1); + sp_3072_mul_24(z2, &a[24], &b[24]); + sp_3072_mul_24(z0, a, b); + sp_3072_mask_24(r + 48, a1, 0 - cb); + sp_3072_mask_24(b1, b1, 0 - ca); + u += sp_3072_add_24(r + 48, r + 48, b1); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z2[48]; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit u; + + u = sp_3072_add_24(a1, a, &a[24]); + sp_3072_sqr_24(z1, a1); + sp_3072_sqr_24(z2, &a[24]); + sp_3072_sqr_24(z0, a); + sp_3072_mask_24(r + 48, a1, 0 - u); + u += sp_3072_add_24(r + 48, r + 48, r + 48); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -10158,7 +11873,7 @@ static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -10187,7 +11902,7 @@ static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -10219,7 +11934,7 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. 
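 * This is the top Karatsuba level: 96 digits (3072 bits) split into two
 * 48-digit halves and reduced through the 48- and 24-digit routines down to
 * the 12-digit assembly primitive. SP_NOINLINE is presumably added so these
 * stack-heavy levels are not inlined into their callers.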
*/ -static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[96]; @@ -10473,7 +12188,7 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -10526,6 +12241,49 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r14, #0\n\t" + "add r12, %[a], #192\n\t" + "\n1:\n\t" + "subs %[c], r14, %[c]\n\t" + "ldr r3, [%[a]]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[a], #8]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b]], #4\n\t" + "ldr r8, [%[b]], #4\n\t" + "ldr r9, [%[b]], #4\n\t" + "ldr r10, [%[b]], #4\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[a]], #4\n\t" + "str r4, [%[a]], #4\n\t" + "str r5, [%[a]], #4\n\t" + "str r6, [%[a]], #4\n\t" + "sbc %[c], r14, r14\n\t" + "cmp %[a], r12\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14" + ); + + return c; +} + #endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) @@ -11495,261 +13253,6 @@ static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, } #if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov r14, #0\n\t" - "add r12, %[a], #192\n\t" - "\n1:\n\t" - "subs %[c], r14, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[a], #8]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b]], #4\n\t" - "ldr r8, [%[b]], #4\n\t" - "ldr r9, [%[b]], #4\n\t" - "ldr r10, [%[b]], #4\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "str r3, [%[a]], #4\n\t" - "str r4, [%[a]], #4\n\t" - "str r5, [%[a]], #4\n\t" - "str r6, [%[a]], #4\n\t" - "sbc %[c], r14, r14\n\t" - "cmp %[a], r12\n\t" - "bne 1b\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14" - ); - - return c; -} - -#else -/* Sub b from a into a. (a -= b) - * - * a A single precision integer and result. - * b A single precision integer. 
- */ -static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r2, [%[a], #0]\n\t" - "ldr r3, [%[a], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" - "ldr r6, [%[b], #0]\n\t" - "ldr r7, [%[b], #4]\n\t" - "ldr r8, [%[b], #8]\n\t" - "ldr r9, [%[b], #12]\n\t" - "subs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #0]\n\t" - "str r3, [%[a], #4]\n\t" - "str r4, [%[a], #8]\n\t" - "str r5, [%[a], #12]\n\t" - "ldr r2, [%[a], #16]\n\t" - "ldr r3, [%[a], #20]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #28]\n\t" - "ldr r6, [%[b], #16]\n\t" - "ldr r7, [%[b], #20]\n\t" - "ldr r8, [%[b], #24]\n\t" - "ldr r9, [%[b], #28]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #16]\n\t" - "str r3, [%[a], #20]\n\t" - "str r4, [%[a], #24]\n\t" - "str r5, [%[a], #28]\n\t" - "ldr r2, [%[a], #32]\n\t" - "ldr r3, [%[a], #36]\n\t" - "ldr r4, [%[a], #40]\n\t" - "ldr r5, [%[a], #44]\n\t" - "ldr r6, [%[b], #32]\n\t" - "ldr r7, [%[b], #36]\n\t" - "ldr r8, [%[b], #40]\n\t" - "ldr r9, [%[b], #44]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #32]\n\t" - "str r3, [%[a], #36]\n\t" - "str r4, [%[a], #40]\n\t" - "str r5, [%[a], #44]\n\t" - "ldr r2, [%[a], #48]\n\t" - "ldr r3, [%[a], #52]\n\t" - "ldr r4, [%[a], #56]\n\t" - "ldr r5, [%[a], #60]\n\t" - "ldr r6, [%[b], #48]\n\t" - "ldr r7, [%[b], #52]\n\t" - "ldr r8, [%[b], #56]\n\t" - "ldr r9, [%[b], #60]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #48]\n\t" - "str r3, [%[a], #52]\n\t" - "str r4, [%[a], #56]\n\t" - "str r5, [%[a], #60]\n\t" - "ldr r2, [%[a], #64]\n\t" - "ldr r3, [%[a], #68]\n\t" - "ldr r4, [%[a], #72]\n\t" - "ldr r5, [%[a], #76]\n\t" - "ldr r6, [%[b], #64]\n\t" - "ldr r7, [%[b], #68]\n\t" - "ldr r8, [%[b], #72]\n\t" - "ldr r9, [%[b], #76]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #64]\n\t" - "str r3, [%[a], #68]\n\t" - "str r4, [%[a], #72]\n\t" - "str r5, [%[a], #76]\n\t" - "ldr r2, [%[a], #80]\n\t" - "ldr r3, [%[a], #84]\n\t" - "ldr r4, [%[a], #88]\n\t" - "ldr r5, [%[a], #92]\n\t" - "ldr r6, [%[b], #80]\n\t" - "ldr r7, [%[b], #84]\n\t" - "ldr r8, [%[b], #88]\n\t" - "ldr r9, [%[b], #92]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #80]\n\t" - "str r3, [%[a], #84]\n\t" - "str r4, [%[a], #88]\n\t" - "str r5, [%[a], #92]\n\t" - "ldr r2, [%[a], #96]\n\t" - "ldr r3, [%[a], #100]\n\t" - "ldr r4, [%[a], #104]\n\t" - "ldr r5, [%[a], #108]\n\t" - "ldr r6, [%[b], #96]\n\t" - "ldr r7, [%[b], #100]\n\t" - "ldr r8, [%[b], #104]\n\t" - "ldr r9, [%[b], #108]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #96]\n\t" - "str r3, [%[a], #100]\n\t" - "str r4, [%[a], #104]\n\t" - "str r5, [%[a], #108]\n\t" - "ldr r2, [%[a], #112]\n\t" - "ldr r3, [%[a], #116]\n\t" - "ldr r4, [%[a], #120]\n\t" - "ldr r5, [%[a], #124]\n\t" - "ldr r6, [%[b], #112]\n\t" - "ldr r7, [%[b], #116]\n\t" - "ldr r8, [%[b], #120]\n\t" - "ldr r9, [%[b], #124]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #112]\n\t" - "str r3, [%[a], #116]\n\t" - "str r4, [%[a], 
#120]\n\t" - "str r5, [%[a], #124]\n\t" - "ldr r2, [%[a], #128]\n\t" - "ldr r3, [%[a], #132]\n\t" - "ldr r4, [%[a], #136]\n\t" - "ldr r5, [%[a], #140]\n\t" - "ldr r6, [%[b], #128]\n\t" - "ldr r7, [%[b], #132]\n\t" - "ldr r8, [%[b], #136]\n\t" - "ldr r9, [%[b], #140]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #128]\n\t" - "str r3, [%[a], #132]\n\t" - "str r4, [%[a], #136]\n\t" - "str r5, [%[a], #140]\n\t" - "ldr r2, [%[a], #144]\n\t" - "ldr r3, [%[a], #148]\n\t" - "ldr r4, [%[a], #152]\n\t" - "ldr r5, [%[a], #156]\n\t" - "ldr r6, [%[b], #144]\n\t" - "ldr r7, [%[b], #148]\n\t" - "ldr r8, [%[b], #152]\n\t" - "ldr r9, [%[b], #156]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #144]\n\t" - "str r3, [%[a], #148]\n\t" - "str r4, [%[a], #152]\n\t" - "str r5, [%[a], #156]\n\t" - "ldr r2, [%[a], #160]\n\t" - "ldr r3, [%[a], #164]\n\t" - "ldr r4, [%[a], #168]\n\t" - "ldr r5, [%[a], #172]\n\t" - "ldr r6, [%[b], #160]\n\t" - "ldr r7, [%[b], #164]\n\t" - "ldr r8, [%[b], #168]\n\t" - "ldr r9, [%[b], #172]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #160]\n\t" - "str r3, [%[a], #164]\n\t" - "str r4, [%[a], #168]\n\t" - "str r5, [%[a], #172]\n\t" - "ldr r2, [%[a], #176]\n\t" - "ldr r3, [%[a], #180]\n\t" - "ldr r4, [%[a], #184]\n\t" - "ldr r5, [%[a], #188]\n\t" - "ldr r6, [%[b], #176]\n\t" - "ldr r7, [%[b], #180]\n\t" - "ldr r8, [%[b], #184]\n\t" - "ldr r9, [%[b], #188]\n\t" - "sbcs r2, r2, r6\n\t" - "sbcs r3, r3, r7\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "str r2, [%[a], #176]\n\t" - "str r3, [%[a], #180]\n\t" - "str r4, [%[a], #184]\n\t" - "str r5, [%[a], #188]\n\t" - "sbc %[c], r9, r9\n\t" - : [c] "+r" (c) - : [a] "r" (a), [b] "r" (b) - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" - ); - - return c; -} - -#endif /* WOLFSSL_SP_SMALL */ /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. 
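 * (A full 3072-bit m satisfies 2^3071 <= m < 2^3072, hence 2^3072 < 2*m,
 * so 2^3072 mod m is simply 2^3072 - m: one borrow-propagating subtraction.)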
* @@ -13055,8 +14558,11 @@ static int32_t sp_3072_cmp_48(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "sub r6, r6, #4\n\t" "bcc 1b\n\t" @@ -13074,384 +14580,528 @@ static int32_t sp_3072_cmp_48(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #184]\n\t" "ldr r5, [%[b], #184]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #180]\n\t" "ldr r5, [%[b], #180]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #176]\n\t" "ldr r5, [%[b], #176]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #172]\n\t" "ldr r5, [%[b], #172]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #168]\n\t" "ldr r5, [%[b], #168]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #164]\n\t" "ldr r5, [%[b], #164]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #160]\n\t" "ldr r5, [%[b], #160]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #156]\n\t" "ldr r5, [%[b], #156]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #152]\n\t" "ldr r5, [%[b], #152]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #148]\n\t" "ldr r5, [%[b], #148]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #144]\n\t" "ldr r5, [%[b], #144]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #140]\n\t" "ldr r5, [%[b], #140]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #136]\n\t" "ldr r5, [%[b], #136]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #132]\n\t" "ldr r5, [%[b], #132]\n\t" "and r4, r4, r3\n\t" "and r5, 
r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #128]\n\t" "ldr r5, [%[b], #128]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #124]\n\t" "ldr r5, [%[b], #124]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #120]\n\t" "ldr r5, [%[b], #120]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #116]\n\t" "ldr r5, [%[b], #116]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #112]\n\t" "ldr r5, [%[b], #112]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #108]\n\t" "ldr r5, [%[b], #108]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #104]\n\t" "ldr r5, [%[b], #104]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #100]\n\t" "ldr r5, [%[b], #100]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #96]\n\t" "ldr r5, [%[b], #96]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #92]\n\t" "ldr r5, [%[b], #92]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #88]\n\t" "ldr r5, [%[b], #88]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #84]\n\t" "ldr r5, [%[b], #84]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #80]\n\t" "ldr r5, [%[b], #80]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #76]\n\t" "ldr r5, [%[b], #76]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #72]\n\t" "ldr r5, [%[b], #72]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #68]\n\t" "ldr r5, [%[b], #68]\n\t" 
"and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #64]\n\t" "ldr r5, [%[b], #64]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #60]\n\t" "ldr r5, [%[b], #60]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #56]\n\t" "ldr r5, [%[b], #56]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #52]\n\t" "ldr r5, [%[b], #52]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #48]\n\t" "ldr r5, [%[b], #48]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #44]\n\t" "ldr r5, [%[b], #44]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #40]\n\t" "ldr r5, [%[b], #40]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #36]\n\t" "ldr r5, [%[b], #36]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #32]\n\t" "ldr r5, [%[b], #32]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #28]\n\t" "ldr r5, [%[b], #28]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[b], #24]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #20]\n\t" "ldr r5, [%[b], #20]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[b], #16]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[b], #12]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[b], #8]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #4]\n\t" "ldr r5, [%[b], 
#4]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #0]\n\t" "ldr r5, [%[b], #0]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "eor %[r], %[r], r3\n\t" : [r] "+r" (r) @@ -13481,6 +15131,7 @@ static WC_INLINE int sp_3072_div_48(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[47]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); for (i=47; i>=0; i--) { @@ -13589,9 +15240,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -13621,10 +15275,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -13732,9 +15382,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -13765,10 +15418,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -15323,7 +16972,7 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_3072_mask_96(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -15370,8 +17019,11 @@ static int32_t sp_3072_cmp_96(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "sub r6, r6, #4\n\t" "bcc 1b\n\t" @@ -15389,768 +17041,1056 @@ static int32_t sp_3072_cmp_96(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #376]\n\t" "ldr r5, [%[b], #376]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #372]\n\t" "ldr r5, [%[b], #372]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #368]\n\t" "ldr r5, [%[b], #368]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #364]\n\t" "ldr r5, [%[b], #364]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #360]\n\t" "ldr r5, [%[b], #360]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #356]\n\t" "ldr r5, [%[b], #356]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #352]\n\t" "ldr r5, [%[b], #352]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #348]\n\t" "ldr r5, [%[b], #348]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #344]\n\t" "ldr r5, [%[b], #344]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #340]\n\t" "ldr r5, [%[b], #340]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #336]\n\t" "ldr r5, [%[b], #336]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #332]\n\t" "ldr r5, [%[b], #332]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #328]\n\t" "ldr r5, [%[b], #328]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" 
"movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #324]\n\t" "ldr r5, [%[b], #324]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #320]\n\t" "ldr r5, [%[b], #320]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #316]\n\t" "ldr r5, [%[b], #316]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #312]\n\t" "ldr r5, [%[b], #312]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #308]\n\t" "ldr r5, [%[b], #308]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #304]\n\t" "ldr r5, [%[b], #304]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #300]\n\t" "ldr r5, [%[b], #300]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #296]\n\t" "ldr r5, [%[b], #296]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #292]\n\t" "ldr r5, [%[b], #292]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #288]\n\t" "ldr r5, [%[b], #288]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #284]\n\t" "ldr r5, [%[b], #284]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #280]\n\t" "ldr r5, [%[b], #280]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #276]\n\t" "ldr r5, [%[b], #276]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #272]\n\t" "ldr r5, [%[b], #272]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #268]\n\t" "ldr r5, [%[b], #268]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #264]\n\t" "ldr r5, [%[b], #264]\n\t" "and r4, r4, r3\n\t" "and r5, 
r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #260]\n\t" "ldr r5, [%[b], #260]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #256]\n\t" "ldr r5, [%[b], #256]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #252]\n\t" "ldr r5, [%[b], #252]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #248]\n\t" "ldr r5, [%[b], #248]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #244]\n\t" "ldr r5, [%[b], #244]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #240]\n\t" "ldr r5, [%[b], #240]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #236]\n\t" "ldr r5, [%[b], #236]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #232]\n\t" "ldr r5, [%[b], #232]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #228]\n\t" "ldr r5, [%[b], #228]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #224]\n\t" "ldr r5, [%[b], #224]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #220]\n\t" "ldr r5, [%[b], #220]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #216]\n\t" "ldr r5, [%[b], #216]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #212]\n\t" "ldr r5, [%[b], #212]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #208]\n\t" "ldr r5, [%[b], #208]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #204]\n\t" "ldr r5, [%[b], #204]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #200]\n\t" "ldr r5, 
[%[b], #200]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #196]\n\t" "ldr r5, [%[b], #196]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #192]\n\t" "ldr r5, [%[b], #192]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #188]\n\t" "ldr r5, [%[b], #188]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #184]\n\t" "ldr r5, [%[b], #184]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #180]\n\t" "ldr r5, [%[b], #180]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #176]\n\t" "ldr r5, [%[b], #176]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #172]\n\t" "ldr r5, [%[b], #172]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #168]\n\t" "ldr r5, [%[b], #168]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #164]\n\t" "ldr r5, [%[b], #164]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #160]\n\t" "ldr r5, [%[b], #160]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #156]\n\t" "ldr r5, [%[b], #156]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #152]\n\t" "ldr r5, [%[b], #152]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #148]\n\t" "ldr r5, [%[b], #148]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #144]\n\t" "ldr r5, [%[b], #144]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #140]\n\t" "ldr r5, [%[b], #140]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne 
r3, r7\n\t" "ldr r4, [%[a], #136]\n\t" "ldr r5, [%[b], #136]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #132]\n\t" "ldr r5, [%[b], #132]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #128]\n\t" "ldr r5, [%[b], #128]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #124]\n\t" "ldr r5, [%[b], #124]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #120]\n\t" "ldr r5, [%[b], #120]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #116]\n\t" "ldr r5, [%[b], #116]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #112]\n\t" "ldr r5, [%[b], #112]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #108]\n\t" "ldr r5, [%[b], #108]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #104]\n\t" "ldr r5, [%[b], #104]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #100]\n\t" "ldr r5, [%[b], #100]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #96]\n\t" "ldr r5, [%[b], #96]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #92]\n\t" "ldr r5, [%[b], #92]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #88]\n\t" "ldr r5, [%[b], #88]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #84]\n\t" "ldr r5, [%[b], #84]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #80]\n\t" "ldr r5, [%[b], #80]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #76]\n\t" "ldr r5, [%[b], #76]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo 
%[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #72]\n\t" "ldr r5, [%[b], #72]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #68]\n\t" "ldr r5, [%[b], #68]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #64]\n\t" "ldr r5, [%[b], #64]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #60]\n\t" "ldr r5, [%[b], #60]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #56]\n\t" "ldr r5, [%[b], #56]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #52]\n\t" "ldr r5, [%[b], #52]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #48]\n\t" "ldr r5, [%[b], #48]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #44]\n\t" "ldr r5, [%[b], #44]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #40]\n\t" "ldr r5, [%[b], #40]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #36]\n\t" "ldr r5, [%[b], #36]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #32]\n\t" "ldr r5, [%[b], #32]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #28]\n\t" "ldr r5, [%[b], #28]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[b], #24]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #20]\n\t" "ldr r5, [%[b], #20]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[b], #16]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[b], #12]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it 
lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[b], #8]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #4]\n\t" "ldr r5, [%[b], #4]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #0]\n\t" "ldr r5, [%[b], #0]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "eor %[r], %[r], r3\n\t" : [r] "+r" (r) @@ -16180,6 +18120,7 @@ static WC_INLINE int sp_3072_div_96(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); for (i=95; i>=0; i--) { @@ -16231,6 +18172,7 @@ static WC_INLINE int sp_3072_div_96_cond(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[95]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); for (i=95; i>=0; i--) { @@ -16342,9 +18284,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -16374,10 +18319,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -16485,9 +18426,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -16518,10 +18462,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -16836,7 +18776,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 96; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -16893,6 +18833,699 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +static void sp_3072_lshift_96(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "ldr r3, [%[a], #380]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #376]\n\t" + "str r4, [%[r], #384]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #372]\n\t" + "str r3, [%[r], #380]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #368]\n\t" + "str r2, [%[r], #376]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, 
r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #364]\n\t" + "str r4, [%[r], #372]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #360]\n\t" + "str r3, [%[r], #368]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #356]\n\t" + "str r2, [%[r], #364]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #352]\n\t" + "str r4, [%[r], #360]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #348]\n\t" + "str r3, [%[r], #356]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #344]\n\t" + "str r2, [%[r], #352]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #340]\n\t" + "str r4, [%[r], #348]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #336]\n\t" + "str r3, [%[r], #344]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #332]\n\t" + "str r2, [%[r], #340]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #328]\n\t" + "str r4, [%[r], #336]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #324]\n\t" + "str r3, [%[r], #332]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #320]\n\t" + "str r2, [%[r], #328]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #316]\n\t" + "str r4, [%[r], #324]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #312]\n\t" + "str r3, [%[r], #320]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #308]\n\t" + "str r2, [%[r], #316]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #304]\n\t" + "str r4, [%[r], #312]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #300]\n\t" + "str r3, [%[r], #308]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #296]\n\t" + "str r2, [%[r], #304]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #292]\n\t" + "str r4, [%[r], #300]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #288]\n\t" + "str r3, [%[r], #296]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #284]\n\t" + "str r2, [%[r], #292]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #280]\n\t" + "str r4, [%[r], #288]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #276]\n\t" + "str r3, [%[r], #284]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, 
r5\n\t" + "ldr r3, [%[a], #272]\n\t" + "str r2, [%[r], #280]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #268]\n\t" + "str r4, [%[r], #276]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #264]\n\t" + "str r3, [%[r], #272]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #260]\n\t" + "str r2, [%[r], #268]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #256]\n\t" + "str r4, [%[r], #264]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #252]\n\t" + "str r3, [%[r], #260]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #248]\n\t" + "str r2, [%[r], #256]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #244]\n\t" + "str r4, [%[r], #252]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #240]\n\t" + "str r3, [%[r], #248]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #236]\n\t" + "str r2, [%[r], #244]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #232]\n\t" + "str r4, [%[r], #240]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #228]\n\t" + "str r3, [%[r], #236]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #224]\n\t" + "str r2, [%[r], #232]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #220]\n\t" + "str r4, [%[r], #228]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #216]\n\t" + "str r3, [%[r], #224]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #212]\n\t" + "str r2, [%[r], #220]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #208]\n\t" + "str r4, [%[r], #216]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #204]\n\t" + "str r3, [%[r], #212]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #200]\n\t" + "str r2, [%[r], #208]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #196]\n\t" + "str r4, [%[r], #204]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #192]\n\t" + "str r3, [%[r], #200]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #188]\n\t" + "str r2, [%[r], #196]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #184]\n\t" + "str r4, [%[r], #192]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #180]\n\t" + "str r3, [%[r], 
#188]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #176]\n\t" + "str r2, [%[r], #184]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #172]\n\t" + "str r4, [%[r], #180]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #168]\n\t" + "str r3, [%[r], #176]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #164]\n\t" + "str r2, [%[r], #172]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #160]\n\t" + "str r4, [%[r], #168]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #156]\n\t" + "str r3, [%[r], #164]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #152]\n\t" + "str r2, [%[r], #160]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #148]\n\t" + "str r4, [%[r], #156]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #144]\n\t" + "str r3, [%[r], #152]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #140]\n\t" + "str r2, [%[r], #148]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #136]\n\t" + "str r4, [%[r], #144]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #132]\n\t" + "str r3, [%[r], #140]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #128]\n\t" + "str r2, [%[r], #136]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #124]\n\t" + "str r4, [%[r], #132]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #120]\n\t" + "str r3, [%[r], #128]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #116]\n\t" + "str r2, [%[r], #124]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #112]\n\t" + "str r4, [%[r], #120]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #108]\n\t" + "str r3, [%[r], #116]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #104]\n\t" + "str r2, [%[r], #112]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #100]\n\t" + "str r4, [%[r], #108]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #96]\n\t" + "str r3, [%[r], #104]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #92]\n\t" + "str r2, [%[r], #100]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #88]\n\t" + "str r4, [%[r], #96]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + 
"lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #84]\n\t" + "str r3, [%[r], #92]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #80]\n\t" + "str r2, [%[r], #88]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #76]\n\t" + "str r4, [%[r], #84]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #72]\n\t" + "str r3, [%[r], #80]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #68]\n\t" + "str r2, [%[r], #76]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #64]\n\t" + "str r4, [%[r], #72]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "str r4, [%[r]]\n\t" + "str r2, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. 
(r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_2_96(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[192]; + sp_digit td[97]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 289, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 192; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_3072_lshift_96(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_lshift_96(r, r, y); + sp_3072_mul_d_96(tmp, norm, r[96]); + r[96] = 0; + o = sp_3072_add_96(r, r, tmp); + sp_3072_cond_sub_96(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
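A note on sp_3072_mod_exp_2_96() above: the exponent is scanned left to
right in 5-bit windows, with the first window trimmed to the exponent's
actual bit length so that every later window is full. This is the same
start-up the reworked generic mod_exp paths now use, and it is why the
old post-loop fix-up tail could be removed. With base 2, the per-window
multiply by 2^y collapses to sp_3072_lshift_96() by y bits; the word
shifted out into r[96] is folded back modulo m with one
sp_3072_mul_d_96() against the normalizer plus a conditional subtract.
A minimal sketch of the same windowing on ordinary 64-bit integers (toy
code, not part of the patch; the name mod_exp_2_toy is hypothetical, and
it assumes e != 0 and m < 2^32 so all products fit in 64 bits):

#include <stdint.h>

/* 2^e mod m, left-to-right 5-bit windows: five squarings per window,
 * then "multiply by 2^y" done as a shift followed by one reduction. */
static uint64_t mod_exp_2_toy(uint32_t e, uint64_t m)
{
    int bits = 32 - __builtin_clz(e);  /* number of significant bits */
    int c = bits % 5;                  /* first window may be short */
    uint64_t r;

    if (c == 0)
        c = 5;
    bits -= c;
    r = (1ULL << (e >> bits)) % m;     /* seed r with the top window */
    while (bits > 0) {
        int i, y;
        bits -= 5;
        y = (e >> bits) & 0x1f;        /* next five exponent bits */
        for (i = 0; i < 5; i++)
            r = (r * r) % m;           /* r = r^32 mod m (m < 2^32) */
        r = (r << y) % m;              /* base-2 multiply is a shift */
    }
    return r;
}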
@@ -16923,7 +19556,13 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 96, exp, expLen); sp_3072_from_mp(m, 96, mod); - err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && m[95] == (sp_digit)-1) + err = sp_3072_mod_exp_2_96(r, e, expLen * 8, m); + else + #endif + err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -17352,14 +19991,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -17453,7 +20092,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 8; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -17515,8 +20154,11 @@ static int32_t sp_256_cmp_8(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "sub r6, r6, #4\n\t" "bcc 1b\n\t" @@ -17534,64 +20176,88 @@ static int32_t sp_256_cmp_8(sp_digit* a, sp_digit* b) "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[b], #24]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #20]\n\t" "ldr r5, [%[b], #20]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[b], #16]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[b], #12]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[b], #8]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #4]\n\t" "ldr r5, [%[b], #4]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "ldr r4, [%[a], #0]\n\t" "ldr r5, [%[b], #0]\n\t" "and r4, r4, r3\n\t" "and r5, r5, r3\n\t" "subs r4, r4, r5\n\t" + "it hi\n\t" "movhi %[r], %[one]\n\t" + "it lo\n\t" "movlo %[r], r3\n\t" + "it ne\n\t" "movne r3, r7\n\t" "eor %[r], %[r], r3\n\t" : [r] "+r" (r) @@ -23605,7 +26271,7 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_256_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -23642,6 +26308,7 @@ static WC_INLINE int sp_256_div_8(sp_digit* a, sp_digit* d, sp_digit* m, (void)m; + div = d[7]; XMEMCPY(t1, a, sizeof(*t1) * 2 * 8); for (i=7; i>=0; i--) { diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 94bbd9498..30ee20a0d 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -102,14 +102,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a) s = 64 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 64 <= DIGIT_BIT) { s += 64; r[j] &= 0xffffffffffffffffl; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -1077,7 +1077,7 @@ static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1102,7 +1102,7 @@ static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -1134,7 +1134,7 @@ static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[16]; @@ -1362,7 +1362,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1391,7 +1391,7 @@ static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -1423,7 +1423,7 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[32]; @@ -1645,7 +1645,7 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -3101,9 +3101,12 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 60; - n <<= 4; - c = 60; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 4; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 16); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -3133,10 +3136,6 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_16(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_16(r, r, m, mp); - sp_2048_mont_mul_16(r, r, t[y], m, mp); XMEMSET(&r[16], 0, sizeof(sp_digit) * 16); sp_2048_mont_reduce_16(r, m, mp); @@ -3244,9 +3243,12 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 16); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -3277,10 +3279,6 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_16(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_16(r, r, m, mp); - sp_2048_mont_mul_16(r, r, t[y], m, mp); XMEMSET(&r[16], 0, sizeof(sp_digit) * 16); sp_2048_mont_reduce_16(r, m, mp); @@ -3964,7 +3962,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -4467,9 +4465,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 60; - n <<= 4; - c = 60; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 4; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -4499,10 +4500,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -4610,9 +4607,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -4643,10 +4643,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -4959,7 +4955,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 32; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 64 >= DIGIT_BIT) { - #if DIGIT_BIT < 64 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -5016,6 +5012,316 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef 
WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +static void sp_2048_lshift_32(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov x6, 63\n\t" + "sub x6, x6, %[n]\n\t" + "ldr x3, [%[a], 248]\n\t" + "lsr x4, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x4, x4, x6\n\t" + "ldr x2, [%[a], 240]\n\t" + "str x4, [%[r], 256]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 232]\n\t" + "str x3, [%[r], 248]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 224]\n\t" + "str x2, [%[r], 240]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 216]\n\t" + "str x4, [%[r], 232]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 208]\n\t" + "str x3, [%[r], 224]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 200]\n\t" + "str x2, [%[r], 216]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 192]\n\t" + "str x4, [%[r], 208]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 184]\n\t" + "str x3, [%[r], 200]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 176]\n\t" + "str x2, [%[r], 192]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 168]\n\t" + "str x4, [%[r], 184]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 160]\n\t" + "str x3, [%[r], 176]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 152]\n\t" + "str x2, [%[r], 168]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 144]\n\t" + "str x4, [%[r], 160]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 136]\n\t" + "str x3, [%[r], 152]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 128]\n\t" + "str x2, [%[r], 144]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 120]\n\t" + "str x4, [%[r], 136]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 112]\n\t" + "str x3, [%[r], 128]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 104]\n\t" + "str x2, [%[r], 120]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 96]\n\t" + "str x4, [%[r], 112]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 88]\n\t" + "str x3, [%[r], 104]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 80]\n\t" + "str x2, [%[r], 96]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 72]\n\t" + "str x4, [%[r], 88]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + 
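/* Same pattern for the remaining words of a: shift each word left
+         * by n and OR in the top bits of the word below, computed as
+         * (w >> 1) >> (63 - n) so that n == 0 never shifts by 64. */ +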
"ldr x4, [%[a], 64]\n\t" + "str x3, [%[r], 80]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 56]\n\t" + "str x2, [%[r], 72]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 48]\n\t" + "str x4, [%[r], 64]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 40]\n\t" + "str x3, [%[r], 56]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 32]\n\t" + "str x2, [%[r], 48]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 24]\n\t" + "str x4, [%[r], 40]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 16]\n\t" + "str x3, [%[r], 32]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 8]\n\t" + "str x2, [%[r], 24]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 0]\n\t" + "str x4, [%[r], 16]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "str x2, [%[r]]\n\t" + "str x3, [%[r], 8]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "x2", "x3", "x4", "x5", "x6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_2_32(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[64]; + sp_digit td[33]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 97, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 64; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_32(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_2048_lshift_32(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + + sp_2048_lshift_32(r, r, y); + sp_2048_mul_d_32(tmp, norm, r[32]); + r[32] = 0; + o = sp_2048_add_32(r, r, tmp); + sp_2048_cond_sub_32(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); + sp_2048_mont_reduce_32(r, m, mp); + + mask = 0 - (sp_2048_cmp_32(r, m) >= 0); + sp_2048_cond_sub_32(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -5046,7 +5352,13 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 32, exp, expLen); sp_2048_from_mp(m, 32, mod); - err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && m[31] == (sp_digit)-1) + err = sp_2048_mod_exp_2_32(r, e, expLen * 8, m); + else + #endif + err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -5164,14 +5476,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a) s = 64 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 64 <= DIGIT_BIT) { s += 64; r[j] &= 0xffffffffffffffffl; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -7261,7 +7573,7 @@ static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_12(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -7290,7 +7602,7 @@ static void sp_3072_mask_12(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -7322,7 +7634,7 @@ static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. 
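*
* SP_NOINLINE prevents this large Karatsuba body, with its z1 and z2
* temporaries, from being inlined into callers.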
*/ -static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[24]; @@ -7630,7 +7942,7 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -7659,7 +7971,7 @@ static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. */ -static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; @@ -7691,7 +8003,7 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; sp_digit z2[48]; @@ -7913,7 +8225,7 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -9769,9 +10081,12 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 60; - n <<= 4; - c = 60; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 4; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 24); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -9801,10 +10116,6 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_24(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_24(r, r, m, mp); - sp_3072_mont_mul_24(r, r, t[y], m, mp); XMEMSET(&r[24], 0, sizeof(sp_digit) * 24); sp_3072_mont_reduce_24(r, m, mp); @@ -9912,9 +10223,12 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 24); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -9945,10 +10259,6 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_24(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_24(r, r, m, mp); - sp_3072_mont_mul_24(r, r, t[y], m, mp); XMEMSET(&r[24], 0, sizeof(sp_digit) * 24); sp_3072_mont_reduce_24(r, m, mp); @@ -10872,7 +11182,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -11503,9 +11813,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 60; - n <<= 4; - c = 60; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 4; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -11535,10 +11848,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -11646,9 +11955,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -11679,10 +11991,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -11995,7 +12303,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 48; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 64 >= DIGIT_BIT) { - #if DIGIT_BIT < 64 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -12052,6 +12360,412 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +static void sp_3072_lshift_48(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov x6, 63\n\t" + "sub x6, x6, %[n]\n\t" + "ldr x3, [%[a], 376]\n\t" + "lsr x4, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x4, x4, x6\n\t" + "ldr x2, [%[a], 368]\n\t" + "str x4, [%[r], 384]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 360]\n\t" + "str x3, [%[r], 376]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 352]\n\t" + "str x2, [%[r], 368]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 344]\n\t" + "str x4, [%[r], 360]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 336]\n\t" + "str x3, [%[r], 352]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 328]\n\t" + "str x2, [%[r], 344]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 320]\n\t" + "str x4, [%[r], 336]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 312]\n\t" + "str x3, [%[r], 328]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 304]\n\t" + "str x2, [%[r], 320]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 
296]\n\t" + "str x4, [%[r], 312]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 288]\n\t" + "str x3, [%[r], 304]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 280]\n\t" + "str x2, [%[r], 296]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 272]\n\t" + "str x4, [%[r], 288]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 264]\n\t" + "str x3, [%[r], 280]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 256]\n\t" + "str x2, [%[r], 272]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 248]\n\t" + "str x4, [%[r], 264]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 240]\n\t" + "str x3, [%[r], 256]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 232]\n\t" + "str x2, [%[r], 248]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 224]\n\t" + "str x4, [%[r], 240]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 216]\n\t" + "str x3, [%[r], 232]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 208]\n\t" + "str x2, [%[r], 224]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 200]\n\t" + "str x4, [%[r], 216]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 192]\n\t" + "str x3, [%[r], 208]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 184]\n\t" + "str x2, [%[r], 200]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 176]\n\t" + "str x4, [%[r], 192]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 168]\n\t" + "str x3, [%[r], 184]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 160]\n\t" + "str x2, [%[r], 176]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 152]\n\t" + "str x4, [%[r], 168]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 144]\n\t" + "str x3, [%[r], 160]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 136]\n\t" + "str x2, [%[r], 152]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 128]\n\t" + "str x4, [%[r], 144]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 120]\n\t" + "str x3, [%[r], 136]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 112]\n\t" + "str x2, [%[r], 128]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, 
x5\n\t" + "ldr x2, [%[a], 104]\n\t" + "str x4, [%[r], 120]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 96]\n\t" + "str x3, [%[r], 112]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 88]\n\t" + "str x2, [%[r], 104]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 80]\n\t" + "str x4, [%[r], 96]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 72]\n\t" + "str x3, [%[r], 88]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 64]\n\t" + "str x2, [%[r], 80]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 56]\n\t" + "str x4, [%[r], 72]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 48]\n\t" + "str x3, [%[r], 64]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 40]\n\t" + "str x2, [%[r], 56]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 32]\n\t" + "str x4, [%[r], 48]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 24]\n\t" + "str x3, [%[r], 40]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "ldr x3, [%[a], 16]\n\t" + "str x2, [%[r], 32]\n\t" + "lsr x5, x3, 1\n\t" + "lsl x3, x3, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x4, x4, x5\n\t" + "ldr x2, [%[a], 8]\n\t" + "str x4, [%[r], 24]\n\t" + "lsr x5, x2, 1\n\t" + "lsl x2, x2, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x3, x3, x5\n\t" + "ldr x4, [%[a], 0]\n\t" + "str x3, [%[r], 16]\n\t" + "lsr x5, x4, 1\n\t" + "lsl x4, x4, %[n]\n\t" + "lsr x5, x5, x6\n\t" + "orr x2, x2, x5\n\t" + "str x4, [%[r]]\n\t" + "str x2, [%[r], 8]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "x2", "x3", "x4", "x5", "x6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_2_48(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[96]; + sp_digit td[49]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 145, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 96; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_48(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_3072_lshift_48(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + + sp_3072_lshift_48(r, r, y); + sp_3072_mul_d_48(tmp, norm, r[48]); + r[48] = 0; + o = sp_3072_add_48(r, r, tmp); + sp_3072_cond_sub_48(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); + sp_3072_mont_reduce_48(r, m, mp); + + mask = 0 - (sp_3072_cmp_48(r, m) >= 0); + sp_3072_cond_sub_48(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -12082,7 +12796,13 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 48, exp, expLen); sp_3072_from_mp(m, 48, mod); - err = sp_3072_mod_exp_48(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && m[47] == (sp_digit)-1) + err = sp_3072_mod_exp_2_48(r, e, expLen * 8, m); + else + #endif + err = sp_3072_mod_exp_48(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -12345,14 +13065,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a) s = 64 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 64 <= DIGIT_BIT) { s += 64; r[j] &= 0xffffffffffffffffl; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -12446,7 +13166,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 4; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 64 >= DIGIT_BIT) { - #if DIGIT_BIT < 64 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -28440,7 +29160,7 @@ static sp_digit div_256_word_4(sp_digit d1, sp_digit d0, sp_digit div) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_256_mask_4(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_256_mask_4(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index d31397524..c4052b263 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -102,14 +102,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -657,7 +657,7 @@ SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1041,7 +1041,7 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1698,7 +1698,7 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2131,7 +2131,7 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -3162,9 +3162,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -3194,10 +3197,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -3305,9 +3304,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -3338,10 +3340,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -3738,7 +3736,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. 
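*
* Under WOLFSSL_SP_SMALL this is a simple loop to save code size; otherwise
* all 64 word ANDs are unrolled.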
*/ -static void sp_2048_mask_64(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -3983,9 +3981,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -4015,10 +4016,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -4126,9 +4123,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -4159,10 +4159,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -4475,7 +4471,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 64; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -4532,6 +4528,515 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +static void sp_2048_lshift_64(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "add %[a], %[a], #192\n\t" + "add %[r], %[r], #192\n\t" + "ldr r3, [%[a], #60]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, 
r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, 
r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r3, [%[a], #60]\n\t" + "str r2, [%[r], #68]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], 
#28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "str r3, [%[r]]\n\t" + "str r4, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_2048_mod_exp_2_64(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[128]; + sp_digit td[65]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 193, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 128; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_2048_lshift_64(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_lshift_64(r, r, y); + sp_2048_mul_d_64(tmp, norm, r[64]); + r[64] = 0; + o = sp_2048_add_64(r, r, tmp); + sp_2048_cond_sub_64(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
@@ -4562,7 +5067,13 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 64, exp, expLen); sp_2048_from_mp(m, 64, mod); - err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && m[63] == (sp_digit)-1) + err = sp_2048_mod_exp_2_64(r, e, expLen * 8, m); + else + #endif + err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -4680,14 +5191,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -4762,10 +5273,10 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit tmp[8 * 2]; + sp_digit tmp[12 * 2]; __asm__ __volatile__ ( "mov r3, #0\n\t" "mov r4, #0\n\t" @@ -4773,13 +5284,13 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, "mov r11, %[r]\n\t" "mov r9, %[a]\n\t" "mov r10, %[b]\n\t" - "mov r6, #32\n\t" + "mov r6, #48\n\t" "add r6, r9\n\t" "mov r12, r6\n\t" "\n1:\n\t" "mov %[r], #0\n\t" "mov r5, #0\n\t" - "mov r6, #28\n\t" + "mov r6, #44\n\t" "mov %[a], r8\n\t" "sub %[a], r6\n\t" "sbc r6, r6\n\t" @@ -4842,7 +5353,7 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, "mov r4, r5\n\t" "add r7, #4\n\t" "mov r8, r7\n\t" - "mov r6, #56\n\t" + "mov r6, #88\n\t" "cmp r7, r6\n\t" "ble 1b\n\t" "str r3, [%[r], r7]\n\t" @@ -4861,7 +5372,7 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -4869,14 +5380,14 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "mov r5, #0\n\t" "mov r8, r3\n\t" "mov r11, %[r]\n\t" - "mov r6, #64\n\t" + "mov r6, #96\n\t" "neg r6, r6\n\t" "add sp, r6\n\t" "mov r10, sp\n\t" "mov r9, %[a]\n\t" "\n1:\n\t" "mov %[r], #0\n\t" - "mov r6, #28\n\t" + "mov r6, #44\n\t" "mov %[a], r8\n\t" "sub %[a], r6\n\t" "sbc r6, r6\n\t" @@ -4964,7 +5475,7 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "\n5:\n\t" "add %[a], #4\n\t" "sub r2, #4\n\t" - "mov r6, #32\n\t" + "mov r6, #48\n\t" "add r6, r9\n\t" "cmp %[a], r6\n\t" "beq 3f\n\t" @@ -4983,20 +5494,20 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) "mov r5, #0\n\t" "add r7, #4\n\t" "mov r8, r7\n\t" - "mov r6, #56\n\t" + "mov r6, #88\n\t" "cmp r7, r6\n\t" "ble 1b\n\t" "mov %[a], r9\n\t" "str r3, [%[r], r7]\n\t" "mov %[r], r11\n\t" "mov %[a], r10\n\t" - "mov r3, #60\n\t" + "mov r3, #92\n\t" "\n4:\n\t" "ldr r6, [%[a], r3]\n\t" "str r6, [%[r], r3]\n\t" "sub r3, #4\n\t" "bge 4b\n\t" - "mov r6, #64\n\t" + "mov r6, #96\n\t" "add sp, r6\n\t" : : [r] "r" (r), [a] "r" (a) @@ -5010,7 +5521,7 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
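*
* Returns the carry out of the top word: the first word pair is added with
* add, the rest with adc, and a final adc into a zeroed register leaves 0
* or 1 in c.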
*/ -SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -5048,6 +5559,22 @@ SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, "ldr r5, [%[b], #28]\n\t" "adc r4, r5\n\t" "str r4, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[b], #32]\n\t" + "adc r4, r5\n\t" + "str r4, [%[r], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #36]\n\t" + "adc r4, r5\n\t" + "str r4, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[b], #40]\n\t" + "adc r4, r5\n\t" + "str r4, [%[r], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #44]\n\t" + "adc r4, r5\n\t" + "str r4, [%[r], #44]\n\t" "mov %[c], #0\n\t" "adc %[c], %[c]\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -5064,7 +5591,7 @@ SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -5134,6 +5661,38 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, "sbc r4, r6\n\t" "str r3, [%[a], #56]\n\t" "str r4, [%[a], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[b], #64]\n\t" + "ldr r6, [%[b], #68]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #64]\n\t" + "str r4, [%[a], #68]\n\t" + "ldr r3, [%[a], #72]\n\t" + "ldr r4, [%[a], #76]\n\t" + "ldr r5, [%[b], #72]\n\t" + "ldr r6, [%[b], #76]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #72]\n\t" + "str r4, [%[a], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[b], #80]\n\t" + "ldr r6, [%[b], #84]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #80]\n\t" + "str r4, [%[a], #84]\n\t" + "ldr r3, [%[a], #88]\n\t" + "ldr r4, [%[a], #92]\n\t" + "ldr r5, [%[b], #88]\n\t" + "ldr r6, [%[b], #92]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #88]\n\t" + "str r4, [%[a], #92]\n\t" "sbc %[c], %[c]\n\t" : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) : @@ -5149,327 +5708,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r4, [%[a], #0]\n\t" - "ldr r5, [%[b], #0]\n\t" - "add r4, r5\n\t" - "str r4, [%[r], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #4]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[b], #8]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #12]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[b], #16]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #20]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #20]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[b], #24]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #28]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[b], #32]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #36]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #36]\n\t" - "ldr r4, [%[a], #40]\n\t" - "ldr r5, [%[b], #40]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #44]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[b], #48]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #52]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #52]\n\t" - "ldr r4, [%[a], #56]\n\t" - "ldr r5, [%[b], #56]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #60]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #60]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r4", "r5" - ); - - return c; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. - * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_8(sp_digit* r, sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) - r[i] = a[i] & m; -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u, ca, cb; - - ca = sp_3072_add_8(a1, a, &a[8]); - cb = sp_3072_add_8(b1, b, &b[8]); - u = ca & cb; - sp_3072_mul_8(z1, a1, b1); - sp_3072_mul_8(z2, &a[8], &b[8]); - sp_3072_mul_8(z0, a, b); - sp_3072_mask_8(r + 16, a1, 0 - cb); - sp_3072_mask_8(b1, b1, 0 - ca); - u += sp_3072_add_8(r + 16, r + 16, b1); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_3072_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_3072_add_8(a1, a, &a[8]); - sp_3072_sqr_8(z1, a1); - sp_3072_sqr_8(z2, &a[8]); - sp_3072_sqr_8(z0, a); - sp_3072_mask_8(r + 16, a1, 0 - u); - u += sp_3072_add_8(r + 16, r + 16, r + 16); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r4, [%[a], #0]\n\t" - "ldr r5, [%[a], #4]\n\t" - "ldr r6, [%[b], #0]\n\t" - "ldr r7, [%[b], #4]\n\t" - "sub r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" - "ldr r6, [%[b], #8]\n\t" - "ldr r7, [%[b], #12]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" - "ldr r6, [%[b], #16]\n\t" - "ldr r7, [%[b], #20]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #28]\n\t" - "ldr r6, [%[b], #24]\n\t" - "ldr r7, [%[b], #28]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #24]\n\t" - "str r5, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[a], #36]\n\t" - "ldr r6, [%[b], #32]\n\t" - "ldr r7, [%[b], #36]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #32]\n\t" - "str r5, [%[r], #36]\n\t" - "ldr r4, [%[a], #40]\n\t" - "ldr r5, [%[a], #44]\n\t" - "ldr r6, [%[b], #40]\n\t" - "ldr r7, [%[b], #44]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #40]\n\t" - "str r5, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[a], #52]\n\t" - "ldr r6, [%[b], #48]\n\t" - "ldr r7, [%[b], #52]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #48]\n\t" - "str r5, [%[r], #52]\n\t" - "ldr r4, [%[a], #56]\n\t" - "ldr r5, [%[a], #60]\n\t" - "ldr r6, [%[b], #56]\n\t" - "ldr r7, [%[b], #60]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #56]\n\t" - "str r5, [%[r], #60]\n\t" - "ldr r4, [%[a], #64]\n\t" - "ldr r5, [%[a], #68]\n\t" - "ldr r6, [%[b], #64]\n\t" - "ldr r7, [%[b], #68]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #64]\n\t" - "str r5, [%[r], #68]\n\t" - "ldr r4, [%[a], #72]\n\t" - "ldr r5, [%[a], #76]\n\t" - "ldr r6, [%[b], #72]\n\t" - "ldr r7, [%[b], #76]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #72]\n\t" - "str r5, [%[r], #76]\n\t" - "ldr r4, [%[a], #80]\n\t" - "ldr r5, [%[a], #84]\n\t" - "ldr r6, [%[b], #80]\n\t" - "ldr r7, [%[b], #84]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #80]\n\t" - "str r5, [%[r], #84]\n\t" - "ldr r4, [%[a], #88]\n\t" - "ldr r5, [%[a], #92]\n\t" - "ldr r6, [%[b], #88]\n\t" - "ldr r7, [%[b], #92]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #88]\n\t" - "str r5, [%[r], #92]\n\t" - "ldr r4, [%[a], #96]\n\t" - "ldr r5, [%[a], #100]\n\t" - "ldr r6, [%[b], #96]\n\t" - "ldr r7, [%[b], #100]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #96]\n\t" - "str r5, [%[r], 
#100]\n\t" - "ldr r4, [%[a], #104]\n\t" - "ldr r5, [%[a], #108]\n\t" - "ldr r6, [%[b], #104]\n\t" - "ldr r7, [%[b], #108]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #104]\n\t" - "str r5, [%[r], #108]\n\t" - "ldr r4, [%[a], #112]\n\t" - "ldr r5, [%[a], #116]\n\t" - "ldr r6, [%[b], #112]\n\t" - "ldr r7, [%[b], #116]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #112]\n\t" - "str r5, [%[r], #116]\n\t" - "ldr r4, [%[a], #120]\n\t" - "ldr r5, [%[a], #124]\n\t" - "ldr r6, [%[b], #120]\n\t" - "ldr r7, [%[b], #124]\n\t" - "sbc r4, r6\n\t" - "sbc r5, r7\n\t" - "str r4, [%[r], #120]\n\t" - "str r5, [%[r], #124]\n\t" - "sbc %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r4", "r5", "r6", "r7" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -5571,38 +5810,6 @@ SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, "ldr r5, [%[b], #92]\n\t" "adc r4, r5\n\t" "str r4, [%[r], #92]\n\t" - "ldr r4, [%[a], #96]\n\t" - "ldr r5, [%[b], #96]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #96]\n\t" - "ldr r4, [%[a], #100]\n\t" - "ldr r5, [%[b], #100]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #100]\n\t" - "ldr r4, [%[a], #104]\n\t" - "ldr r5, [%[b], #104]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #104]\n\t" - "ldr r4, [%[a], #108]\n\t" - "ldr r5, [%[b], #108]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #108]\n\t" - "ldr r4, [%[a], #112]\n\t" - "ldr r5, [%[b], #112]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #112]\n\t" - "ldr r4, [%[a], #116]\n\t" - "ldr r5, [%[b], #116]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #116]\n\t" - "ldr r4, [%[a], #120]\n\t" - "ldr r5, [%[b], #120]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #120]\n\t" - "ldr r4, [%[a], #124]\n\t" - "ldr r5, [%[b], #124]\n\t" - "adc r4, r5\n\t" - "str r4, [%[r], #124]\n\t" "mov %[c], #0\n\t" "adc %[c], %[c]\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -5613,96 +5820,311 @@ SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<12; i++) + r[i] = a[i] & m; +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; + r[8] = a[8] & m; + r[9] = a[9] & m; + r[10] = a[10] & m; + r[11] = a[11] & m; +#endif +} + /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_digit b0[16]; - sp_digit b1[16]; - sp_digit b2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(b0, b, &b[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(b1, &b[16], &b[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_add_16(b2, b0, &b[32]); - sp_3072_mul_16(p0, a, b); - sp_3072_mul_16(p2, &a[16], &b[16]); - sp_3072_mul_16(p4, &a[32], &b[32]); - sp_3072_mul_16(p1, a0, b0); - sp_3072_mul_16(p3, a1, b1); - sp_3072_mul_16(p5, a2, b2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit z2[24]; + sp_digit u, ca, cb; + + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + sp_3072_mul_12(z1, a1, b1); + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mask_12(r + 24, a1, 0 - cb); + sp_3072_mask_12(b1, b1, 0 - ca); + u += sp_3072_add_12(r + 24, r + 24, b1); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); } -/* Square a into r. (r = a * a) +/* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
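+ *
+ * Squaring variant of the same Karatsuba split (sketch; a_lo/a_hi are
+ * illustrative names for the two 12-word halves):
+ *   a^2 = z2 * 2^768 + ((a_lo + a_hi)^2 - z0 - z2) * 2^384 + z0
+ * with z0 = a_lo^2 and z2 = a_hi^2; the carry u from the half-size
+ * addition is folded back in via the mask and the doubling add at r + 24.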
*/ -SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_sqr_16(p0, a); - sp_3072_sqr_16(p2, &a[16]); - sp_3072_sqr_16(p4, &a[32]); - sp_3072_sqr_16(p1, a0); - sp_3072_sqr_16(p3, a1); - sp_3072_sqr_16(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z2[24]; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit u; + + u = sp_3072_add_12(a1, a, &a[12]); + sp_3072_sqr_12(z1, a1); + sp_3072_sqr_12(z2, &a[12]); + sp_3072_sqr_12(z0, a); + sp_3072_mask_12(r + 24, a1, 0 - u); + u += sp_3072_add_12(r + 24, r + 24, r + 24); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b], #0]\n\t" + "ldr r6, [%[b], #4]\n\t" + "sub r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #0]\n\t" + "str r4, [%[a], #4]\n\t" + "ldr r3, [%[a], #8]\n\t" + "ldr r4, [%[a], #12]\n\t" + "ldr r5, [%[b], #8]\n\t" + "ldr r6, [%[b], #12]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #8]\n\t" + "str r4, [%[a], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[b], #16]\n\t" + "ldr r6, [%[b], #20]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #16]\n\t" + "str r4, [%[a], #20]\n\t" + "ldr r3, [%[a], #24]\n\t" + "ldr r4, [%[a], #28]\n\t" + "ldr r5, [%[b], #24]\n\t" + "ldr r6, [%[b], #28]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #24]\n\t" + "str r4, [%[a], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #32]\n\t" + "ldr r6, [%[b], #36]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #32]\n\t" + "str r4, [%[a], #36]\n\t" + "ldr r3, [%[a], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #40]\n\t" + "ldr r6, [%[b], #44]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #40]\n\t" + "str r4, [%[a], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[b], #48]\n\t" + "ldr r6, [%[b], #52]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #48]\n\t" + "str r4, [%[a], #52]\n\t" + "ldr r3, [%[a], #56]\n\t" + "ldr r4, [%[a], #60]\n\t" + "ldr r5, [%[b], #56]\n\t" + "ldr r6, [%[b], #60]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #56]\n\t" + "str r4, [%[a], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr 
r5, [%[b], #64]\n\t" + "ldr r6, [%[b], #68]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #64]\n\t" + "str r4, [%[a], #68]\n\t" + "ldr r3, [%[a], #72]\n\t" + "ldr r4, [%[a], #76]\n\t" + "ldr r5, [%[b], #72]\n\t" + "ldr r6, [%[b], #76]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #72]\n\t" + "str r4, [%[a], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[b], #80]\n\t" + "ldr r6, [%[b], #84]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #80]\n\t" + "str r4, [%[a], #84]\n\t" + "ldr r3, [%[a], #88]\n\t" + "ldr r4, [%[a], #92]\n\t" + "ldr r5, [%[b], #88]\n\t" + "ldr r6, [%[b], #92]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #88]\n\t" + "str r4, [%[a], #92]\n\t" + "ldr r3, [%[a], #96]\n\t" + "ldr r4, [%[a], #100]\n\t" + "ldr r5, [%[b], #96]\n\t" + "ldr r6, [%[b], #100]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #96]\n\t" + "str r4, [%[a], #100]\n\t" + "ldr r3, [%[a], #104]\n\t" + "ldr r4, [%[a], #108]\n\t" + "ldr r5, [%[b], #104]\n\t" + "ldr r6, [%[b], #108]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #104]\n\t" + "str r4, [%[a], #108]\n\t" + "ldr r3, [%[a], #112]\n\t" + "ldr r4, [%[a], #116]\n\t" + "ldr r5, [%[b], #112]\n\t" + "ldr r6, [%[b], #116]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #112]\n\t" + "str r4, [%[a], #116]\n\t" + "ldr r3, [%[a], #120]\n\t" + "ldr r4, [%[a], #124]\n\t" + "ldr r5, [%[b], #120]\n\t" + "ldr r6, [%[b], #124]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #120]\n\t" + "str r4, [%[a], #124]\n\t" + "sbc %[c], %[c]\n\t" + "add %[a], #0x80\n\t" + "add %[b], #0x80\n\t" + "mov r5, #0\n\t" + "sub r5, %[c]\n\t" + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b], #0]\n\t" + "ldr r6, [%[b], #4]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #0]\n\t" + "str r4, [%[a], #4]\n\t" + "ldr r3, [%[a], #8]\n\t" + "ldr r4, [%[a], #12]\n\t" + "ldr r5, [%[b], #8]\n\t" + "ldr r6, [%[b], #12]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #8]\n\t" + "str r4, [%[a], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[b], #16]\n\t" + "ldr r6, [%[b], #20]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #16]\n\t" + "str r4, [%[a], #20]\n\t" + "ldr r3, [%[a], #24]\n\t" + "ldr r4, [%[a], #28]\n\t" + "ldr r5, [%[b], #24]\n\t" + "ldr r6, [%[b], #28]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #24]\n\t" + "str r4, [%[a], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #32]\n\t" + "ldr r6, [%[b], #36]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #32]\n\t" + "str r4, [%[a], #36]\n\t" + "ldr r3, [%[a], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #40]\n\t" + "ldr r6, [%[b], #44]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #40]\n\t" + "str r4, [%[a], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[b], #48]\n\t" + "ldr r6, [%[b], #52]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #48]\n\t" + "str r4, [%[a], #52]\n\t" + "ldr r3, [%[a], #56]\n\t" + "ldr r4, [%[a], #60]\n\t" + "ldr r5, [%[b], #56]\n\t" + "ldr r6, [%[b], #60]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a], #56]\n\t" + "str r4, [%[a], #60]\n\t" + "sbc %[c], %[c]\n\t" + : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + + return c; } /* Add b to a into r. 
(r = a + b) @@ -5927,6 +6349,95 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) + r[i] = a[i] & m; +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit b1[24]; + sp_digit z2[48]; + sp_digit u, ca, cb; + + ca = sp_3072_add_24(a1, a, &a[24]); + cb = sp_3072_add_24(b1, b, &b[24]); + u = ca & cb; + sp_3072_mul_24(z1, a1, b1); + sp_3072_mul_24(z2, &a[24], &b[24]); + sp_3072_mul_24(z0, a, b); + sp_3072_mask_24(r + 48, a1, 0 - cb); + sp_3072_mask_24(b1, b1, 0 - ca); + u += sp_3072_add_24(r + 48, r + 48, b1); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z2[48]; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit u; + + u = sp_3072_add_24(a1, a, &a[24]); + sp_3072_sqr_24(z1, a1); + sp_3072_sqr_24(z2, &a[24]); + sp_3072_sqr_24(z0, a); + sp_3072_mask_24(r + 48, a1, 0 - u); + u += sp_3072_add_24(r + 48, r + 48, r + 48); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + /* Sub b from a into r. (r = a - b) * * r A single precision integer. @@ -6768,7 +7279,7 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -7209,7 +7720,7 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -7256,6 +7767,44 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. 
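+ *
+ * Sketch of the borrow handling in the loop body below: the saved borrow
+ * (%[c] is 0 or -1) is turned back into the carry flag by
+ * "mov r5, #0; sub r5, %[c]" at the top of each iteration, and captured
+ * again at the bottom with "sbc %[c], %[c]". r7 holds the end address
+ * (a + 192 bytes = 48 words).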
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + __asm__ __volatile__ ( + "mov r7, %[a]\n\t" + "add r7, #192\n\t" + "\n1:\n\t" + "mov r5, #0\n\t" + "sub r5, %[c]\n\t" + "ldr r3, [%[a]]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b]]\n\t" + "ldr r6, [%[b], #4]\n\t" + "sbc r3, r5\n\t" + "sbc r4, r6\n\t" + "str r3, [%[a]]\n\t" + "str r4, [%[a], #4]\n\t" + "sbc %[c], %[c]\n\t" + "add %[a], #8\n\t" + "add %[b], #8\n\t" + "cmp %[a], r7\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); + + return c; +} + #endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) @@ -7608,263 +8157,6 @@ SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, } #if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r7, %[a]\n\t" - "add r7, #192\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "sub r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c]\n\t" - "add %[a], #8\n\t" - "add %[b], #8\n\t" - "cmp %[a], r7\n\t" - "bne 1b\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r7" - ); - - return c; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #0]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sub r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #0]\n\t" - "str r4, [%[a], #4]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #8]\n\t" - "ldr r6, [%[b], #12]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #8]\n\t" - "str r4, [%[a], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #16]\n\t" - "ldr r6, [%[b], #20]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #16]\n\t" - "str r4, [%[a], #20]\n\t" - "ldr r3, [%[a], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #24]\n\t" - "ldr r6, [%[b], #28]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #24]\n\t" - "str r4, [%[a], #28]\n\t" - "ldr r3, [%[a], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #32]\n\t" - "ldr r6, [%[b], #36]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #32]\n\t" - "str r4, [%[a], #36]\n\t" - "ldr r3, [%[a], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #40]\n\t" - "ldr r6, [%[b], #44]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #40]\n\t" - "str r4, [%[a], #44]\n\t" - "ldr r3, [%[a], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #48]\n\t" - "ldr r6, [%[b], #52]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #48]\n\t" - "str r4, [%[a], #52]\n\t" - "ldr r3, [%[a], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #56]\n\t" - "ldr r6, [%[b], #60]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #56]\n\t" - "str r4, [%[a], #60]\n\t" - "ldr r3, [%[a], #64]\n\t" - "ldr r4, [%[a], #68]\n\t" - "ldr r5, [%[b], #64]\n\t" - "ldr r6, [%[b], #68]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #64]\n\t" - "str r4, [%[a], #68]\n\t" - "ldr r3, [%[a], #72]\n\t" - "ldr r4, [%[a], #76]\n\t" - "ldr r5, [%[b], #72]\n\t" - "ldr r6, [%[b], #76]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #72]\n\t" - "str r4, [%[a], #76]\n\t" - "ldr r3, [%[a], #80]\n\t" - "ldr r4, [%[a], #84]\n\t" - "ldr r5, [%[b], #80]\n\t" - "ldr r6, [%[b], #84]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #80]\n\t" - "str r4, [%[a], #84]\n\t" - "ldr r3, [%[a], #88]\n\t" - "ldr r4, [%[a], #92]\n\t" - "ldr r5, [%[b], #88]\n\t" - "ldr r6, [%[b], #92]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #88]\n\t" - "str r4, [%[a], #92]\n\t" - "ldr r3, [%[a], #96]\n\t" - "ldr r4, [%[a], #100]\n\t" - "ldr r5, [%[b], #96]\n\t" - "ldr r6, [%[b], #100]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #96]\n\t" - "str r4, [%[a], #100]\n\t" - "ldr r3, [%[a], #104]\n\t" - "ldr r4, [%[a], #108]\n\t" - "ldr r5, [%[b], #104]\n\t" - "ldr r6, [%[b], #108]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #104]\n\t" - "str r4, [%[a], #108]\n\t" - "ldr r3, [%[a], #112]\n\t" - "ldr r4, [%[a], #116]\n\t" - "ldr r5, [%[b], #112]\n\t" - "ldr r6, [%[b], #116]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #112]\n\t" - "str r4, [%[a], #116]\n\t" - "ldr r3, [%[a], #120]\n\t" - "ldr r4, [%[a], #124]\n\t" - "ldr r5, [%[b], #120]\n\t" - "ldr r6, [%[b], #124]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #120]\n\t" - "str r4, [%[a], #124]\n\t" - "sbc %[c], %[c]\n\t" - "add %[a], #0x80\n\t" - "add %[b], #0x80\n\t" - "mov r5, #0\n\t" - "sub r5, 
%[c]\n\t" - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #0]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #0]\n\t" - "str r4, [%[a], #4]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #8]\n\t" - "ldr r6, [%[b], #12]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #8]\n\t" - "str r4, [%[a], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #16]\n\t" - "ldr r6, [%[b], #20]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #16]\n\t" - "str r4, [%[a], #20]\n\t" - "ldr r3, [%[a], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #24]\n\t" - "ldr r6, [%[b], #28]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #24]\n\t" - "str r4, [%[a], #28]\n\t" - "ldr r3, [%[a], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #32]\n\t" - "ldr r6, [%[b], #36]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #32]\n\t" - "str r4, [%[a], #36]\n\t" - "ldr r3, [%[a], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #40]\n\t" - "ldr r6, [%[b], #44]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #40]\n\t" - "str r4, [%[a], #44]\n\t" - "ldr r3, [%[a], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #48]\n\t" - "ldr r6, [%[b], #52]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #48]\n\t" - "str r4, [%[a], #52]\n\t" - "ldr r3, [%[a], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #56]\n\t" - "ldr r6, [%[b], #60]\n\t" - "sbc r3, r5\n\t" - "sbc r4, r6\n\t" - "str r3, [%[a], #56]\n\t" - "str r4, [%[a], #60]\n\t" - "sbc %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6" - ); - - return c; -} - -#endif /* WOLFSSL_SP_SMALL */ /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -8468,9 +8760,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -8500,10 +8795,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -8611,9 +8902,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -8644,10 +8938,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -9048,7 +9338,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_3072_mask_96(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -9295,9 +9585,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -9327,10 +9620,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -9438,9 +9727,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -9471,10 +9763,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -9787,7 +10075,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 96; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -9844,6 +10132,713 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +static void sp_3072_lshift_96(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "add %[a], %[a], #255\n\t" + "add %[r], %[r], #255\n\t" + "add %[a], %[a], #65\n\t" + "add %[r], %[r], #65\n\t" + "ldr r3, [%[a], #60]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" 
+ "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" 
+ "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r3, [%[a], #60]\n\t" + "str r2, [%[r], #68]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + 
"lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, 
[%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "str r4, [%[r]]\n\t" + "str r2, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_2_96(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[192]; + sp_digit td[97]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 289, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 192; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_3072_lshift_96(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_lshift_96(r, r, y); + sp_3072_mul_d_96(tmp, norm, r[96]); + r[96] = 0; + o = sp_3072_add_96(r, r, tmp); + sp_3072_cond_sub_96(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -9874,7 +10869,13 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 96, exp, expLen); sp_3072_from_mp(m, 96, mod); - err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && m[95] == (sp_digit)-1) + err = sp_3072_mod_exp_2_96(r, e, expLen * 8, m); + else + #endif + err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -10139,14 +11140,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -10240,7 +11241,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 8; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -15087,7 +16088,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. 
 */
-static void sp_256_mask_8(sp_digit* r, sp_digit* a, sp_digit m)
+static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c
index 444f53781..09f810b60 100644
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -3080,7 +3080,7 @@ static int sp_2048_mod_exp_90(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
  * a A single precision integer.
  * m Mask to AND against each digit.
  */
-static void sp_2048_mask_45(sp_digit* r, sp_digit* a, sp_digit m)
+static void sp_2048_mask_45(sp_digit* r, const sp_digit* a, sp_digit m)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
@@ -3720,6 +3720,213 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
 }
 
 #ifdef WOLFSSL_HAVE_SP_DH
+
+#ifdef HAVE_FFDHE_2048
+SP_NOINLINE static void sp_2048_lshift_90(sp_digit* r, sp_digit* a, byte n)
+{
+#ifdef WOLFSSL_SP_SMALL
+    int i;
+
+    r[90] = a[89] >> (23 - n);
+    for (i=89; i>0; i--)
+        r[i] = ((a[i] << n) | (a[i-1] >> (23 - n))) & 0x7fffff;
+#else
+    r[90] = a[89] >> (23 - n);
+    r[89] = ((a[89] << n) | (a[88] >> (23 - n))) & 0x7fffff;
+    r[88] = ((a[88] << n) | (a[87] >> (23 - n))) & 0x7fffff;
+    r[87] = ((a[87] << n) | (a[86] >> (23 - n))) & 0x7fffff;
+    r[86] = ((a[86] << n) | (a[85] >> (23 - n))) & 0x7fffff;
+    r[85] = ((a[85] << n) | (a[84] >> (23 - n))) & 0x7fffff;
+    r[84] = ((a[84] << n) | (a[83] >> (23 - n))) & 0x7fffff;
+    r[83] = ((a[83] << n) | (a[82] >> (23 - n))) & 0x7fffff;
+    r[82] = ((a[82] << n) | (a[81] >> (23 - n))) & 0x7fffff;
+    r[81] = ((a[81] << n) | (a[80] >> (23 - n))) & 0x7fffff;
+    r[80] = ((a[80] << n) | (a[79] >> (23 - n))) & 0x7fffff;
+    r[79] = ((a[79] << n) | (a[78] >> (23 - n))) & 0x7fffff;
+    r[78] = ((a[78] << n) | (a[77] >> (23 - n))) & 0x7fffff;
+    r[77] = ((a[77] << n) | (a[76] >> (23 - n))) & 0x7fffff;
+    r[76] = ((a[76] << n) | (a[75] >> (23 - n))) & 0x7fffff;
+    r[75] = ((a[75] << n) | (a[74] >> (23 - n))) & 0x7fffff;
+    r[74] = ((a[74] << n) | (a[73] >> (23 - n))) & 0x7fffff;
+    r[73] = ((a[73] << n) | (a[72] >> (23 - n))) & 0x7fffff;
+    r[72] = ((a[72] << n) | (a[71] >> (23 - n))) & 0x7fffff;
+    r[71] = ((a[71] << n) | (a[70] >> (23 - n))) & 0x7fffff;
+    r[70] = ((a[70] << n) | (a[69] >> (23 - n))) & 0x7fffff;
+    r[69] = ((a[69] << n) | (a[68] >> (23 - n))) & 0x7fffff;
+    r[68] = ((a[68] << n) | (a[67] >> (23 - n))) & 0x7fffff;
+    r[67] = ((a[67] << n) | (a[66] >> (23 - n))) & 0x7fffff;
+    r[66] = ((a[66] << n) | (a[65] >> (23 - n))) & 0x7fffff;
+    r[65] = ((a[65] << n) | (a[64] >> (23 - n))) & 0x7fffff;
+    r[64] = ((a[64] << n) | (a[63] >> (23 - n))) & 0x7fffff;
+    r[63] = ((a[63] << n) | (a[62] >> (23 - n))) & 0x7fffff;
+    r[62] = ((a[62] << n) | (a[61] >> (23 - n))) & 0x7fffff;
+    r[61] = ((a[61] << n) | (a[60] >> (23 - n))) & 0x7fffff;
+    r[60] = ((a[60] << n) | (a[59] >> (23 - n))) & 0x7fffff;
+    r[59] = ((a[59] << n) | (a[58] >> (23 - n))) & 0x7fffff;
+    r[58] = ((a[58] << n) | (a[57] >> (23 - n))) & 0x7fffff;
+    r[57] = ((a[57] << n) | (a[56] >> (23 - n))) & 0x7fffff;
+    r[56] = ((a[56] << n) | (a[55] >> (23 - n))) & 0x7fffff;
+    r[55] = ((a[55] << n) | (a[54] >> (23 - n))) & 0x7fffff;
+    r[54] = ((a[54] << n) | (a[53] >> (23 - n))) & 0x7fffff;
+    r[53] = ((a[53] << n) | (a[52] >> (23 - n))) & 0x7fffff;
+    r[52] = ((a[52] << n) | (a[51] >> (23 - n))) & 0x7fffff;
+    r[51] = ((a[51] << n) | (a[50] >> (23 - n))) & 0x7fffff;
+    r[50] = ((a[50] << n) | (a[49] >> (23 - n))) & 0x7fffff;
+    r[49] = ((a[49] << n) | (a[48] >> (23 - n))) & 0x7fffff;
+    r[48] = ((a[48] << n) | (a[47] >> (23 - n))) &
0x7fffff; + r[47] = ((a[47] << n) | (a[46] >> (23 - n))) & 0x7fffff; + r[46] = ((a[46] << n) | (a[45] >> (23 - n))) & 0x7fffff; + r[45] = ((a[45] << n) | (a[44] >> (23 - n))) & 0x7fffff; + r[44] = ((a[44] << n) | (a[43] >> (23 - n))) & 0x7fffff; + r[43] = ((a[43] << n) | (a[42] >> (23 - n))) & 0x7fffff; + r[42] = ((a[42] << n) | (a[41] >> (23 - n))) & 0x7fffff; + r[41] = ((a[41] << n) | (a[40] >> (23 - n))) & 0x7fffff; + r[40] = ((a[40] << n) | (a[39] >> (23 - n))) & 0x7fffff; + r[39] = ((a[39] << n) | (a[38] >> (23 - n))) & 0x7fffff; + r[38] = ((a[38] << n) | (a[37] >> (23 - n))) & 0x7fffff; + r[37] = ((a[37] << n) | (a[36] >> (23 - n))) & 0x7fffff; + r[36] = ((a[36] << n) | (a[35] >> (23 - n))) & 0x7fffff; + r[35] = ((a[35] << n) | (a[34] >> (23 - n))) & 0x7fffff; + r[34] = ((a[34] << n) | (a[33] >> (23 - n))) & 0x7fffff; + r[33] = ((a[33] << n) | (a[32] >> (23 - n))) & 0x7fffff; + r[32] = ((a[32] << n) | (a[31] >> (23 - n))) & 0x7fffff; + r[31] = ((a[31] << n) | (a[30] >> (23 - n))) & 0x7fffff; + r[30] = ((a[30] << n) | (a[29] >> (23 - n))) & 0x7fffff; + r[29] = ((a[29] << n) | (a[28] >> (23 - n))) & 0x7fffff; + r[28] = ((a[28] << n) | (a[27] >> (23 - n))) & 0x7fffff; + r[27] = ((a[27] << n) | (a[26] >> (23 - n))) & 0x7fffff; + r[26] = ((a[26] << n) | (a[25] >> (23 - n))) & 0x7fffff; + r[25] = ((a[25] << n) | (a[24] >> (23 - n))) & 0x7fffff; + r[24] = ((a[24] << n) | (a[23] >> (23 - n))) & 0x7fffff; + r[23] = ((a[23] << n) | (a[22] >> (23 - n))) & 0x7fffff; + r[22] = ((a[22] << n) | (a[21] >> (23 - n))) & 0x7fffff; + r[21] = ((a[21] << n) | (a[20] >> (23 - n))) & 0x7fffff; + r[20] = ((a[20] << n) | (a[19] >> (23 - n))) & 0x7fffff; + r[19] = ((a[19] << n) | (a[18] >> (23 - n))) & 0x7fffff; + r[18] = ((a[18] << n) | (a[17] >> (23 - n))) & 0x7fffff; + r[17] = ((a[17] << n) | (a[16] >> (23 - n))) & 0x7fffff; + r[16] = ((a[16] << n) | (a[15] >> (23 - n))) & 0x7fffff; + r[15] = ((a[15] << n) | (a[14] >> (23 - n))) & 0x7fffff; + r[14] = ((a[14] << n) | (a[13] >> (23 - n))) & 0x7fffff; + r[13] = ((a[13] << n) | (a[12] >> (23 - n))) & 0x7fffff; + r[12] = ((a[12] << n) | (a[11] >> (23 - n))) & 0x7fffff; + r[11] = ((a[11] << n) | (a[10] >> (23 - n))) & 0x7fffff; + r[10] = ((a[10] << n) | (a[9] >> (23 - n))) & 0x7fffff; + r[9] = ((a[9] << n) | (a[8] >> (23 - n))) & 0x7fffff; + r[8] = ((a[8] << n) | (a[7] >> (23 - n))) & 0x7fffff; + r[7] = ((a[7] << n) | (a[6] >> (23 - n))) & 0x7fffff; + r[6] = ((a[6] << n) | (a[5] >> (23 - n))) & 0x7fffff; + r[5] = ((a[5] << n) | (a[4] >> (23 - n))) & 0x7fffff; + r[4] = ((a[4] << n) | (a[3] >> (23 - n))) & 0x7fffff; + r[3] = ((a[3] << n) | (a[2] >> (23 - n))) & 0x7fffff; + r[2] = ((a[2] << n) | (a[1] >> (23 - n))) & 0x7fffff; + r[1] = ((a[1] << n) | (a[0] >> (23 - n))) & 0x7fffff; +#endif + r[0] = (a[0] << n) & 0x7fffff; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
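+ *
+ * Sketch: the same window method with 4-bit windows over 23-bit digits,
+ * so each step computes r = r^16 * 2^y mod m. The bits at or above
+ * 2^2048 (bit 1 of word 89 and up) are extracted as
+ * (r[90] << 22) + (r[89] >> 1) and folded back in via
+ * norm = 2^2048 mod m.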
+ */ +static int sp_2048_mod_exp_2_90(sp_digit* r, sp_digit* e, int bits, sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[180]; + sp_digit td[91]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 271, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 180; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + XMEMSET(td, 0, sizeof(td)); + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_90(norm, m); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 22) / 23) - 1; + c = bits % 23; + if (c == 0) + c = 23; + if (i < 90) + n = e[i--] << (32 - c); + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (9 - c); + c += 23; + } + y = (n >> 28) & 0xf; + n <<= 4; + c -= 4; + sp_2048_lshift_90(r, norm, y); + for (; i>=0 || c>=4; ) { + if (c < 4) { + n |= e[i--] << (9 - c); + c += 23; + } + y = (n >> 28) & 0xf; + n <<= 4; + c -= 4; + + sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_90(r, r, m, mp); + + sp_2048_lshift_90(r, r, y); + sp_2048_mul_d_90(tmp, norm, (r[90] << 22) + (r[89] >> 1)); + r[90] = 0; + r[89] &= 0x1L; + sp_2048_add_90(r, r, tmp); + sp_2048_norm_90(r); + o = sp_2048_cmp_90(r, m); + sp_2048_cond_sub_90(r, r, m, (o < 0) - 1); + } + + sp_2048_mont_reduce_90(r, m, mp); + n = sp_2048_cmp_90(r, m); + sp_2048_cond_sub_90(r, r, m, (n < 0) - 1); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -3765,7 +3972,14 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 90, exp, expLen); sp_2048_from_mp(m, 90, mod); - err = sp_2048_mod_exp_90(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && + ((m[89] << 15) | (m[88] >> 8)) == 0xffffL) { + err = sp_2048_mod_exp_2_90(r, e, expLen * 8, m); + } + else + #endif + err = sp_2048_mod_exp_90(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -3824,7 +4038,14 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 90, exp, expLen); sp_2048_from_mp(m, 90, mod); - err = sp_2048_mod_exp_90(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && + ((m[89] << 15) | (m[88] >> 8)) == 0xffffL) { + err = sp_2048_mod_exp_2_90(r, e, expLen * 8, m); + } + else + #endif + err = sp_2048_mod_exp_90(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -6427,7 +6648,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, sp_digit* a, sp_digit* e, int bits, * a A single precision integer. * m Mask to AND against each digit. 
 */
-static void sp_3072_mask_67(sp_digit* r, sp_digit* a, sp_digit m)
+static void sp_3072_mask_67(sp_digit* r, const sp_digit* a, sp_digit m)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
@@ -7065,6 +7286,257 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
 }
 
 #ifdef WOLFSSL_HAVE_SP_DH
+
+#ifdef HAVE_FFDHE_3072
+SP_NOINLINE static void sp_3072_lshift_134(sp_digit* r, sp_digit* a, byte n)
+{
+#ifdef WOLFSSL_SP_SMALL
+    int i;
+
+    r[134] = a[133] >> (23 - n);
+    for (i=133; i>0; i--)
+        r[i] = ((a[i] << n) | (a[i-1] >> (23 - n))) & 0x7fffff;
+#else
+    r[134] = a[133] >> (23 - n);
+    r[133] = ((a[133] << n) | (a[132] >> (23 - n))) & 0x7fffff;
+    r[132] = ((a[132] << n) | (a[131] >> (23 - n))) & 0x7fffff;
+    r[131] = ((a[131] << n) | (a[130] >> (23 - n))) & 0x7fffff;
+    r[130] = ((a[130] << n) | (a[129] >> (23 - n))) & 0x7fffff;
+    r[129] = ((a[129] << n) | (a[128] >> (23 - n))) & 0x7fffff;
+    r[128] = ((a[128] << n) | (a[127] >> (23 - n))) & 0x7fffff;
+    r[127] = ((a[127] << n) | (a[126] >> (23 - n))) & 0x7fffff;
+    r[126] = ((a[126] << n) | (a[125] >> (23 - n))) & 0x7fffff;
+    r[125] = ((a[125] << n) | (a[124] >> (23 - n))) & 0x7fffff;
+    r[124] = ((a[124] << n) | (a[123] >> (23 - n))) & 0x7fffff;
+    r[123] = ((a[123] << n) | (a[122] >> (23 - n))) & 0x7fffff;
+    r[122] = ((a[122] << n) | (a[121] >> (23 - n))) & 0x7fffff;
+    r[121] = ((a[121] << n) | (a[120] >> (23 - n))) & 0x7fffff;
+    r[120] = ((a[120] << n) | (a[119] >> (23 - n))) & 0x7fffff;
+    r[119] = ((a[119] << n) | (a[118] >> (23 - n))) & 0x7fffff;
+    r[118] = ((a[118] << n) | (a[117] >> (23 - n))) & 0x7fffff;
+    r[117] = ((a[117] << n) | (a[116] >> (23 - n))) & 0x7fffff;
+    r[116] = ((a[116] << n) | (a[115] >> (23 - n))) & 0x7fffff;
+    r[115] = ((a[115] << n) | (a[114] >> (23 - n))) & 0x7fffff;
+    r[114] = ((a[114] << n) | (a[113] >> (23 - n))) & 0x7fffff;
+    r[113] = ((a[113] << n) | (a[112] >> (23 - n))) & 0x7fffff;
+    r[112] = ((a[112] << n) | (a[111] >> (23 - n))) & 0x7fffff;
+    r[111] = ((a[111] << n) | (a[110] >> (23 - n))) & 0x7fffff;
+    r[110] = ((a[110] << n) | (a[109] >> (23 - n))) & 0x7fffff;
+    r[109] = ((a[109] << n) | (a[108] >> (23 - n))) & 0x7fffff;
+    r[108] = ((a[108] << n) | (a[107] >> (23 - n))) & 0x7fffff;
+    r[107] = ((a[107] << n) | (a[106] >> (23 - n))) & 0x7fffff;
+    r[106] = ((a[106] << n) | (a[105] >> (23 - n))) & 0x7fffff;
+    r[105] = ((a[105] << n) | (a[104] >> (23 - n))) & 0x7fffff;
+    r[104] = ((a[104] << n) | (a[103] >> (23 - n))) & 0x7fffff;
+    r[103] = ((a[103] << n) | (a[102] >> (23 - n))) & 0x7fffff;
+    r[102] = ((a[102] << n) | (a[101] >> (23 - n))) & 0x7fffff;
+    r[101] = ((a[101] << n) | (a[100] >> (23 - n))) & 0x7fffff;
+    r[100] = ((a[100] << n) | (a[99] >> (23 - n))) & 0x7fffff;
+    r[99] = ((a[99] << n) | (a[98] >> (23 - n))) & 0x7fffff;
+    r[98] = ((a[98] << n) | (a[97] >> (23 - n))) & 0x7fffff;
+    r[97] = ((a[97] << n) | (a[96] >> (23 - n))) & 0x7fffff;
+    r[96] = ((a[96] << n) | (a[95] >> (23 - n))) & 0x7fffff;
+    r[95] = ((a[95] << n) | (a[94] >> (23 - n))) & 0x7fffff;
+    r[94] = ((a[94] << n) | (a[93] >> (23 - n))) & 0x7fffff;
+    r[93] = ((a[93] << n) | (a[92] >> (23 - n))) & 0x7fffff;
+    r[92] = ((a[92] << n) | (a[91] >> (23 - n))) & 0x7fffff;
+    r[91] = ((a[91] << n) | (a[90] >> (23 - n))) & 0x7fffff;
+    r[90] = ((a[90] << n) | (a[89] >> (23 - n))) & 0x7fffff;
+    r[89] = ((a[89] << n) | (a[88] >> (23 - n))) & 0x7fffff;
+    r[88] = ((a[88] << n) | (a[87] >> (23 - n))) & 0x7fffff;
+    r[87] = ((a[87] << n) | (a[86] >> (23 - n))) & 0x7fffff;
+    r[86] = ((a[86] << n) | (a[85] >> (23 - n))) & 0x7fffff;
+    r[85] = ((a[85] << n) |
(a[84] >> (23 - n))) & 0x7fffff; + r[84] = ((a[84] << n) | (a[83] >> (23 - n))) & 0x7fffff; + r[83] = ((a[83] << n) | (a[82] >> (23 - n))) & 0x7fffff; + r[82] = ((a[82] << n) | (a[81] >> (23 - n))) & 0x7fffff; + r[81] = ((a[81] << n) | (a[80] >> (23 - n))) & 0x7fffff; + r[80] = ((a[80] << n) | (a[79] >> (23 - n))) & 0x7fffff; + r[79] = ((a[79] << n) | (a[78] >> (23 - n))) & 0x7fffff; + r[78] = ((a[78] << n) | (a[77] >> (23 - n))) & 0x7fffff; + r[77] = ((a[77] << n) | (a[76] >> (23 - n))) & 0x7fffff; + r[76] = ((a[76] << n) | (a[75] >> (23 - n))) & 0x7fffff; + r[75] = ((a[75] << n) | (a[74] >> (23 - n))) & 0x7fffff; + r[74] = ((a[74] << n) | (a[73] >> (23 - n))) & 0x7fffff; + r[73] = ((a[73] << n) | (a[72] >> (23 - n))) & 0x7fffff; + r[72] = ((a[72] << n) | (a[71] >> (23 - n))) & 0x7fffff; + r[71] = ((a[71] << n) | (a[70] >> (23 - n))) & 0x7fffff; + r[70] = ((a[70] << n) | (a[69] >> (23 - n))) & 0x7fffff; + r[69] = ((a[69] << n) | (a[68] >> (23 - n))) & 0x7fffff; + r[68] = ((a[68] << n) | (a[67] >> (23 - n))) & 0x7fffff; + r[67] = ((a[67] << n) | (a[66] >> (23 - n))) & 0x7fffff; + r[66] = ((a[66] << n) | (a[65] >> (23 - n))) & 0x7fffff; + r[65] = ((a[65] << n) | (a[64] >> (23 - n))) & 0x7fffff; + r[64] = ((a[64] << n) | (a[63] >> (23 - n))) & 0x7fffff; + r[63] = ((a[63] << n) | (a[62] >> (23 - n))) & 0x7fffff; + r[62] = ((a[62] << n) | (a[61] >> (23 - n))) & 0x7fffff; + r[61] = ((a[61] << n) | (a[60] >> (23 - n))) & 0x7fffff; + r[60] = ((a[60] << n) | (a[59] >> (23 - n))) & 0x7fffff; + r[59] = ((a[59] << n) | (a[58] >> (23 - n))) & 0x7fffff; + r[58] = ((a[58] << n) | (a[57] >> (23 - n))) & 0x7fffff; + r[57] = ((a[57] << n) | (a[56] >> (23 - n))) & 0x7fffff; + r[56] = ((a[56] << n) | (a[55] >> (23 - n))) & 0x7fffff; + r[55] = ((a[55] << n) | (a[54] >> (23 - n))) & 0x7fffff; + r[54] = ((a[54] << n) | (a[53] >> (23 - n))) & 0x7fffff; + r[53] = ((a[53] << n) | (a[52] >> (23 - n))) & 0x7fffff; + r[52] = ((a[52] << n) | (a[51] >> (23 - n))) & 0x7fffff; + r[51] = ((a[51] << n) | (a[50] >> (23 - n))) & 0x7fffff; + r[50] = ((a[50] << n) | (a[49] >> (23 - n))) & 0x7fffff; + r[49] = ((a[49] << n) | (a[48] >> (23 - n))) & 0x7fffff; + r[48] = ((a[48] << n) | (a[47] >> (23 - n))) & 0x7fffff; + r[47] = ((a[47] << n) | (a[46] >> (23 - n))) & 0x7fffff; + r[46] = ((a[46] << n) | (a[45] >> (23 - n))) & 0x7fffff; + r[45] = ((a[45] << n) | (a[44] >> (23 - n))) & 0x7fffff; + r[44] = ((a[44] << n) | (a[43] >> (23 - n))) & 0x7fffff; + r[43] = ((a[43] << n) | (a[42] >> (23 - n))) & 0x7fffff; + r[42] = ((a[42] << n) | (a[41] >> (23 - n))) & 0x7fffff; + r[41] = ((a[41] << n) | (a[40] >> (23 - n))) & 0x7fffff; + r[40] = ((a[40] << n) | (a[39] >> (23 - n))) & 0x7fffff; + r[39] = ((a[39] << n) | (a[38] >> (23 - n))) & 0x7fffff; + r[38] = ((a[38] << n) | (a[37] >> (23 - n))) & 0x7fffff; + r[37] = ((a[37] << n) | (a[36] >> (23 - n))) & 0x7fffff; + r[36] = ((a[36] << n) | (a[35] >> (23 - n))) & 0x7fffff; + r[35] = ((a[35] << n) | (a[34] >> (23 - n))) & 0x7fffff; + r[34] = ((a[34] << n) | (a[33] >> (23 - n))) & 0x7fffff; + r[33] = ((a[33] << n) | (a[32] >> (23 - n))) & 0x7fffff; + r[32] = ((a[32] << n) | (a[31] >> (23 - n))) & 0x7fffff; + r[31] = ((a[31] << n) | (a[30] >> (23 - n))) & 0x7fffff; + r[30] = ((a[30] << n) | (a[29] >> (23 - n))) & 0x7fffff; + r[29] = ((a[29] << n) | (a[28] >> (23 - n))) & 0x7fffff; + r[28] = ((a[28] << n) | (a[27] >> (23 - n))) & 0x7fffff; + r[27] = ((a[27] << n) | (a[26] >> (23 - n))) & 0x7fffff; + r[26] = ((a[26] << n) | (a[25] >> (23 - n))) & 0x7fffff; + r[25] = ((a[25] << n) | (a[24] >> (23 
- n))) & 0x7fffff; + r[24] = ((a[24] << n) | (a[23] >> (23 - n))) & 0x7fffff; + r[23] = ((a[23] << n) | (a[22] >> (23 - n))) & 0x7fffff; + r[22] = ((a[22] << n) | (a[21] >> (23 - n))) & 0x7fffff; + r[21] = ((a[21] << n) | (a[20] >> (23 - n))) & 0x7fffff; + r[20] = ((a[20] << n) | (a[19] >> (23 - n))) & 0x7fffff; + r[19] = ((a[19] << n) | (a[18] >> (23 - n))) & 0x7fffff; + r[18] = ((a[18] << n) | (a[17] >> (23 - n))) & 0x7fffff; + r[17] = ((a[17] << n) | (a[16] >> (23 - n))) & 0x7fffff; + r[16] = ((a[16] << n) | (a[15] >> (23 - n))) & 0x7fffff; + r[15] = ((a[15] << n) | (a[14] >> (23 - n))) & 0x7fffff; + r[14] = ((a[14] << n) | (a[13] >> (23 - n))) & 0x7fffff; + r[13] = ((a[13] << n) | (a[12] >> (23 - n))) & 0x7fffff; + r[12] = ((a[12] << n) | (a[11] >> (23 - n))) & 0x7fffff; + r[11] = ((a[11] << n) | (a[10] >> (23 - n))) & 0x7fffff; + r[10] = ((a[10] << n) | (a[9] >> (23 - n))) & 0x7fffff; + r[9] = ((a[9] << n) | (a[8] >> (23 - n))) & 0x7fffff; + r[8] = ((a[8] << n) | (a[7] >> (23 - n))) & 0x7fffff; + r[7] = ((a[7] << n) | (a[6] >> (23 - n))) & 0x7fffff; + r[6] = ((a[6] << n) | (a[5] >> (23 - n))) & 0x7fffff; + r[5] = ((a[5] << n) | (a[4] >> (23 - n))) & 0x7fffff; + r[4] = ((a[4] << n) | (a[3] >> (23 - n))) & 0x7fffff; + r[3] = ((a[3] << n) | (a[2] >> (23 - n))) & 0x7fffff; + r[2] = ((a[2] << n) | (a[1] >> (23 - n))) & 0x7fffff; + r[1] = ((a[1] << n) | (a[0] >> (23 - n))) & 0x7fffff; +#endif + r[0] = (a[0] << n) & 0x7fffff; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_2_134(sp_digit* r, sp_digit* e, int bits, sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[268]; + sp_digit td[135]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 403, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 268; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + XMEMSET(td, 0, sizeof(td)); + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_134(norm, m); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 22) / 23) - 1; + c = bits % 23; + if (c == 0) + c = 23; + if (i < 134) + n = e[i--] << (32 - c); + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (9 - c); + c += 23; + } + y = (n >> 28) & 0xf; + n <<= 4; + c -= 4; + sp_3072_lshift_134(r, norm, y); + for (; i>=0 || c>=4; ) { + if (c < 4) { + n |= e[i--] << (9 - c); + c += 23; + } + y = (n >> 28) & 0xf; + n <<= 4; + c -= 4; + + sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_134(r, r, m, mp); + + sp_3072_lshift_134(r, r, y); + sp_3072_mul_d_134(tmp, norm, (r[134] << 10) + (r[133] >> 13)); + r[134] = 0; + r[133] &= 0x1fffL; + sp_3072_add_134(r, r, tmp); + sp_3072_norm_134(r); + o = sp_3072_cmp_134(r, m); + sp_3072_cond_sub_134(r, r, m, (o < 0) - 1); + } + + sp_3072_mont_reduce_134(r, m, mp); + n = sp_3072_cmp_134(r, m); + sp_3072_cond_sub_134(r, r, m, (n < 0) - 1); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -7110,7 +7582,14 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 134, exp, expLen); sp_3072_from_mp(m, 134, mod); - err = sp_3072_mod_exp_134(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && + ((m[133] << 3) | (m[132] >> 20)) == 0xffffL) { + err = sp_3072_mod_exp_2_134(r, e, expLen * 8, m); + } + else + #endif + err = sp_3072_mod_exp_134(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -7169,7 +7648,14 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 134, exp, expLen); sp_3072_from_mp(m, 134, mod); - err = sp_3072_mod_exp_134(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && + ((m[133] << 3) | (m[132] >> 20)) == 0xffffL) { + err = sp_3072_mod_exp_2_134(r, e, expLen * 8, m); + } + else + #endif + err = sp_3072_mod_exp_134(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 801a7c01e..5db293b36 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -2594,7 +2594,7 @@ static int sp_2048_mod_exp_36(sp_digit* r, sp_digit* a, sp_digit* e, int bits, * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_18(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_18(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -3231,6 +3231,160 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +SP_NOINLINE static void sp_2048_lshift_36(sp_digit* r, sp_digit* a, byte n) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + r[36] = a[35] >> (57 - n); + for (i=35; i>0; i--) + r[i] = ((a[i] << n) | (a[i-1] >> (57 - n))) & 0x1ffffffffffffffl; +#else + r[36] = a[35] >> (57 - n); + r[35] = ((a[35] << n) | (a[34] >> (57 - n))) & 0x1ffffffffffffffl; + r[34] = ((a[34] << n) | (a[33] >> (57 - n))) & 0x1ffffffffffffffl; + r[33] = ((a[33] << n) | (a[32] >> (57 - n))) & 0x1ffffffffffffffl; + r[32] = ((a[32] << n) | (a[31] >> (57 - n))) & 0x1ffffffffffffffl; + r[31] = ((a[31] << n) | (a[30] >> (57 - n))) & 0x1ffffffffffffffl; + r[30] = ((a[30] << n) | (a[29] >> (57 - n))) & 0x1ffffffffffffffl; + r[29] = ((a[29] << n) | (a[28] >> (57 - n))) & 0x1ffffffffffffffl; + r[28] = ((a[28] << n) | (a[27] >> (57 - n))) & 0x1ffffffffffffffl; + r[27] = ((a[27] << n) | (a[26] >> (57 - n))) & 0x1ffffffffffffffl; + r[26] = ((a[26] << n) | (a[25] >> (57 - n))) & 0x1ffffffffffffffl; + r[25] = ((a[25] << n) | (a[24] >> (57 - n))) & 0x1ffffffffffffffl; + r[24] = ((a[24] << n) | (a[23] >> (57 - n))) & 0x1ffffffffffffffl; + r[23] = ((a[23] << n) | (a[22] >> (57 - n))) & 0x1ffffffffffffffl; + r[22] = ((a[22] << n) | (a[21] >> (57 - n))) & 0x1ffffffffffffffl; + r[21] = ((a[21] << n) | (a[20] >> (57 - n))) & 0x1ffffffffffffffl; + r[20] = ((a[20] << n) | (a[19] >> (57 - n))) & 0x1ffffffffffffffl; + r[19] = ((a[19] << n) | (a[18] >> (57 - n))) & 0x1ffffffffffffffl; + r[18] = ((a[18] << n) | (a[17] >> (57 - n))) & 0x1ffffffffffffffl; + r[17] = ((a[17] << n) | (a[16] >> (57 - n))) & 0x1ffffffffffffffl; + r[16] = ((a[16] << n) | (a[15] >> (57 - n))) & 0x1ffffffffffffffl; + r[15] = ((a[15] << n) | (a[14] >> (57 - n))) & 0x1ffffffffffffffl; + r[14] = ((a[14] << n) | (a[13] >> (57 - n))) & 0x1ffffffffffffffl; + r[13] = ((a[13] << n) | (a[12] >> (57 - n))) & 0x1ffffffffffffffl; + r[12] = ((a[12] << n) | (a[11] >> (57 - n))) & 0x1ffffffffffffffl; + r[11] = ((a[11] << n) | (a[10] >> (57 - n))) & 0x1ffffffffffffffl; + r[10] = ((a[10] << n) | (a[9] >> (57 - n))) & 0x1ffffffffffffffl; + r[9] = ((a[9] << n) | (a[8] >> (57 - n))) & 0x1ffffffffffffffl; + r[8] = ((a[8] << n) | (a[7] >> (57 - n))) & 0x1ffffffffffffffl; + r[7] = ((a[7] << n) | (a[6] >> (57 - n))) & 0x1ffffffffffffffl; + r[6] = ((a[6] << n) | (a[5] >> (57 - n))) & 0x1ffffffffffffffl; + r[5] = ((a[5] << n) | (a[4] >> (57 - n))) & 0x1ffffffffffffffl; + r[4] = ((a[4] << n) | (a[3] >> (57 - n))) & 0x1ffffffffffffffl; + r[3] = ((a[3] << n) | (a[2] >> (57 - n))) & 0x1ffffffffffffffl; + r[2] = ((a[2] << n) | (a[1] >> (57 - n))) & 0x1ffffffffffffffl; + r[1] = ((a[1] << n) | (a[0] >> (57 - n))) & 0x1ffffffffffffffl; +#endif + r[0] = (a[0] << n) & 0x1ffffffffffffffl; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_2_36(sp_digit* r, sp_digit* e, int bits, sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[72]; + sp_digit td[37]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 109, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 72; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + XMEMSET(td, 0, sizeof(td)); + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_36(norm, m); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 56) / 57) - 1; + c = bits % 57; + if (c == 0) + c = 57; + if (i < 36) + n = e[i--] << (64 - c); + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (7 - c); + c += 57; + } + y = (n >> 59) & 0x1f; + n <<= 5; + c -= 5; + sp_2048_lshift_36(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c < 5) { + n |= e[i--] << (7 - c); + c += 57; + } + y = (n >> 59) & 0x1f; + n <<= 5; + c -= 5; + + sp_2048_mont_sqr_36(r, r, m, mp); + sp_2048_mont_sqr_36(r, r, m, mp); + sp_2048_mont_sqr_36(r, r, m, mp); + sp_2048_mont_sqr_36(r, r, m, mp); + sp_2048_mont_sqr_36(r, r, m, mp); + + sp_2048_lshift_36(r, r, y); + sp_2048_mul_d_36(tmp, norm, (r[36] << 4) + (r[35] >> 53)); + r[36] = 0; + r[35] &= 0x1fffffffffffffL; + sp_2048_add_36(r, r, tmp); + sp_2048_norm_36(r); + o = sp_2048_cmp_36(r, m); + sp_2048_cond_sub_36(r, r, m, (o < 0) - 1); + } + + sp_2048_mont_reduce_36(r, m, mp); + n = sp_2048_cmp_36(r, m); + sp_2048_cond_sub_36(r, r, m, (n < 0) - 1); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -3276,7 +3430,14 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 36, exp, expLen); sp_2048_from_mp(m, 36, mod); - err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && + (m[35] >> 21) == 0xffffffffL) { + err = sp_2048_mod_exp_2_36(r, e, expLen * 8, m); + } + else + #endif + err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -3335,7 +3496,14 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 36, exp, expLen); sp_2048_from_mp(m, 36, mod); - err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && + (m[35] >> 21) == 0xffffffffL) { + err = sp_2048_mod_exp_2_36(r, e, expLen * 8, m); + } + else + #endif + err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -6196,7 +6364,7 @@ static int sp_3072_mod_exp_54(sp_digit* r, sp_digit* a, sp_digit* e, int bits, * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_3072_mask_27(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_27(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -6834,6 +7002,178 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +SP_NOINLINE static void sp_3072_lshift_54(sp_digit* r, sp_digit* a, byte n) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + r[54] = a[53] >> (57 - n); + for (i=53; i>0; i--) + r[i] = ((a[i] << n) | (a[i-1] >> (57 - n))) & 0x1ffffffffffffffl; +#else + r[54] = a[53] >> (57 - n); + r[53] = ((a[53] << n) | (a[52] >> (57 - n))) & 0x1ffffffffffffffl; + r[52] = ((a[52] << n) | (a[51] >> (57 - n))) & 0x1ffffffffffffffl; + r[51] = ((a[51] << n) | (a[50] >> (57 - n))) & 0x1ffffffffffffffl; + r[50] = ((a[50] << n) | (a[49] >> (57 - n))) & 0x1ffffffffffffffl; + r[49] = ((a[49] << n) | (a[48] >> (57 - n))) & 0x1ffffffffffffffl; + r[48] = ((a[48] << n) | (a[47] >> (57 - n))) & 0x1ffffffffffffffl; + r[47] = ((a[47] << n) | (a[46] >> (57 - n))) & 0x1ffffffffffffffl; + r[46] = ((a[46] << n) | (a[45] >> (57 - n))) & 0x1ffffffffffffffl; + r[45] = ((a[45] << n) | (a[44] >> (57 - n))) & 0x1ffffffffffffffl; + r[44] = ((a[44] << n) | (a[43] >> (57 - n))) & 0x1ffffffffffffffl; + r[43] = ((a[43] << n) | (a[42] >> (57 - n))) & 0x1ffffffffffffffl; + r[42] = ((a[42] << n) | (a[41] >> (57 - n))) & 0x1ffffffffffffffl; + r[41] = ((a[41] << n) | (a[40] >> (57 - n))) & 0x1ffffffffffffffl; + r[40] = ((a[40] << n) | (a[39] >> (57 - n))) & 0x1ffffffffffffffl; + r[39] = ((a[39] << n) | (a[38] >> (57 - n))) & 0x1ffffffffffffffl; + r[38] = ((a[38] << n) | (a[37] >> (57 - n))) & 0x1ffffffffffffffl; + r[37] = ((a[37] << n) | (a[36] >> (57 - n))) & 0x1ffffffffffffffl; + r[36] = ((a[36] << n) | (a[35] >> (57 - n))) & 0x1ffffffffffffffl; + r[35] = ((a[35] << n) | (a[34] >> (57 - n))) & 0x1ffffffffffffffl; + r[34] = ((a[34] << n) | (a[33] >> (57 - n))) & 0x1ffffffffffffffl; + r[33] = ((a[33] << n) | (a[32] >> (57 - n))) & 0x1ffffffffffffffl; + r[32] = ((a[32] << n) | (a[31] >> (57 - n))) & 0x1ffffffffffffffl; + r[31] = ((a[31] << n) | (a[30] >> (57 - n))) & 0x1ffffffffffffffl; + r[30] = ((a[30] << n) | (a[29] >> (57 - n))) & 0x1ffffffffffffffl; + r[29] = ((a[29] << n) | (a[28] >> (57 - n))) & 0x1ffffffffffffffl; + r[28] = ((a[28] << n) | (a[27] >> (57 - n))) & 0x1ffffffffffffffl; + r[27] = ((a[27] << n) | (a[26] >> (57 - n))) & 0x1ffffffffffffffl; + r[26] = ((a[26] << n) | (a[25] >> (57 - n))) & 0x1ffffffffffffffl; + r[25] = ((a[25] << n) | (a[24] >> (57 - n))) & 0x1ffffffffffffffl; + r[24] = ((a[24] << n) | (a[23] >> (57 - n))) & 0x1ffffffffffffffl; + r[23] = ((a[23] << n) | (a[22] >> (57 - n))) & 0x1ffffffffffffffl; + r[22] = ((a[22] << n) | (a[21] >> (57 - n))) & 0x1ffffffffffffffl; + r[21] = ((a[21] << n) | (a[20] >> (57 - n))) & 0x1ffffffffffffffl; + r[20] = ((a[20] << n) | (a[19] >> (57 - n))) & 0x1ffffffffffffffl; + r[19] = ((a[19] << n) | (a[18] >> (57 - n))) & 0x1ffffffffffffffl; + r[18] = ((a[18] << n) | (a[17] >> (57 - n))) & 0x1ffffffffffffffl; + r[17] = ((a[17] << n) | (a[16] >> (57 - n))) & 0x1ffffffffffffffl; + r[16] = ((a[16] << n) | (a[15] >> (57 - n))) & 0x1ffffffffffffffl; + r[15] = ((a[15] << n) | (a[14] >> (57 - n))) & 0x1ffffffffffffffl; + r[14] = ((a[14] << n) | (a[13] >> (57 - n))) & 0x1ffffffffffffffl; + r[13] = ((a[13] << n) | (a[12] >> (57 - n))) & 0x1ffffffffffffffl; + r[12] = ((a[12] << n) | (a[11] >> (57 - n))) & 0x1ffffffffffffffl; + r[11] = ((a[11] << n) | (a[10] >> (57 - n))) & 
0x1ffffffffffffffl; + r[10] = ((a[10] << n) | (a[9] >> (57 - n))) & 0x1ffffffffffffffl; + r[9] = ((a[9] << n) | (a[8] >> (57 - n))) & 0x1ffffffffffffffl; + r[8] = ((a[8] << n) | (a[7] >> (57 - n))) & 0x1ffffffffffffffl; + r[7] = ((a[7] << n) | (a[6] >> (57 - n))) & 0x1ffffffffffffffl; + r[6] = ((a[6] << n) | (a[5] >> (57 - n))) & 0x1ffffffffffffffl; + r[5] = ((a[5] << n) | (a[4] >> (57 - n))) & 0x1ffffffffffffffl; + r[4] = ((a[4] << n) | (a[3] >> (57 - n))) & 0x1ffffffffffffffl; + r[3] = ((a[3] << n) | (a[2] >> (57 - n))) & 0x1ffffffffffffffl; + r[2] = ((a[2] << n) | (a[1] >> (57 - n))) & 0x1ffffffffffffffl; + r[1] = ((a[1] << n) | (a[0] >> (57 - n))) & 0x1ffffffffffffffl; +#endif + r[0] = (a[0] << n) & 0x1ffffffffffffffl; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_2_54(sp_digit* r, sp_digit* e, int bits, sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[108]; + sp_digit td[55]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 163, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 108; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + XMEMSET(td, 0, sizeof(td)); + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_54(norm, m); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 56) / 57) - 1; + c = bits % 57; + if (c == 0) + c = 57; + if (i < 54) + n = e[i--] << (64 - c); + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (7 - c); + c += 57; + } + y = (n >> 59) & 0x1f; + n <<= 5; + c -= 5; + sp_3072_lshift_54(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c < 5) { + n |= e[i--] << (7 - c); + c += 57; + } + y = (n >> 59) & 0x1f; + n <<= 5; + c -= 5; + + sp_3072_mont_sqr_54(r, r, m, mp); + sp_3072_mont_sqr_54(r, r, m, mp); + sp_3072_mont_sqr_54(r, r, m, mp); + sp_3072_mont_sqr_54(r, r, m, mp); + sp_3072_mont_sqr_54(r, r, m, mp); + + sp_3072_lshift_54(r, r, y); + sp_3072_mul_d_54(tmp, norm, (r[54] << 6) + (r[53] >> 51)); + r[54] = 0; + r[53] &= 0x7ffffffffffffL; + sp_3072_add_54(r, r, tmp); + sp_3072_norm_54(r); + o = sp_3072_cmp_54(r, m); + sp_3072_cond_sub_54(r, r, m, (o < 0) - 1); + } + + sp_3072_mont_reduce_54(r, m, mp); + n = sp_3072_cmp_54(r, m); + sp_3072_cond_sub_54(r, r, m, (n < 0) - 1); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
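The shape shared by all of these mod_exp_2 functions is worth spelling out once: because the base is 2, the usual table of precomputed odd powers disappears. For each 5-bit window y of the exponent the code squares five times and then multiplies by 2^y with a plain left shift, folding the bits shifted past the modulus width back in with a single-digit multiply. A minimal sketch of the idea, illustrative only and not part of the patch, processing one exponent bit at a time with a toy 32-bit modulus and plain % in place of the Montgomery arithmetic:

#include <stdint.h>

/* Compute 2^e mod m (m > 0) for word-sized inputs, scanning the bits of
 * e left to right. Squaring handles the bit position; a shift by the
 * bit value replaces the multiplication by the base. */
static uint32_t mod_exp_2(uint32_t e, uint32_t m)
{
    int i = 31;
    uint32_t r;

    while (i >= 0 && ((e >> i) & 1) == 0)   /* skip leading zero bits */
        i--;
    if (i < 0)
        return 1 % m;                       /* e == 0: 2^0 */

    r = 2 % m;                              /* top set bit of e */
    for (i--; i >= 0; i--) {
        r = (uint32_t)(((uint64_t)r * r) % m);               /* square */
        r = (uint32_t)(((uint64_t)r << ((e >> i) & 1)) % m); /* shift  */
    }
    return r;
}

The real functions above do the same thing five bits at a time (five Montgomery squarings, then one sp_*_lshift_*) and replace each % with a single-digit multiply of the normalizer plus a conditional subtract.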
@@ -6879,7 +7219,14 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 54, exp, expLen); sp_3072_from_mp(m, 54, mod); - err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && + (m[53] >> 19) == 0xffffffffL) { + err = sp_3072_mod_exp_2_54(r, e, expLen * 8, m); + } + else + #endif + err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { @@ -6938,7 +7285,14 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 54, exp, expLen); sp_3072_from_mp(m, 54, mod); - err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && + (m[53] >> 19) == 0xffffffffL) { + err = sp_3072_mod_exp_2_54(r, e, expLen * 8, m); + } + else + #endif + err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index e9952bfd3..bf6eb34a0 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -107,14 +107,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -1314,7 +1314,7 @@ SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1698,7 +1698,7 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2355,7 +2355,7 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2708,7 +2708,7 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -3275,14 +3275,14 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" @@ -3467,9 +3467,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -3499,10 +3502,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -3610,9 +3609,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -3643,10 +3645,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -3872,14 +3870,14 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" @@ -3901,7 +3899,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_64(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -4146,9 +4144,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -4178,10 +4179,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -4289,9 +4286,12 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 64); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -4322,10 +4322,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_64(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_mul_64(r, r, t[y], m, mp); XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_2048_mont_reduce_64(r, m, mp); @@ -4638,7 +4634,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 64; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -4695,6 +4691,515 @@ int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +static void sp_2048_lshift_64(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "add %[a], %[a], #192\n\t" + "add %[r], %[r], #192\n\t" + "ldr r3, [%[a], #60]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, 
r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, 
r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r3, [%[a], #60]\n\t" + "str r2, [%[r], #68]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], 
#28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "str r3, [%[r]]\n\t" + "str r4, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_2048_mod_exp_2_64(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[128]; + sp_digit td[65]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 193, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 128; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_2048_lshift_64(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_lshift_64(r, r, y); + sp_2048_mul_d_64(tmp, norm, r[64]); + r[64] = 0; + o = sp_2048_add_64(r, r, tmp); + sp_2048_cond_sub_64(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
@@ -4725,7 +5230,13 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 64, exp, expLen); sp_2048_from_mp(m, 64, mod); - err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && m[63] == (sp_digit)-1) + err = sp_2048_mod_exp_2_64(r, e, expLen * 8, m); + else + #endif + err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -4843,14 +5354,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -4925,509 +5436,70 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit tmp[8]; + sp_digit tmp[12 * 2]; __asm__ __volatile__ ( - "mov r8, %[r]\n\t" + "mov r3, #0\n\t" + "mov r4, #0\n\t" + "mov r8, r3\n\t" + "mov r11, %[r]\n\t" + "mov r9, %[a]\n\t" + "mov r10, %[b]\n\t" + "mov r6, #48\n\t" + "add r6, r6, r9\n\t" + "mov r12, r6\n\t" + "\n1:\n\t" "mov %[r], #0\n\t" - /* A[0] * B[0] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r3, r4, r6, r7\n\t" "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[1] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[1] * B[0] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[1] * B[1] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[2] * B[0] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #12]\n\t" + "mov r6, #44\n\t" + "mov %[a], r8\n\t" + "subs %[a], %[a], r6\n\t" + "sbc r6, r6, r6\n\t" + "mvn r6, r6\n\t" + "and %[a], %[a], r6\n\t" + "mov %[b], r8\n\t" + "sub %[b], %[b], %[a]\n\t" + "add %[a], %[a], r9\n\t" + "add %[b], %[b], r10\n\t" + "\n2:\n\t" + /* Multiply Start */ + "ldr r6, [%[a]]\n\t" + "ldr r7, [%[b]]\n\t" "umull r6, r7, r6, r7\n\t" "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, %[r]\n\t" - /* A[1] * B[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[2] * B[1] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[3] * B[0] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - 
"adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[1] * B[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[2] * B[2] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[3] * B[1] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[4] * B[0] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[1] * B[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[2] * B[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[3] * B[2] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[4] * B[1] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[5] * B[0] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[1] * B[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[2] * B[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[3] * B[3] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[4] * B[2] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[5] * B[1] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[6] * B[0] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, 
r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[1] * B[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[2] * B[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[3] * B[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[4] * B[3] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[5] * B[2] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[6] * B[1] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[7] * B[0] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #0]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * B[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[2] * B[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[3] * B[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[4] * B[4] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[5] * B[3] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[6] * B[2] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[7] * B[1] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [r8, #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * B[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[3] * B[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[4] * B[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[5] * B[4] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[6] * B[3] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[7] * B[2] */ - "ldr r6, 
[%[a], #28]\n\t" - "ldr r7, [%[b], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [r8, #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * B[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[4] * B[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[5] * B[5] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[6] * B[4] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[7] * B[3] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [r8, #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * B[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[5] * B[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[6] * B[5] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[7] * B[4] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [r8, #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * B[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[6] * B[6] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[7] * B[5] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [r8, #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * B[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[7] * B[6] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [r8, #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * B[7] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "str r5, [r8, #56]\n\t" - "str r3, [r8, #60]\n\t" - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [r8, #0]\n\t" - "str r4, [r8, #4]\n\t" - "str r5, [r8, #8]\n\t" - "str r6, [r8, #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [r8, #16]\n\t" - "str r4, [r8, #20]\n\t" - "str r5, [r8, #24]\n\t" - "str r6, [r8, #28]\n\t" - "mov %[r], r8\n\t" + /* Multiply Done */ + 
"add %[a], %[a], #4\n\t" + "sub %[b], %[b], #4\n\t" + "cmp %[a], r12\n\t" + "beq 3f\n\t" + "mov r6, r8\n\t" + "add r6, r6, r9\n\t" + "cmp %[a], r6\n\t" + "ble 2b\n\t" + "\n3:\n\t" + "mov %[r], r11\n\t" + "mov r7, r8\n\t" + "str r3, [%[r], r7]\n\t" + "mov r3, r4\n\t" + "mov r4, r5\n\t" + "add r7, r7, #4\n\t" + "mov r8, r7\n\t" + "mov r6, #88\n\t" + "cmp r7, r6\n\t" + "ble 1b\n\t" + "str r3, [%[r], r7]\n\t" + "mov %[a], r9\n\t" + "mov %[b], r10\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); + + XMEMCPY(r, tmp, sizeof(tmp)); } /* Square a and put result in r. (r = a * a) @@ -5435,387 +5507,94 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) { - sp_digit tmp[8]; __asm__ __volatile__ ( - "mov r8, %[r]\n\t" + "mov r3, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r8, r3\n\t" + "mov r11, %[r]\n\t" + "mov r6, #96\n\t" + "neg r6, r6\n\t" + "add sp, sp, r6\n\t" + "mov r10, sp\n\t" + "mov r9, %[a]\n\t" + "\n1:\n\t" "mov %[r], #0\n\t" - /* A[0] * A[0] */ - "ldr r6, [%[a], #0]\n\t" - "umull r3, r4, r6, r6\n\t" + "mov r6, #44\n\t" + "mov %[a], r8\n\t" + "subs %[a], %[a], r6\n\t" + "sbc r6, r6, r6\n\t" + "mvn r6, r6\n\t" + "and %[a], %[a], r6\n\t" + "mov r2, r8\n\t" + "sub r2, r2, %[a]\n\t" + "add %[a], %[a], r9\n\t" + "add r2, r2, r9\n\t" + "\n2:\n\t" + "cmp r2, %[a]\n\t" + "beq 4f\n\t" + /* Multiply * 2: Start */ + "ldr r6, [%[a]]\n\t" + "ldr r7, [r2]\n\t" + "umull r6, r7, r6, r7\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, %[r]\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, %[r]\n\t" + /* Multiply * 2: Done */ + "bal 5f\n\t" + "\n4:\n\t" + /* Square: Start */ + "ldr r6, [%[a]]\n\t" + "umull r6, r7, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc r5, r5, %[r]\n\t" + /* Square: Done */ + "\n5:\n\t" + "add %[a], %[a], #4\n\t" + "sub r2, r2, #4\n\t" + "mov r6, #48\n\t" + "add r6, r6, r9\n\t" + "cmp %[a], r6\n\t" + "beq 3f\n\t" + "cmp %[a], r2\n\t" + "bgt 3f\n\t" + "mov r7, r8\n\t" + "add r7, r7, r9\n\t" + "cmp %[a], r7\n\t" + "ble 2b\n\t" + "\n3:\n\t" + "mov %[r], r10\n\t" + "mov r7, r8\n\t" + "str r3, [%[r], r7]\n\t" + "mov r3, r4\n\t" + "mov r4, r5\n\t" "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[1] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #4]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, 
%[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #8]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[1] * A[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[2] * A[2] */ - "ldr r6, [%[a], #8]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[1] * A[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[2] * A[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[3] * A[3] */ - "ldr r6, [%[a], #12]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[1] * A[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[2] * A[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - 
"adc r3, r3, %[r]\n\t" - /* A[3] * A[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #16]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[3] * A[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[4] * A[4] */ - "ldr r6, [%[a], #16]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [r8, #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * A[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[3] * A[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - /* A[4] * A[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #20]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [r8, #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - /* A[5] * A[5] */ - "ldr r6, [%[a], #20]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [r8, #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * A[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - /* A[5] * A[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[a], #24]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, %[r]\n\t" - "str r5, [r8, #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, 
%[r]\n\t" - /* A[6] * A[6] */ - "ldr r6, [%[a], #24]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, %[r]\n\t" - "str r3, [r8, #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * A[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" - "umull r6, r7, r6, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, %[r]\n\t" - "str r4, [r8, #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * A[7] */ - "ldr r6, [%[a], #28]\n\t" - "umull r6, r7, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "str r5, [r8, #56]\n\t" - "str r3, [r8, #60]\n\t" - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [r8, #0]\n\t" - "str r4, [r8, #4]\n\t" - "str r5, [r8, #8]\n\t" - "str r6, [r8, #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [r8, #16]\n\t" - "str r4, [r8, #20]\n\t" - "str r5, [r8, #24]\n\t" - "str r6, [r8, #28]\n\t" - "mov %[r], r8\n\t" + "add r7, r7, #4\n\t" + "mov r8, r7\n\t" + "mov r6, #88\n\t" + "cmp r7, r6\n\t" + "ble 1b\n\t" + "mov %[a], r9\n\t" + "str r3, [%[r], r7]\n\t" + "mov %[r], r11\n\t" + "mov %[a], r10\n\t" + "mov r3, #92\n\t" + "\n4:\n\t" + "ldr r6, [%[a], r3]\n\t" + "str r6, [%[r], r3]\n\t" + "subs r3, r3, #4\n\t" + "bge 4b\n\t" + "mov r6, #96\n\t" + "add sp, sp, r6\n\t" : - : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + : [r] "r" (r), [a] "r" (a) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -5825,7 +5604,7 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -5863,6 +5642,22 @@ SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, "ldr r5, [%[b], #28]\n\t" "adcs r4, r4, r5\n\t" "str r4, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[b], #32]\n\t" + "adcs r4, r4, r5\n\t" + "str r4, [%[r], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #36]\n\t" + "adcs r4, r4, r5\n\t" + "str r4, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[b], #40]\n\t" + "adcs r4, r4, r5\n\t" + "str r4, [%[r], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #44]\n\t" + "adcs r4, r4, r5\n\t" + "str r4, [%[r], #44]\n\t" "mov %[c], #0\n\t" "adc %[c], %[c], %[c]\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -5879,7 +5674,7 @@ SP_NOINLINE static sp_digit sp_3072_add_8(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -5949,6 +5744,38 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, "sbcs r4, r4, r6\n\t" "str r3, [%[a], #56]\n\t" "str r4, [%[a], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[b], #64]\n\t" + "ldr r6, [%[b], #68]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #64]\n\t" + "str r4, [%[a], #68]\n\t" + "ldr r3, [%[a], #72]\n\t" + "ldr r4, [%[a], #76]\n\t" + "ldr r5, [%[b], #72]\n\t" + "ldr r6, [%[b], #76]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #72]\n\t" + "str r4, [%[a], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[b], #80]\n\t" + "ldr r6, [%[b], #84]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #80]\n\t" + "str r4, [%[a], #84]\n\t" + "ldr r3, [%[a], #88]\n\t" + "ldr r4, [%[a], #92]\n\t" + "ldr r5, [%[b], #88]\n\t" + "ldr r6, [%[b], #92]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #88]\n\t" + "str r4, [%[a], #92]\n\t" "sbc %[c], %[c], %[c]\n\t" : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) : @@ -5964,327 +5791,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_16(sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r4, [%[a], #0]\n\t" - "ldr r5, [%[b], #0]\n\t" - "adds r4, r4, r5\n\t" - "str r4, [%[r], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #4]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[b], #8]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #12]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[b], #16]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #20]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #20]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[b], #24]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #28]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[b], #32]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #36]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #36]\n\t" - "ldr r4, [%[a], #40]\n\t" - "ldr r5, [%[b], #40]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #44]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[b], #48]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #52]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #52]\n\t" - "ldr r4, [%[a], #56]\n\t" - "ldr r5, [%[b], #56]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #60]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #60]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r4", "r5" - ); - - return c; -} - -/* AND m into each word of a and store in r. - * - * r A single precision integer. 
- * a A single precision integer. - * m Mask to AND against each digit. - */ -static void sp_3072_mask_8(sp_digit* r, sp_digit* a, sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<8; i++) - r[i] = a[i] & m; -#else - r[0] = a[0] & m; - r[1] = a[1] & m; - r[2] = a[2] & m; - r[3] = a[3] & m; - r[4] = a[4] & m; - r[5] = a[5] & m; - r[6] = a[6] & m; - r[7] = a[7] & m; -#endif -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit b1[8]; - sp_digit z2[16]; - sp_digit u, ca, cb; - - ca = sp_3072_add_8(a1, a, &a[8]); - cb = sp_3072_add_8(b1, b, &b[8]); - u = ca & cb; - sp_3072_mul_8(z1, a1, b1); - sp_3072_mul_8(z2, &a[8], &b[8]); - sp_3072_mul_8(z0, a, b); - sp_3072_mask_8(r + 16, a1, 0 - cb); - sp_3072_mask_8(b1, b1, 0 - ca); - u += sp_3072_add_8(r + 16, r + 16, b1); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_16(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z2[16]; - sp_digit z1[16]; - sp_digit a1[8]; - sp_digit u; - - u = sp_3072_add_8(a1, a, &a[8]); - sp_3072_sqr_8(z1, a1); - sp_3072_sqr_8(z2, &a[8]); - sp_3072_sqr_8(z0, a); - sp_3072_mask_8(r + 16, a1, 0 - u); - u += sp_3072_add_8(r + 16, r + 16, r + 16); - u += sp_3072_sub_in_place_16(z1, z2); - u += sp_3072_sub_in_place_16(z1, z0); - u += sp_3072_add_16(r + 8, r + 8, z1); - r[24] = u; - XMEMSET(r + 24 + 1, 0, sizeof(sp_digit) * (8 - 1)); - sp_3072_add_16(r + 16, r + 16, z2); -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r4, [%[a], #0]\n\t" - "ldr r5, [%[a], #4]\n\t" - "ldr r6, [%[b], #0]\n\t" - "ldr r7, [%[b], #4]\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" - "ldr r6, [%[b], #8]\n\t" - "ldr r7, [%[b], #12]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" - "ldr r6, [%[b], #16]\n\t" - "ldr r7, [%[b], #20]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #28]\n\t" - "ldr r6, [%[b], #24]\n\t" - "ldr r7, [%[b], #28]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #24]\n\t" - "str r5, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[a], #36]\n\t" - "ldr r6, [%[b], #32]\n\t" - "ldr r7, [%[b], #36]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #32]\n\t" - "str r5, [%[r], #36]\n\t" - "ldr r4, [%[a], #40]\n\t" - "ldr r5, [%[a], #44]\n\t" - "ldr r6, [%[b], #40]\n\t" - "ldr r7, [%[b], #44]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #40]\n\t" - "str r5, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[a], #52]\n\t" - "ldr r6, [%[b], #48]\n\t" - "ldr r7, [%[b], #52]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #48]\n\t" - "str r5, [%[r], #52]\n\t" - "ldr r4, [%[a], #56]\n\t" - "ldr r5, [%[a], #60]\n\t" - "ldr r6, [%[b], #56]\n\t" - "ldr r7, [%[b], #60]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #56]\n\t" - "str r5, [%[r], #60]\n\t" - "ldr r4, [%[a], #64]\n\t" - "ldr r5, [%[a], #68]\n\t" - "ldr r6, [%[b], #64]\n\t" - "ldr r7, [%[b], #68]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #64]\n\t" - "str r5, [%[r], #68]\n\t" - "ldr r4, [%[a], #72]\n\t" - "ldr r5, [%[a], #76]\n\t" - "ldr r6, [%[b], #72]\n\t" - "ldr r7, [%[b], #76]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #72]\n\t" - "str r5, [%[r], #76]\n\t" - "ldr r4, [%[a], #80]\n\t" - "ldr r5, [%[a], #84]\n\t" - "ldr r6, [%[b], #80]\n\t" - "ldr r7, [%[b], #84]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #80]\n\t" - "str r5, [%[r], #84]\n\t" - "ldr r4, [%[a], #88]\n\t" - "ldr r5, [%[a], #92]\n\t" - "ldr r6, [%[b], #88]\n\t" - "ldr r7, [%[b], #92]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #88]\n\t" - "str r5, [%[r], #92]\n\t" - "ldr r4, [%[a], #96]\n\t" - "ldr r5, [%[a], #100]\n\t" - "ldr r6, [%[b], #96]\n\t" - "ldr r7, [%[b], #100]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #96]\n\t" - "str r5, [%[r], #100]\n\t" - "ldr r4, [%[a], #104]\n\t" - "ldr r5, [%[a], #108]\n\t" - "ldr r6, [%[b], #104]\n\t" - "ldr r7, [%[b], #108]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #104]\n\t" - "str r5, [%[r], #108]\n\t" - "ldr r4, [%[a], #112]\n\t" - "ldr r5, [%[a], #116]\n\t" - "ldr r6, [%[b], #112]\n\t" - "ldr r7, [%[b], #116]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r7\n\t" - "str r4, [%[r], #112]\n\t" - "str r5, [%[r], #116]\n\t" - "ldr r4, [%[a], #120]\n\t" - "ldr r5, [%[a], #124]\n\t" - "ldr r6, [%[b], #120]\n\t" - "ldr r7, [%[b], #124]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, 
r7\n\t" - "str r4, [%[r], #120]\n\t" - "str r5, [%[r], #124]\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r4", "r5", "r6", "r7" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, +SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit c = 0; @@ -6386,38 +5893,6 @@ SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, "ldr r5, [%[b], #92]\n\t" "adcs r4, r4, r5\n\t" "str r4, [%[r], #92]\n\t" - "ldr r4, [%[a], #96]\n\t" - "ldr r5, [%[b], #96]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #96]\n\t" - "ldr r4, [%[a], #100]\n\t" - "ldr r5, [%[b], #100]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #100]\n\t" - "ldr r4, [%[a], #104]\n\t" - "ldr r5, [%[b], #104]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #104]\n\t" - "ldr r4, [%[a], #108]\n\t" - "ldr r5, [%[b], #108]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #108]\n\t" - "ldr r4, [%[a], #112]\n\t" - "ldr r5, [%[b], #112]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #112]\n\t" - "ldr r4, [%[a], #116]\n\t" - "ldr r5, [%[b], #116]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #116]\n\t" - "ldr r4, [%[a], #120]\n\t" - "ldr r5, [%[b], #120]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #120]\n\t" - "ldr r4, [%[a], #124]\n\t" - "ldr r5, [%[b], #124]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r], #124]\n\t" "mov %[c], #0\n\t" "adc %[c], %[c], %[c]\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -6428,96 +5903,311 @@ SP_NOINLINE static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<12; i++) + r[i] = a[i] & m; +#else + r[0] = a[0] & m; + r[1] = a[1] & m; + r[2] = a[2] & m; + r[3] = a[3] & m; + r[4] = a[4] & m; + r[5] = a[5] & m; + r[6] = a[6] & m; + r[7] = a[7] & m; + r[8] = a[8] & m; + r[9] = a[9] & m; + r[10] = a[10] & m; + r[11] = a[11] & m; +#endif +} + /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, + const sp_digit* b) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_digit b0[16]; - sp_digit b1[16]; - sp_digit b2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(b0, b, &b[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(b1, &b[16], &b[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_add_16(b2, b0, &b[32]); - sp_3072_mul_16(p0, a, b); - sp_3072_mul_16(p2, &a[16], &b[16]); - sp_3072_mul_16(p4, &a[32], &b[32]); - sp_3072_mul_16(p1, a0, b0); - sp_3072_mul_16(p3, a1, b1); - sp_3072_mul_16(p5, a2, b2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit b1[12]; + sp_digit z2[24]; + sp_digit u, ca, cb; + + ca = sp_3072_add_12(a1, a, &a[12]); + cb = sp_3072_add_12(b1, b, &b[12]); + u = ca & cb; + sp_3072_mul_12(z1, a1, b1); + sp_3072_mul_12(z2, &a[12], &b[12]); + sp_3072_mul_12(z0, a, b); + sp_3072_mask_12(r + 24, a1, 0 - cb); + sp_3072_mask_12(b1, b1, 0 - ca); + u += sp_3072_add_12(r + 24, r + 24, b1); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); } -/* Square a into r. (r = a * a) +/* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
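The new sp_3072_mul_24 replaces the previous split with one level of two-way Karatsuba: writing a = a1*2^384 + a0 and b = b1*2^384 + b0 (12 words per half), a*b = z2*2^768 + (z1 - z2 - z0)*2^384 + z0 with z0 = a0*b0, z2 = a1*b1 and z1 = (a0+a1)(b0+b1), so three 12-word products replace four. The mask/add steps only fold the carries ca and cb of the half-sums back into the top words; the sqr_24 variant in the next hunk specializes the same identity for b = a (three squarings, with a doubling of r + 24 as the middle correction). A toy demonstration of the identity on 64-bit integers, with B standing in for 2^384:

#include <stdint.h>
#include <assert.h>

int main(void)
{
    const uint64_t B = 1u << 16;             /* half-size radix */
    uint64_t a = 0x12345678u, b = 0x9abcdef0u;
    uint64_t a0 = a % B, a1 = a / B;         /* low/high halves */
    uint64_t b0 = b % B, b1 = b / B;

    uint64_t z0 = a0 * b0;                   /* sp_3072_mul_12(z0, a, b) */
    uint64_t z2 = a1 * b1;                   /* sp_3072_mul_12(z2, &a[12], &b[12]) */
    uint64_t z1 = (a0 + a1) * (b0 + b1);     /* product of the half-sums */

    /* z1 - z2 - z0 is the middle coefficient: the two
     * sp_3072_sub_in_place_24 calls in the real code. */
    assert(z2 * B * B + (z1 - z2 - z0) * B + z0 == a * b);
    return 0;
}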
*/ -SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) { - sp_digit p0[32]; - sp_digit p1[32]; - sp_digit p2[32]; - sp_digit p3[32]; - sp_digit p4[32]; - sp_digit p5[32]; - sp_digit t0[32]; - sp_digit t1[32]; - sp_digit t2[32]; - sp_digit a0[16]; - sp_digit a1[16]; - sp_digit a2[16]; - sp_3072_add_16(a0, a, &a[16]); - sp_3072_add_16(a1, &a[16], &a[32]); - sp_3072_add_16(a2, a0, &a[32]); - sp_3072_sqr_16(p0, a); - sp_3072_sqr_16(p2, &a[16]); - sp_3072_sqr_16(p4, &a[32]); - sp_3072_sqr_16(p1, a0); - sp_3072_sqr_16(p3, a1); - sp_3072_sqr_16(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2*48); - sp_3072_sub_32(t0, p3, p2); - sp_3072_sub_32(t1, p1, p2); - sp_3072_sub_32(t2, p5, t0); - sp_3072_sub_32(t2, t2, t1); - sp_3072_sub_32(t0, t0, p4); - sp_3072_sub_32(t1, t1, p0); - sp_3072_add_32(r, r, p0); - sp_3072_add_32(&r[16], &r[16], t1); - sp_3072_add_32(&r[32], &r[32], t2); - sp_3072_add_32(&r[48], &r[48], t0); - sp_3072_add_32(&r[64], &r[64], p4); + sp_digit* z0 = r; + sp_digit z2[24]; + sp_digit z1[24]; + sp_digit a1[12]; + sp_digit u; + + u = sp_3072_add_12(a1, a, &a[12]); + sp_3072_sqr_12(z1, a1); + sp_3072_sqr_12(z2, &a[12]); + sp_3072_sqr_12(z0, a); + sp_3072_mask_12(r + 24, a1, 0 - u); + u += sp_3072_add_12(r + 24, r + 24, r + 24); + u += sp_3072_sub_in_place_24(z1, z2); + u += sp_3072_sub_in_place_24(z1, z0); + u += sp_3072_add_24(r + 12, r + 12, z1); + r[36] = u; + XMEMSET(r + 36 + 1, 0, sizeof(sp_digit) * (12 - 1)); + sp_3072_add_24(r + 24, r + 24, z2); +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b], #0]\n\t" + "ldr r6, [%[b], #4]\n\t" + "subs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #0]\n\t" + "str r4, [%[a], #4]\n\t" + "ldr r3, [%[a], #8]\n\t" + "ldr r4, [%[a], #12]\n\t" + "ldr r5, [%[b], #8]\n\t" + "ldr r6, [%[b], #12]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #8]\n\t" + "str r4, [%[a], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[b], #16]\n\t" + "ldr r6, [%[b], #20]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #16]\n\t" + "str r4, [%[a], #20]\n\t" + "ldr r3, [%[a], #24]\n\t" + "ldr r4, [%[a], #28]\n\t" + "ldr r5, [%[b], #24]\n\t" + "ldr r6, [%[b], #28]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #24]\n\t" + "str r4, [%[a], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #32]\n\t" + "ldr r6, [%[b], #36]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #32]\n\t" + "str r4, [%[a], #36]\n\t" + "ldr r3, [%[a], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #40]\n\t" + "ldr r6, [%[b], #44]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #40]\n\t" + "str r4, [%[a], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[b], #48]\n\t" + "ldr r6, [%[b], #52]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #48]\n\t" + "str r4, [%[a], #52]\n\t" + "ldr r3, [%[a], #56]\n\t" + "ldr r4, [%[a], #60]\n\t" + "ldr r5, [%[b], #56]\n\t" + "ldr r6, [%[b], #60]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #56]\n\t" + "str r4, 
[%[a], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[b], #64]\n\t" + "ldr r6, [%[b], #68]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #64]\n\t" + "str r4, [%[a], #68]\n\t" + "ldr r3, [%[a], #72]\n\t" + "ldr r4, [%[a], #76]\n\t" + "ldr r5, [%[b], #72]\n\t" + "ldr r6, [%[b], #76]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #72]\n\t" + "str r4, [%[a], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[b], #80]\n\t" + "ldr r6, [%[b], #84]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #80]\n\t" + "str r4, [%[a], #84]\n\t" + "ldr r3, [%[a], #88]\n\t" + "ldr r4, [%[a], #92]\n\t" + "ldr r5, [%[b], #88]\n\t" + "ldr r6, [%[b], #92]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #88]\n\t" + "str r4, [%[a], #92]\n\t" + "ldr r3, [%[a], #96]\n\t" + "ldr r4, [%[a], #100]\n\t" + "ldr r5, [%[b], #96]\n\t" + "ldr r6, [%[b], #100]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #96]\n\t" + "str r4, [%[a], #100]\n\t" + "ldr r3, [%[a], #104]\n\t" + "ldr r4, [%[a], #108]\n\t" + "ldr r5, [%[b], #104]\n\t" + "ldr r6, [%[b], #108]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #104]\n\t" + "str r4, [%[a], #108]\n\t" + "ldr r3, [%[a], #112]\n\t" + "ldr r4, [%[a], #116]\n\t" + "ldr r5, [%[b], #112]\n\t" + "ldr r6, [%[b], #116]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #112]\n\t" + "str r4, [%[a], #116]\n\t" + "ldr r3, [%[a], #120]\n\t" + "ldr r4, [%[a], #124]\n\t" + "ldr r5, [%[b], #120]\n\t" + "ldr r6, [%[b], #124]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #120]\n\t" + "str r4, [%[a], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "mov r5, #0\n\t" + "sub r5, r5, %[c]\n\t" + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b], #0]\n\t" + "ldr r6, [%[b], #4]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #0]\n\t" + "str r4, [%[a], #4]\n\t" + "ldr r3, [%[a], #8]\n\t" + "ldr r4, [%[a], #12]\n\t" + "ldr r5, [%[b], #8]\n\t" + "ldr r6, [%[b], #12]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #8]\n\t" + "str r4, [%[a], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[b], #16]\n\t" + "ldr r6, [%[b], #20]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #16]\n\t" + "str r4, [%[a], #20]\n\t" + "ldr r3, [%[a], #24]\n\t" + "ldr r4, [%[a], #28]\n\t" + "ldr r5, [%[b], #24]\n\t" + "ldr r6, [%[b], #28]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #24]\n\t" + "str r4, [%[a], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[b], #32]\n\t" + "ldr r6, [%[b], #36]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #32]\n\t" + "str r4, [%[a], #36]\n\t" + "ldr r3, [%[a], #40]\n\t" + "ldr r4, [%[a], #44]\n\t" + "ldr r5, [%[b], #40]\n\t" + "ldr r6, [%[b], #44]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #40]\n\t" + "str r4, [%[a], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[b], #48]\n\t" + "ldr r6, [%[b], #52]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a], #48]\n\t" + "str r4, [%[a], #52]\n\t" + "ldr r3, [%[a], #56]\n\t" + "ldr r4, [%[a], #60]\n\t" + "ldr r5, [%[b], #56]\n\t" + "ldr r6, [%[b], #60]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, 
r6\n\t" + "str r3, [%[a], #56]\n\t" + "str r4, [%[a], #60]\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + + return c; } /* Add b to a into r. (r = a + b) @@ -6742,6 +6432,95 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +/* AND m into each word of a and store in r. + * + * r A single precision integer. + * a A single precision integer. + * m Mask to AND against each digit. + */ +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<24; i++) + r[i] = a[i] & m; +#else + int i; + + for (i = 0; i < 24; i += 8) { + r[i+0] = a[i+0] & m; + r[i+1] = a[i+1] & m; + r[i+2] = a[i+2] & m; + r[i+3] = a[i+3] & m; + r[i+4] = a[i+4] & m; + r[i+5] = a[i+5] & m; + r[i+6] = a[i+6] & m; + r[i+7] = a[i+7] & m; + } +#endif +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit b1[24]; + sp_digit z2[48]; + sp_digit u, ca, cb; + + ca = sp_3072_add_24(a1, a, &a[24]); + cb = sp_3072_add_24(b1, b, &b[24]); + u = ca & cb; + sp_3072_mul_24(z1, a1, b1); + sp_3072_mul_24(z2, &a[24], &b[24]); + sp_3072_mul_24(z0, a, b); + sp_3072_mask_24(r + 48, a1, 0 - cb); + sp_3072_mask_24(b1, b1, 0 - ca); + u += sp_3072_add_24(r + 48, r + 48, b1); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z2[48]; + sp_digit z1[48]; + sp_digit a1[24]; + sp_digit u; + + u = sp_3072_add_24(a1, a, &a[24]); + sp_3072_sqr_24(z1, a1); + sp_3072_sqr_24(z2, &a[24]); + sp_3072_sqr_24(z0, a); + sp_3072_mask_24(r + 48, a1, 0 - u); + u += sp_3072_add_24(r + 48, r + 48, r + 48); + u += sp_3072_sub_in_place_48(z1, z2); + u += sp_3072_sub_in_place_48(z1, z0); + u += sp_3072_add_48(r + 24, r + 24, z1); + r[72] = u; + XMEMSET(r + 72 + 1, 0, sizeof(sp_digit) * (24 - 1)); + sp_3072_add_48(r + 48, r + 48, z2); +} + /* Sub b from a into r. (r = a - b) * * r A single precision integer. @@ -7583,7 +7362,7 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -7944,7 +7723,7 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { int i; @@ -7991,6 +7770,44 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, return c; } +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into a. (a -= b) + * + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + __asm__ __volatile__ ( + "mov r7, %[a]\n\t" + "add r7, r7, #192\n\t" + "\n1:\n\t" + "mov r5, #0\n\t" + "subs r5, r5, %[c]\n\t" + "ldr r3, [%[a]]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[b]]\n\t" + "ldr r6, [%[b], #4]\n\t" + "sbcs r3, r3, r5\n\t" + "sbcs r4, r4, r6\n\t" + "str r3, [%[a]]\n\t" + "str r4, [%[a], #4]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #8\n\t" + "add %[b], %[b], #8\n\t" + "cmp %[a], r7\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); + + return c; +} + #endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) @@ -8238,263 +8055,6 @@ SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, } #if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into a. (a -= b) - * - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r7, %[a]\n\t" - "add r7, r7, #192\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r7\n\t" - "bne 1b\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r7" - ); - - return c; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
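The WOLFSSL_SP_SMALL variant above trades the fully unrolled subtract for a two-words-per-iteration loop, threading the borrow through the flags ("subs r5, r5, %[c]" re-arms the carry flag from the saved borrow word before each pair of sbcs). What it computes, as a minimal portable sketch with illustrative names:

#include <stdint.h>

/* In-place multi-word subtract; returns 0 or 0xffffffff, matching
 * the asm's final "sbc %[c], %[c], %[c]". */
static uint32_t sub_in_place(uint32_t* a, const uint32_t* b, int words)
{
    uint32_t borrow = 0;           /* 1 when the previous word wrapped */
    int i;
    for (i = 0; i < words; i++) {
        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
        a[i] = (uint32_t)t;
        borrow = (uint32_t)((t >> 32) & 1);
    }
    return 0 - borrow;
}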
- */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #0]\n\t" - "ldr r6, [%[b], #4]\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #0]\n\t" - "str r4, [%[a], #4]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #8]\n\t" - "ldr r6, [%[b], #12]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #8]\n\t" - "str r4, [%[a], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #16]\n\t" - "ldr r6, [%[b], #20]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #16]\n\t" - "str r4, [%[a], #20]\n\t" - "ldr r3, [%[a], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #24]\n\t" - "ldr r6, [%[b], #28]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #24]\n\t" - "str r4, [%[a], #28]\n\t" - "ldr r3, [%[a], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #32]\n\t" - "ldr r6, [%[b], #36]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #32]\n\t" - "str r4, [%[a], #36]\n\t" - "ldr r3, [%[a], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #40]\n\t" - "ldr r6, [%[b], #44]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #40]\n\t" - "str r4, [%[a], #44]\n\t" - "ldr r3, [%[a], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #48]\n\t" - "ldr r6, [%[b], #52]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #48]\n\t" - "str r4, [%[a], #52]\n\t" - "ldr r3, [%[a], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #56]\n\t" - "ldr r6, [%[b], #60]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #56]\n\t" - "str r4, [%[a], #60]\n\t" - "ldr r3, [%[a], #64]\n\t" - "ldr r4, [%[a], #68]\n\t" - "ldr r5, [%[b], #64]\n\t" - "ldr r6, [%[b], #68]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #64]\n\t" - "str r4, [%[a], #68]\n\t" - "ldr r3, [%[a], #72]\n\t" - "ldr r4, [%[a], #76]\n\t" - "ldr r5, [%[b], #72]\n\t" - "ldr r6, [%[b], #76]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #72]\n\t" - "str r4, [%[a], #76]\n\t" - "ldr r3, [%[a], #80]\n\t" - "ldr r4, [%[a], #84]\n\t" - "ldr r5, [%[b], #80]\n\t" - "ldr r6, [%[b], #84]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #80]\n\t" - "str r4, [%[a], #84]\n\t" - "ldr r3, [%[a], #88]\n\t" - "ldr r4, [%[a], #92]\n\t" - "ldr r5, [%[b], #88]\n\t" - "ldr r6, [%[b], #92]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #88]\n\t" - "str r4, [%[a], #92]\n\t" - "ldr r3, [%[a], #96]\n\t" - "ldr r4, [%[a], #100]\n\t" - "ldr r5, [%[b], #96]\n\t" - "ldr r6, [%[b], #100]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #96]\n\t" - "str r4, [%[a], #100]\n\t" - "ldr r3, [%[a], #104]\n\t" - "ldr r4, [%[a], #108]\n\t" - "ldr r5, [%[b], #104]\n\t" - "ldr r6, [%[b], #108]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #104]\n\t" - "str r4, [%[a], #108]\n\t" - "ldr r3, [%[a], #112]\n\t" - "ldr r4, [%[a], #116]\n\t" - "ldr r5, [%[b], #112]\n\t" - "ldr r6, [%[b], #116]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #112]\n\t" - "str r4, [%[a], #116]\n\t" - "ldr r3, [%[a], #120]\n\t" - "ldr r4, [%[a], #124]\n\t" - "ldr r5, [%[b], #120]\n\t" - "ldr r6, [%[b], #124]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - 
"str r3, [%[a], #120]\n\t" - "str r4, [%[a], #124]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #0x80\n\t" - "add %[b], %[b], #0x80\n\t" - "mov r5, #0\n\t" - "sub r5, r5, %[c]\n\t" - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b], #0]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #0]\n\t" - "str r4, [%[a], #4]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[b], #8]\n\t" - "ldr r6, [%[b], #12]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #8]\n\t" - "str r4, [%[a], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[b], #16]\n\t" - "ldr r6, [%[b], #20]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #16]\n\t" - "str r4, [%[a], #20]\n\t" - "ldr r3, [%[a], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[b], #24]\n\t" - "ldr r6, [%[b], #28]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #24]\n\t" - "str r4, [%[a], #28]\n\t" - "ldr r3, [%[a], #32]\n\t" - "ldr r4, [%[a], #36]\n\t" - "ldr r5, [%[b], #32]\n\t" - "ldr r6, [%[b], #36]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #32]\n\t" - "str r4, [%[a], #36]\n\t" - "ldr r3, [%[a], #40]\n\t" - "ldr r4, [%[a], #44]\n\t" - "ldr r5, [%[b], #40]\n\t" - "ldr r6, [%[b], #44]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #40]\n\t" - "str r4, [%[a], #44]\n\t" - "ldr r3, [%[a], #48]\n\t" - "ldr r4, [%[a], #52]\n\t" - "ldr r5, [%[b], #48]\n\t" - "ldr r6, [%[b], #52]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #48]\n\t" - "str r4, [%[a], #52]\n\t" - "ldr r3, [%[a], #56]\n\t" - "ldr r4, [%[a], #60]\n\t" - "ldr r5, [%[b], #56]\n\t" - "ldr r6, [%[b], #60]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a], #56]\n\t" - "str r4, [%[a], #60]\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6" - ); - - return c; -} - -#endif /* WOLFSSL_SP_SMALL */ /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. 
* @@ -8739,14 +8299,14 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" @@ -8931,9 +8491,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -8963,10 +8526,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -9074,9 +8633,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -9107,10 +8669,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -9340,14 +8898,14 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" @@ -9369,7 +8927,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. 
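The two mod_exp hunks above fix the window start-up: the old code hard-coded y = n >> 28 / c = 28 (or 27 for the 5-bit window), which is only correct when the exponent exactly fills its top 32-bit word. The replacement derives the first window from bits, so that after an initial window of bits % 4 (or bits % 5) bits every remaining window is full; that is also why the leftover-bits fix-up after the loop (the removed "y = e[0] & ((1 << c) - 1)" block) becomes dead code and is deleted. The "lsr 16" -> "lsr #16" changes in div_3072_word_48/_96 are assembler-syntax corrections for the immediate shift. A hedged sketch of the top-window extraction, with an explicit guard so a full top word never shifts by 32 (illustrative helper, not the library code):

#include <stdint.h>

/* First window of a left-to-right w-bit walk over an exponent of
 * `bits` bits stored little-endian in 32-bit words. */
static uint32_t top_window(const uint32_t* e, int bits, int w)
{
    int i = (bits - 1) / 32;      /* top word of the exponent */
    int c = bits % 32;            /* valid bits in that word */
    if (c == 0)
        c = 32;
    c -= bits % w;                /* bits left below the first window */
    if (bits % w == 0)
        return 0;                 /* empty first window: start from 1 */
    return e[i] >> c;             /* the top bits % w bits */
}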
*/ -static void sp_3072_mask_96(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -9616,9 +9174,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 28; - n <<= 4; - c = 28; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 4; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=4; ) { if (c == 0) { @@ -9648,10 +9209,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -9759,9 +9316,12 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 32; n = e[i--]; - y = n >> 27; - n <<= 5; - c = 27; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 96); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -9792,10 +9352,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_96(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_mul_96(r, r, t[y], m, mp); XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); sp_3072_mont_reduce_96(r, m, mp); @@ -10108,7 +9664,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 96; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -10165,6 +9721,711 @@ int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) } #ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +static void sp_3072_lshift_96(sp_digit* r, sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #31\n\t" + "sub r6, r6, %[n]\n\t" + "add %[a], %[a], #320\n\t" + "add %[r], %[r], #320\n\t" + "ldr r3, [%[a], #60]\n\t" + "lsr r4, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r4, r4, r6\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr 
r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr 
r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r3, [%[a], #60]\n\t" + "str r2, [%[r], #68]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #56]\n\t" + "str r4, [%[r], #64]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #52]\n\t" + "str r3, [%[r], #60]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #48]\n\t" + "str r2, [%[r], #56]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #44]\n\t" + "str r4, [%[r], #52]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #40]\n\t" + "str r3, [%[r], #48]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #36]\n\t" + "str r2, [%[r], #44]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #32]\n\t" + "str r4, [%[r], #40]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, 
[%[a], #28]\n\t" + "str r3, [%[r], #36]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #24]\n\t" + "str r2, [%[r], #32]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #20]\n\t" + "str r4, [%[r], #28]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #16]\n\t" + "str r3, [%[r], #24]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #12]\n\t" + "str r2, [%[r], #20]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #8]\n\t" + "str r4, [%[r], #16]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #4]\n\t" + "str r3, [%[r], #12]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #0]\n\t" + "str r2, [%[r], #8]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r4, [%[r], #68]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r3, [%[r], #64]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r2, [%[r], #60]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r4, [%[r], #56]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r3, [%[r], #52]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r2, [%[r], #48]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r4, [%[r], #44]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r3, [%[r], #40]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r2, [%[r], #36]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r4, [%[r], #32]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r3, [%[r], #28]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r2, [%[r], #24]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r4, [%[r], #20]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r3, [%[r], #16]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #4]\n\t" + "str r2, [%[r], #12]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #0]\n\t" + "str r4, [%[r], #8]\n\t" + "lsr r5, 
r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "sub %[a], %[a], #64\n\t" + "sub %[r], %[r], #64\n\t" + "ldr r4, [%[a], #60]\n\t" + "str r3, [%[r], #68]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #56]\n\t" + "str r2, [%[r], #64]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #52]\n\t" + "str r4, [%[r], #60]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #48]\n\t" + "str r3, [%[r], #56]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #44]\n\t" + "str r2, [%[r], #52]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #40]\n\t" + "str r4, [%[r], #48]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #36]\n\t" + "str r3, [%[r], #44]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #32]\n\t" + "str r2, [%[r], #40]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #28]\n\t" + "str r4, [%[r], #36]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #24]\n\t" + "str r3, [%[r], #32]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #20]\n\t" + "str r2, [%[r], #28]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #16]\n\t" + "str r4, [%[r], #24]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #12]\n\t" + "str r3, [%[r], #20]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "ldr r3, [%[a], #8]\n\t" + "str r2, [%[r], #16]\n\t" + "lsr r5, r3, #1\n\t" + "lsl r3, r3, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r4, r4, r5\n\t" + "ldr r2, [%[a], #4]\n\t" + "str r4, [%[r], #12]\n\t" + "lsr r5, r2, #1\n\t" + "lsl r2, r2, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r3, r3, r5\n\t" + "ldr r4, [%[a], #0]\n\t" + "str r3, [%[r], #8]\n\t" + "lsr r5, r4, #1\n\t" + "lsl r4, r4, %[n]\n\t" + "lsr r5, r5, r6\n\t" + "orr r2, r2, r5\n\t" + "str r4, [%[r]]\n\t" + "str r2, [%[r], #4]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_2_96(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[192]; + sp_digit td[97]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 289, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 192; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) + c = 32; + c -= bits % 5; + y = n >> c; + n <<= 32 - c; + sp_3072_lshift_96(r, norm, y); + for (; i>=0 || c>=5; ) { + if (c == 0) { + n = e[i--]; + y = n >> 27; + n <<= 5; + c = 27; + } + else if (c < 5) { + y = n >> 27; + n = e[i--]; + c = 5 - c; + y |= n >> (32 - c); + n <<= c; + c = 32 - c; + } + else { + y = (n >> 27) & 0x1f; + n <<= 5; + c -= 5; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_lshift_96(r, r, y); + sp_3072_mul_d_96(tmp, norm, r[96]); + r[96] = 0; + o = sp_3072_add_96(r, r, tmp); + sp_3072_cond_sub_96(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
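The window scan above is easier to follow at machine-word scale. The sketch below is illustrative only and not part of the patch (it leans on the GCC/Clang unsigned __int128 extension for the modular multiply): it runs the same left-to-right 5-bit window scan, peeling a first window of bits % 5 bits, then for each remaining window squaring five times and multiplying in 2^y, which is just a shift followed by a reduction.

#include <stdint.h>
#include <stdio.h>

/* (a * b) mod m without overflow; __int128 is a GCC/Clang extension. */
static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((unsigned __int128)a * b) % m);
}

/* r = 2^e mod m using 5-bit left-to-right windows, like the code above:
 * the first window holds bits % 5 bits (or 5 when that is 0), then each
 * step squares five times and multiplies by 2^y with a shift. */
static uint64_t exp2_mod(uint64_t e, uint64_t m)
{
    int bits = 64;
    uint64_t r;

    while (bits > 1 && ((e >> (bits - 1)) & 1) == 0)
        bits--;                               /* number of significant bits */

    int c = bits % 5;                         /* size of the first window */
    if (c == 0)
        c = 5;
    r = ((uint64_t)1 << ((e >> (bits - c)) & 0x1f)) % m;   /* top window */
    bits -= c;

    while (bits > 0) {
        int y = (int)((e >> (bits - 5)) & 0x1f);  /* next 5 exponent bits */
        for (int i = 0; i < 5; i++)
            r = mulmod(r, r, m);                  /* r = r^32 */
        r = mulmod(r, (uint64_t)1 << y, m);       /* r = r * 2^y mod m */
        bits -= 5;
    }
    return r;
}

int main(void)
{
    /* 2^31 = 1 (mod 2^31 - 1) and 1000 = 32*31 + 8, so expect 2^8 = 256. */
    printf("%llu\n", (unsigned long long)exp2_mod(1000, 0x7fffffffULL));
    return 0;
}

The real routine does the same thing on 96 32-bit words: sp_3072_lshift_96() provides the shift, and the single-word multiply of the Montgomery normalizer by the overflow word r[96], followed by an add and a conditional subtract, folds the shifted-out bits back in.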
@@ -10195,7 +10456,13 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen, sp_3072_from_bin(e, 96, exp, expLen); sp_3072_from_mp(m, 96, mod); - err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2 && m[95] == (sp_digit)-1) + err = sp_3072_mod_exp_2_96(r, e, expLen * 8, m); + else + #endif + err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0); + } if (err == MP_OKAY) { @@ -10460,14 +10727,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a) s = 32 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 32 <= DIGIT_BIT) { s += 32; r[j] &= 0xffffffff; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -10561,7 +10828,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 8; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 32 >= DIGIT_BIT) { - #if DIGIT_BIT < 32 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -15859,14 +16126,14 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" "subs %[d0], %[d0], r4\n\t" "sbc %[d1], %[d1], r5\n\t" "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr 16\n\t" + "orr r4, r4, %[d0], lsr #16\n\t" "udiv r4, r4, r6\n\t" "add r7, r7, r4\n\t" "umull r4, r5, %[div], r4\n\t" @@ -15888,7 +16155,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_256_mask_8(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index f7d716501..d0fbedda8 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -102,14 +102,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a) s = 64 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 64 <= DIGIT_BIT) { s += 64; r[j] &= 0xffffffffffffffffl; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -190,7 +190,7 @@ extern sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -580,9 +580,12 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 16); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -613,10 +616,6 @@ static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_16(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_16(r, r, m, mp); - sp_2048_mont_mul_16(r, r, t[y], m, mp); XMEMSET(&r[16], 0, sizeof(sp_digit) * 16); sp_2048_mont_reduce_16(r, m, mp); @@ -760,9 +759,12 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 16); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -793,10 +795,6 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_avx2_16(r, r, m, mp); - sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp); XMEMSET(&r[16], 0, sizeof(sp_digit) * 16); sp_2048_mont_reduce_avx2_16(r, m, mp); @@ -891,7 +889,7 @@ static WC_INLINE sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -1125,9 +1123,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -1158,10 +1159,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_mul_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_32(r, m, mp); @@ -1307,9 +1304,12 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 32); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -1340,10 +1340,6 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, sp_digit* a, sp_digit* e, sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_2048_mont_sqr_avx2_32(r, r, m, mp); - sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp); XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_avx2_32(r, m, mp); @@ -1711,7 +1707,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 32; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 64 >= DIGIT_BIT) { - #if DIGIT_BIT < 64 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -1775,6 +1771,220 @@ int 
sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) return err; } +#ifdef HAVE_FFDHE_2048 +extern void sp_2048_lshift_32(sp_digit* r, const sp_digit* a, int n); +#ifdef HAVE_INTEL_AVX2 +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_2048_mod_exp_2_avx2_32(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[64]; + sp_digit td[33]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 97, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 64; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_32(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_2048_lshift_32(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + sp_2048_mont_sqr_avx2_32(r, r, m, mp); + + sp_2048_lshift_32(r, r, y); + sp_2048_mul_d_avx2_32(tmp, norm, r[32]); + r[32] = 0; + o = sp_2048_add_32(r, r, tmp); + sp_2048_cond_sub_32(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); + sp_2048_mont_reduce_avx2_32(r, m, mp); + + mask = 0 - (sp_2048_cmp_32(r, m) >= 0); + sp_2048_cond_sub_32(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_INTEL_AVX2 */ + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_2_32(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[64]; + sp_digit td[33]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 97, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 64; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_32(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_2048_lshift_32(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + + sp_2048_lshift_32(r, r, y); + sp_2048_mul_d_32(tmp, norm, r[32]); + r[32] = 0; + o = sp_2048_add_32(r, r, tmp); + sp_2048_cond_sub_32(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); + sp_2048_mont_reduce_32(r, m, mp); + + mask = 0 - (sp_2048_cmp_32(r, m) >= 0); + sp_2048_cond_sub_32(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_2048 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. @@ -1808,12 +2018,25 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen, sp_2048_from_bin(e, 32, exp, expLen); sp_2048_from_mp(m, 32, mod); + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2 && m[31] == (sp_digit)-1) { #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) - err = sp_2048_mod_exp_avx2_32(r, b, e, expLen * 8, m, 0); - else + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) + err = sp_2048_mod_exp_2_avx2_32(r, e, expLen * 8, m); + else #endif - err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0); + err = sp_2048_mod_exp_2_32(r, e, expLen * 8, m); + } + else + #endif + { +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) + err = sp_2048_mod_exp_avx2_32(r, b, e, expLen * 8, m, 0); + else +#endif + err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0); + } } if (err == MP_OKAY) { @@ -1934,14 +2157,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a) s = 64 - s; if (j + 1 >= max) break; - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); while (s + 64 <= DIGIT_BIT) { s += 64; r[j] &= 0xffffffffffffffffl; if (j + 1 >= max) break; if (s < DIGIT_BIT) - r[++j] = a->dp[i] >> s; + r[++j] = (sp_digit)(a->dp[i] >> s); else r[++j] = 0; } @@ -2022,7 +2245,7 @@ extern sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b * a A single precision integer. * m Mask to AND against each digit. 
*/ -static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2412,9 +2635,12 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 24); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -2445,10 +2671,6 @@ static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_24(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_24(r, r, m, mp); - sp_3072_mont_mul_24(r, r, t[y], m, mp); XMEMSET(&r[24], 0, sizeof(sp_digit) * 24); sp_3072_mont_reduce_24(r, m, mp); @@ -2592,9 +2814,12 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 24); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -2625,10 +2850,6 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_avx2_24(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_avx2_24(r, r, m, mp); - sp_3072_mont_mul_avx2_24(r, r, t[y], m, mp); XMEMSET(&r[24], 0, sizeof(sp_digit) * 24); sp_3072_mont_reduce_avx2_24(r, m, mp); @@ -2723,7 +2944,7 @@ static WC_INLINE sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, * a A single precision integer. * m Mask to AND against each digit. */ -static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m) +static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; @@ -2957,9 +3178,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -2990,10 +3214,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_mul_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_48(r, m, mp); @@ -3139,9 +3359,12 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, sp_digit* a, sp_digit* e, i = (bits - 1) / 64; n = e[i--]; - y = n >> 59; - n <<= 5; - c = 59; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 5; + y = n >> c; + n <<= 64 - c; XMEMCPY(r, t[y], sizeof(sp_digit) * 48); for (; i>=0 || c>=5; ) { if (c == 0) { @@ -3172,10 +3395,6 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, sp_digit* a, sp_digit* e, sp_3072_mont_mul_avx2_48(r, r, t[y], m, mp); } - y = e[0] & ((1 << c) - 1); - for (; c > 0; c--) - sp_3072_mont_sqr_avx2_48(r, r, m, mp); - sp_3072_mont_mul_avx2_48(r, r, t[y], m, mp); XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_avx2_48(r, m, mp); @@ -3543,7 +3762,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r) for (i = 0; i < 48; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; if (s + 64 >= DIGIT_BIT) { - #if DIGIT_BIT < 64 + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= (1l << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; @@ -3607,6 +3826,220 @@ int 
sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) return err; } +#ifdef HAVE_FFDHE_3072 +extern void sp_3072_lshift_48(sp_digit* r, const sp_digit* a, int n); +#ifdef HAVE_INTEL_AVX2 +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_2_avx2_48(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[96]; + sp_digit td[49]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 145, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 96; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_48(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_3072_lshift_48(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + sp_3072_mont_sqr_avx2_48(r, r, m, mp); + + sp_3072_lshift_48(r, r, y); + sp_3072_mul_d_avx2_48(tmp, norm, r[48]); + r[48] = 0; + o = sp_3072_add_48(r, r, tmp); + sp_3072_cond_sub_48(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); + sp_3072_mont_reduce_avx2_48(r, m, mp); + + mask = 0 - (sp_3072_cmp_48(r, m) >= 0); + sp_3072_cond_sub_48(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#endif /* HAVE_INTEL_AVX2 */ + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_2_48(sp_digit* r, sp_digit* e, int bits, + sp_digit* m) +{ +#ifndef WOLFSSL_SMALL_STACK + sp_digit nd[96]; + sp_digit td[49]; +#else + sp_digit* td; +#endif + sp_digit* norm; + sp_digit* tmp; + sp_digit mp = 1; + sp_digit n, o; + sp_digit mask; + int i; + int c, y; + int err = MP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 145, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; + + if (err == MP_OKAY) { + norm = td; + tmp = td + 96; + } +#else + norm = nd; + tmp = td; +#endif + + if (err == MP_OKAY) { + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_48(norm, m); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) + c = 64; + c -= bits % 6; + y = n >> c; + n <<= 64 - c; + sp_3072_lshift_48(r, norm, y); + for (; i>=0 || c>=6; ) { + if (c == 0) { + n = e[i--]; + y = n >> 58; + n <<= 6; + c = 58; + } + else if (c < 6) { + y = n >> 58; + n = e[i--]; + c = 6 - c; + y |= n >> (64 - c); + n <<= c; + c = 64 - c; + } + else { + y = (n >> 58) & 0x3f; + n <<= 6; + c -= 6; + } + + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + + sp_3072_lshift_48(r, r, y); + sp_3072_mul_d_48(tmp, norm, r[48]); + r[48] = 0; + o = sp_3072_add_48(r, r, tmp); + sp_3072_cond_sub_48(r, r, m, (sp_digit)0 - o); + } + + XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); + sp_3072_mont_reduce_48(r, m, mp); + + mask = 0 - (sp_3072_cmp_48(r, m) >= 0); + sp_3072_cond_sub_48(r, r, m, mask); + } + +#ifdef WOLFSSL_SMALL_STACK + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_3072 */ + /* Perform the modular exponentiation for Diffie-Hellman. * * base Base. 
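The callers below enable this path only when the base is exactly 2 and the most significant word of the modulus is all ones, which is how the RFC 7919 FFDHE primes present themselves (their top 64 bits are all set). A minimal sketch of that gate, assuming 64-bit digits; looks_like_ffdhe_base2() is a hypothetical helper, not a wolfSSL function:

#include <stdint.h>

/* Illustrative only: mirrors the test in sp_DhExp_2048()/sp_DhExp_3072().
 * The RFC 7919 primes have their 64 most significant bits set, so a single
 * word compare is enough once the base is known to be 2. */
static int looks_like_ffdhe_base2(int base_used, uint64_t base_word0,
                                  const uint64_t* m, int words)
{
    return base_used == 1 && base_word0 == 2 &&
           m[words - 1] == (uint64_t)-1;
}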
@@ -3640,12 +4073,25 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
         sp_3072_from_bin(e, 48, exp, expLen);
         sp_3072_from_mp(m, 48, mod);
 
+    #ifdef HAVE_FFDHE_3072
+        if (base->used == 1 && base->dp[0] == 2 && m[47] == (sp_digit)-1) {
 #ifdef HAVE_INTEL_AVX2
-        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
-            err = sp_3072_mod_exp_avx2_48(r, b, e, expLen * 8, m, 0);
-        else
+            if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
+                err = sp_3072_mod_exp_2_avx2_48(r, e, expLen * 8, m);
+            else
 #endif
-            err = sp_3072_mod_exp_48(r, b, e, expLen * 8, m, 0);
+                err = sp_3072_mod_exp_2_48(r, e, expLen * 8, m);
+        }
+        else
+    #endif
+        {
+#ifdef HAVE_INTEL_AVX2
+            if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
+                err = sp_3072_mod_exp_avx2_48(r, b, e, expLen * 8, m, 0);
+            else
+#endif
+            err = sp_3072_mod_exp_48(r, b, e, expLen * 8, m, 0);
+        }
     }
 
     if (err == MP_OKAY) {
@@ -3911,14 +4357,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a)
             s = 64 - s;
             if (j + 1 >= max)
                 break;
-            r[++j] = a->dp[i] >> s;
+            r[++j] = (sp_digit)(a->dp[i] >> s);
             while (s + 64 <= DIGIT_BIT) {
                 s += 64;
                 r[j] &= 0xffffffffffffffffl;
                 if (j + 1 >= max)
                     break;
                 if (s < DIGIT_BIT)
-                    r[++j] = a->dp[i] >> s;
+                    r[++j] = (sp_digit)(a->dp[i] >> s);
                 else
                     r[++j] = 0;
             }
@@ -4012,7 +4458,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 4; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 64 >= DIGIT_BIT) {
-            #if DIGIT_BIT < 64
+            #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
             #endif
                 s = DIGIT_BIT - s;
@@ -19921,7 +20367,7 @@ static WC_INLINE sp_digit div_256_word_4(sp_digit d1, sp_digit d0,
  * a A single precision integer.
  * m Mask to AND against each digit.
  */
-static void sp_256_mask_4(sp_digit* r, sp_digit* a, sp_digit m)
+static void sp_256_mask_4(sp_digit* r, const sp_digit* a, sp_digit m)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
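The new sp_2048_lshift_32 and sp_3072_lshift_48 routines in the assembly below implement the wide left shift used by the 2^e path: shldq %cl, src, dst shifts dst left by %cl bits while filling the vacated bits from src, and the routine works from the most significant word down so the shift is safe in place. A portable sketch of the 32-word variant (an assumption-level illustration, not part of the patch):

#include <stdint.h>

/* r = a << n for a 32-word (2048-bit) value, writing a 33-word result;
 * word 32 catches the bits shifted out of the top, just as %r10 does in
 * the assembly.  Requires 0 <= n < 64; safe when r == a because it works
 * from the top word down. */
static void lshift_32_words(uint64_t* r, const uint64_t* a, unsigned n)
{
    int i;

    if (n == 0) {               /* avoid the undefined shift by 64 below */
        r[32] = 0;
        for (i = 31; i >= 0; i--)
            r[i] = a[i];
        return;
    }
    r[32] = a[31] >> (64 - n);
    for (i = 31; i > 0; i--)
        r[i] = (a[i] << n) | (a[i - 1] >> (64 - n));
    r[0] = a[0] << n;
}

The caller then multiplies the Montgomery normalizer by the overflow word, zeroes it, adds the product back in, and conditionally subtracts the modulus, exactly as in the sp_*_mod_exp_2_* functions above.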
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index fa93f5402..26249845b 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -8474,6 +8474,123 @@ L_mont_loop_avx2_32:
 .size sp_2048_mont_reduce_avx2_32,.-sp_2048_mont_reduce_avx2_32
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
+/* Shift number left by n bits. (r = a << n)
+ *
+ * r Result of left shift by n.
+ * a Number to shift.
+ * n Amount to shift.
+ */
+#ifndef __APPLE__
+.globl sp_2048_lshift_32
+.type sp_2048_lshift_32,@function
+.align 16
+sp_2048_lshift_32:
+#else
+.globl _sp_2048_lshift_32
+.p2align 4
+_sp_2048_lshift_32:
+#endif /* __APPLE__ */
+        movq %rdx, %rcx
+        movq $0, %r10
+        movq 216(%rsi), %r11
+        movq 224(%rsi), %rdx
+        movq 232(%rsi), %rax
+        movq 240(%rsi), %r8
+        movq 248(%rsi), %r9
+        shldq %cl, %r9, %r10
+        shldq %cl, %r8, %r9
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r11, %rdx
+        movq %rdx, 224(%rdi)
+        movq %rax, 232(%rdi)
+        movq %r8, 240(%rdi)
+        movq %r9, 248(%rdi)
+        movq %r10, 256(%rdi)
+        movq 184(%rsi), %r9
+        movq 192(%rsi), %rdx
+        movq 200(%rsi), %rax
+        movq 208(%rsi), %r8
+        shldq %cl, %r8, %r11
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r9, %rdx
+        movq %rdx, 192(%rdi)
+        movq %rax, 200(%rdi)
+        movq %r8, 208(%rdi)
+        movq %r11, 216(%rdi)
+        movq 152(%rsi), %r11
+        movq 160(%rsi), %rdx
+        movq 168(%rsi), %rax
+        movq 176(%rsi), %r8
+        shldq %cl, %r8, %r9
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r11, %rdx
+        movq %rdx, 160(%rdi)
+        movq %rax, 168(%rdi)
+        movq %r8, 176(%rdi)
+        movq %r9, 184(%rdi)
+        movq 120(%rsi), %r9
+        movq 128(%rsi), %rdx
+        movq 136(%rsi), %rax
+        movq 144(%rsi), %r8
+        shldq %cl, %r8, %r11
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r9, %rdx
+        movq %rdx, 128(%rdi)
+        movq %rax, 136(%rdi)
+        movq %r8, 144(%rdi)
+        movq %r11, 152(%rdi)
+        movq 88(%rsi), %r11
+        movq 96(%rsi), %rdx
+        movq 104(%rsi), %rax
+        movq 112(%rsi), %r8
+        shldq %cl, %r8, %r9
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r11, %rdx
+        movq %rdx, 96(%rdi)
+        movq %rax, 104(%rdi)
+        movq %r8, 112(%rdi)
+        movq %r9, 120(%rdi)
+        movq 56(%rsi), %r9
+        movq 64(%rsi), %rdx
+        movq 72(%rsi), %rax
+        movq 80(%rsi), %r8
+        shldq %cl, %r8, %r11
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r9, %rdx
+        movq %rdx, 64(%rdi)
+        movq %rax, 72(%rdi)
+        movq %r8, 80(%rdi)
+        movq %r11, 88(%rdi)
+        movq 24(%rsi), %r11
+        movq 32(%rsi), %rdx
+        movq 40(%rsi), %rax
+        movq 48(%rsi), %r8
+        shldq %cl, %r8, %r9
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shldq %cl, %r11, %rdx
+        movq %rdx, 32(%rdi)
+        movq %rax, 40(%rdi)
+        movq %r8, 48(%rdi)
+        movq %r9, 56(%rdi)
+        movq (%rsi), %rdx
+        movq 8(%rsi), %rax
+        movq 16(%rsi), %r8
+        shldq %cl, %r8, %r11
+        shldq %cl, %rax, %r8
+        shldq %cl, %rdx, %rax
+        shlq %cl, %rdx
+        movq %rdx, (%rdi)
+        movq %rax, 8(%rdi)
+        movq %r8, 16(%rdi)
+        movq %r11, 24(%rdi)
+        repz retq
 /* Multiply a and b into r. (r = a * b)
  *
  * r A single precision integer.
@@ -24330,6 +24447,171 @@ L_mont_loop_avx2_48:
 .size sp_3072_mont_reduce_avx2_48,.-sp_3072_mont_reduce_avx2_48
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
+/* Shift number left by n bits. (r = a << n)
+ *
+ * r Result of left shift by n.
+ * a Number to shift.
+ * n Amount to shift.
+ */ +#ifndef __APPLE__ +.globl sp_3072_lshift_48 +.type sp_3072_lshift_48,@function +.align 16 +sp_3072_lshift_48: +#else +.globl _sp_3072_lshift_48 +.p2align 4 +_sp_3072_lshift_48: +#endif /* __APPLE__ */ + movq %rdx, %rcx + movq $0, %r10 + movq 344(%rsi), %r11 + movq 352(%rsi), %rdx + movq 360(%rsi), %rax + movq 368(%rsi), %r8 + movq 376(%rsi), %r9 + shldq %cl, %r9, %r10 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 352(%rdi) + movq %rax, 360(%rdi) + movq %r8, 368(%rdi) + movq %r9, 376(%rdi) + movq %r10, 384(%rdi) + movq 312(%rsi), %r9 + movq 320(%rsi), %rdx + movq 328(%rsi), %rax + movq 336(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r9, %rdx + movq %rdx, 320(%rdi) + movq %rax, 328(%rdi) + movq %r8, 336(%rdi) + movq %r11, 344(%rdi) + movq 280(%rsi), %r11 + movq 288(%rsi), %rdx + movq 296(%rsi), %rax + movq 304(%rsi), %r8 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 288(%rdi) + movq %rax, 296(%rdi) + movq %r8, 304(%rdi) + movq %r9, 312(%rdi) + movq 248(%rsi), %r9 + movq 256(%rsi), %rdx + movq 264(%rsi), %rax + movq 272(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r9, %rdx + movq %rdx, 256(%rdi) + movq %rax, 264(%rdi) + movq %r8, 272(%rdi) + movq %r11, 280(%rdi) + movq 216(%rsi), %r11 + movq 224(%rsi), %rdx + movq 232(%rsi), %rax + movq 240(%rsi), %r8 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 224(%rdi) + movq %rax, 232(%rdi) + movq %r8, 240(%rdi) + movq %r9, 248(%rdi) + movq 184(%rsi), %r9 + movq 192(%rsi), %rdx + movq 200(%rsi), %rax + movq 208(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r9, %rdx + movq %rdx, 192(%rdi) + movq %rax, 200(%rdi) + movq %r8, 208(%rdi) + movq %r11, 216(%rdi) + movq 152(%rsi), %r11 + movq 160(%rsi), %rdx + movq 168(%rsi), %rax + movq 176(%rsi), %r8 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 160(%rdi) + movq %rax, 168(%rdi) + movq %r8, 176(%rdi) + movq %r9, 184(%rdi) + movq 120(%rsi), %r9 + movq 128(%rsi), %rdx + movq 136(%rsi), %rax + movq 144(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r9, %rdx + movq %rdx, 128(%rdi) + movq %rax, 136(%rdi) + movq %r8, 144(%rdi) + movq %r11, 152(%rdi) + movq 88(%rsi), %r11 + movq 96(%rsi), %rdx + movq 104(%rsi), %rax + movq 112(%rsi), %r8 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 96(%rdi) + movq %rax, 104(%rdi) + movq %r8, 112(%rdi) + movq %r9, 120(%rdi) + movq 56(%rsi), %r9 + movq 64(%rsi), %rdx + movq 72(%rsi), %rax + movq 80(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r9, %rdx + movq %rdx, 64(%rdi) + movq %rax, 72(%rdi) + movq %r8, 80(%rdi) + movq %r11, 88(%rdi) + movq 24(%rsi), %r11 + movq 32(%rsi), %rdx + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + shldq %cl, %r8, %r9 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shldq %cl, %r11, %rdx + movq %rdx, 32(%rdi) + movq %rax, 40(%rdi) + movq %r8, 48(%rdi) + movq %r9, 56(%rdi) + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %r8 + shldq %cl, %r8, %r11 + shldq %cl, %rax, %r8 + shldq %cl, %rdx, %rax + shlq %cl, %rdx + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %r8, 16(%rdi) + movq %r11, 24(%rdi) + repz retq /* 
Conditionally copy a into r using the mask m. * m is -1 to copy and 0 when not. * diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 9223ae7a7..3c3c157ef 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -12910,6 +12910,79 @@ static int dh_test_check_pubvalue(void) } #endif +#if defined(WOLFSSL_HAVE_SP_DH) && defined(HAVE_FFDHE) + +#ifdef HAVE_FFDHE_3072 + #define FFDHE_KEY_SIZE (3072/8) +#else + #define FFDHE_KEY_SIZE (2048/8) +#endif + +static int dh_test_ffdhe(WC_RNG *rng, const DhParams* params) +{ + int ret; + word32 privSz, pubSz, privSz2, pubSz2; + byte priv[FFDHE_KEY_SIZE]; + byte pub[FFDHE_KEY_SIZE]; + byte priv2[FFDHE_KEY_SIZE]; + byte pub2[FFDHE_KEY_SIZE]; + byte agree[FFDHE_KEY_SIZE]; + byte agree2[FFDHE_KEY_SIZE]; + word32 agreeSz = (word32)sizeof(agree); + word32 agreeSz2 = (word32)sizeof(agree2); + DhKey key; + DhKey key2; + + ret = wc_InitDhKey_ex(&key, HEAP_HINT, devId); + if (ret != 0) { + ERROR_OUT(-7180, done); + } + ret = wc_InitDhKey_ex(&key2, HEAP_HINT, devId); + if (ret != 0) { + ERROR_OUT(-7181, done); + } + + ret = wc_DhSetKey(&key, params->p, params->p_len, params->g, params->g_len); + if (ret != 0) { + ERROR_OUT(-7182, done); + } + + ret = wc_DhSetKey(&key2, params->p, params->p_len, params->g, + params->g_len); + if (ret != 0) { + ERROR_OUT(-7183, done); + } + + ret = wc_DhGenerateKeyPair(&key, rng, priv, &privSz, pub, &pubSz); + if (ret != 0) { + ERROR_OUT(-7184, done); + } + + ret = wc_DhGenerateKeyPair(&key2, rng, priv2, &privSz2, pub2, &pubSz2); + if (ret != 0) { + ERROR_OUT(-7185, done); + } + + ret = wc_DhAgree(&key, agree, &agreeSz, priv, privSz, pub2, pubSz2); + if (ret != 0) { + ERROR_OUT(-7186, done); + } + + ret = wc_DhAgree(&key2, agree2, &agreeSz2, priv2, privSz2, pub, pubSz); + if (ret != 0) { + ERROR_OUT(-7187, done); + } + + if (agreeSz != agreeSz2 || XMEMCMP(agree, agree2, agreeSz)) { + ERROR_OUT(-7188, done); + } + +done: + return ret; +} + +#endif /* WOLFSSL_HAVE_SP_DH && HAVE_FFDHE */ + int dh_test(void) { int ret; @@ -13062,6 +13135,17 @@ int dh_test(void) ret = dh_test_check_pubvalue(); #endif +#ifdef WOLFSSL_HAVE_SP_DH + /* Specialized code for key gen when using FFDHE-2048 and FFDHE-3072. */ + #ifdef HAVE_FFDHE_2048 + if (ret == 0) + ret = dh_test_ffdhe(&rng, wc_Dh_ffdhe2048_Get()); + #endif + #ifdef HAVE_FFDHE_3072 + if (ret == 0) + ret = dh_test_ffdhe(&rng, wc_Dh_ffdhe3072_Get()); + #endif +#endif /* WOLFSSL_HAVE_SP_DH */ wc_FreeDhKey(&key); keyInit = 0;