From 1390b505ce93271fcb12e2c6c6749522869f9885 Mon Sep 17 00:00:00 2001 From: pmvr Date: Tue, 9 Jun 2020 19:31:53 +0200 Subject: [PATCH] combining mul+add in one call --- README.md | 16 ++-- mpy-modules/curve25519/arithmetic.c | 142 +++++++++++++++++++--------- 2 files changed, 107 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 0d7f873..5aa1b95 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,13 @@ Python script `x25519.py` Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A Test 1: X25519: q = d*u - Computatation time: 39 ms + Computatation time: 26 ms q [hex/dec] = 66c7fb0d9f7090f777fa8493081ce8a4f174dbbbf9a36f16ba571206d4ddd548 46489245826987382655505058740283756869827209462947799117248009944518788765000 Test 1 passed. Test 2: X25519 + y-coordinate recovery + transform to Edwards-curve (x, y) = Edward(q, r), (q, r) = d*(u, v) - Computatation time: 44 ms + Computatation time: 27 ms x [hex/dec] = 1ce7e6e3a747a25352df2d3155f06427ba389769e37755731dead2b54c5cef03 13074494971479542188989287385397236998770807488645203601973104535274459557635 y [hex/dec] = 4dd1c7c2001c147333ceedf77ebd48b1100e2a95f88cf1f40d1b74ec7279e657 35198739055214410372845858661063095427357109357427482712729161712065293444695 Test 2 passed. @@ -41,24 +41,24 @@ Python script `x25519.py` Python script `ed25519.py` Test 1: Length of message: 0 bytes - Computatation time: 89 ms + Computatation time: 58 ms Test 1 passed. Test 2: Length of message: 1 byte - Computatation time: 90 ms + Computatation time: 58 ms Test 2 passed. Test 3: Length of message: 2 bytes - Computatation time: 90 ms + Computatation time: 58 ms Test 3 passed. Test 4: Length of message: 1023 bytes - Computatation time: 98 ms + Computatation time: 67 ms Test 4 passed. Test 5: Length of message: 64 bytes - Computatation time: 90 ms + Computatation time: 59 ms Test 5 passed. ## Warning -The code is not ready for production for both security reasons and missing regression tests. 
+The source code is not ready for production for both security reasons and missing regression tests.
diff --git a/mpy-modules/curve25519/arithmetic.c b/mpy-modules/curve25519/arithmetic.c
index 34e45a4..cac13af 100644
--- a/mpy-modules/curve25519/arithmetic.c
+++ b/mpy-modules/curve25519/arithmetic.c
@@ -70,93 +70,169 @@ void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
 }
 
 
-void mul_zxy(uint32_t *z, uint32_t *x, uint32_t y) {
+void mul_add_zxy(uint32_t *z, uint32_t *x, uint32_t y) {
+    // z[0..8] += x[0..7] * y (8 words of x are read, 9 words of z are updated)
+    // Note, UMAAL is not available, so each product word is added to z
+    // with its own ADDS/ADCS step
     __asm__ volatile (
         // 0
         "LDMIA %0!, {r3}\n"
         "UMULL r5, r6, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADDS r5, r5, r3\n"
         "STMIA %2!, {r5}\n"
         // 1
         "LDMIA %0!, {r3}\n"
         "MOV r5, 0\n"
         "UMLAL r6, r5, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r6, r6, r3\n"
         "STMIA %2!, {r6}\n"
         // 2
         "LDMIA %0!, {r3}\n"
         "MOV r6, 0\n"
         "UMLAL r5, r6, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r5, r5, r3\n"
         "STMIA %2!, {r5}\n"
         // 3
         "LDMIA %0!, {r3}\n"
         "MOV r5, 0\n"
         "UMLAL r6, r5, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r6, r6, r3\n"
         "STMIA %2!, {r6}\n"
         // 4
         "LDMIA %0!, {r3}\n"
         "MOV r6, 0\n"
         "UMLAL r5, r6, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r5, r5, r3\n"
         "STMIA %2!, {r5}\n"
         // 5
         "LDMIA %0!, {r3}\n"
         "MOV r5, 0\n"
         "UMLAL r6, r5, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r6, r6, r3\n"
         "STMIA %2!, {r6}\n"
         // 6
         "LDMIA %0!, {r3}\n"
         "MOV r6, 0\n"
         "UMLAL r5, r6, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r5, r5, r3\n"
         "STMIA %2!, {r5}\n"
         // 7
         "LDMIA %0!, {r3}\n"
         "MOV r5, 0\n"
         "UMLAL r6, r5, r3, %1\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r6, r6, r3\n"
         "STMIA %2!, {r6}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r5, r5, r3\n"
         "STMIA %2, {r5}\n"
+        // write-back moves %0/%2 and z[] is rewritten: operands must be "+r" and
+        // "cc"/"memory" clobbered (y is listed as "+r" only to keep %1 numbering)
-        : : "r" (x), "r" (y), "r" (z) : "r3", "r5", "r6"
+        : "+r" (x), "+r" (y), "+r" (z) : : "r3", "r5", "r6", "cc", "memory"
     );
 }
 
-
-void mul_zx0y0(uint32_t *z, uint32_t x, uint32_t y) {
+
+void mul_add_zx0y0(uint32_t *z, uint32_t x, uint32_t y) {
+    // z[0..8] += x * y: the 64-bit product goes into z[0..1] and the carry
+    // is propagated through z[2..8]
     __asm__ volatile (
         // 0
         "UMULL r5, r6, %0, %1\n"
-        "STMIA %2!, {r5}\n"
-        "STMIA %2!, {r6}\n"
+        "LDMIA %2, {r3}\n"
+        "ADDS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r6\n"
+        "STMIA %2!, {r3}\n"
         "MOV r5, 0\n"
-        "MOV r6, 0\n"
-        "STMIA %2!, {r5, r6}\n"
-        "STMIA %2!, {r5, r6}\n"
-        "STMIA %2!, {r5, r6}\n"
-        "STMIA %2, {r5}\n"
-        : : "r" (x), "r" (y), "r" (z) : "r5", "r6"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2!, {r3}\n"
+        "LDMIA %2, {r3}\n"
+        "ADCS r3, r3, r5\n"
+        "STMIA %2, {r3}\n"
+        //"SUBS %2, 32\n" //*
+        // %2 is advanced by write-back and z[] is rewritten: "+r" operands plus
+        // "cc"/"memory" clobbers (x, y are "+r" only to keep %0/%1 numbering)
+        : "+r" (x), "+r" (y), "+r" (z) : : "r3", "r5", "r6", "cc", "memory"
     );
 }
 
 
-void pu(uint32_t *t, uint32_t u) {
-    // computes (2^255 -19) * u
+void pu_add_shift(uint32_t *t, uint32_t u) {
+    // computes t = (t + (2^255 -19) * u) >> 32 on the 9 words t[0..8]
+    // the low result word is dropped without being stored (presumably zero
+    // for the u chosen by the caller - TODO confirm) and t[8] is set to 0
     __asm__ volatile (
         "MOV r3, 19\n"
-        "UMULL r5, r6, r3, %1\n"
+        "UMULL r5, r6, r3, %1\n"
         "MOV r2, 0\n"
         "LSRS %1, %1, 1\n"
         "RRXS r2, r2\n"
-        "MOV r3, 0\n"
-        "SUBS r4, r3, r5\n"
-        "STMIA %0!, {r4}\n"
-        "SBCS r4, r3, r6\n"
-        "STMIA %0!, {r4}\n"
-        "SBCS r4, r3, r3\n"
-        "STMIA %0!, {r4}\n"
-        "STMIA %0!, {r4}\n"
-        "STMIA %0!, {r4}\n"
-        "STMIA %0!, {r4}\n"
-        "STMIA %0!, {r4}\n"
-        "SBCS r4, r2, r3\n"
-        "STMIA %0!, {r4}\n"
-        "SBCS r4, %1, r3\n"
-        "STMIA %0, {r4}\n"
+        // t[8]:t[7] + (u << 255)
+        "ADDS %0, %0, 28\n"
+        "LDMIA %0, {r3}\n"
+        "ADDS r3, r3, r2\n"
+        "STMIA %0!, {r3}\n"
+        "LDMIA %0, {r3}\n"
+        "ADCS r3, r3, %1\n"
+        "STMIA %0, {r3}\n"
+        "SUBS %0, %0, 32\n"
+        // t - 19*u
+        "MOV r2, %0\n"
+        "LDMIA %0!, {r4}\n"
+        "SUBS r4, r4, r5\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r6\n"
+        "STMIA r2!, {r4}\n"
+        "MOV r5, 0\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0!, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "LDMIA %0, {r4}\n"
+        "SBCS r4, r4, r5\n"
+        "STMIA r2!, {r4}\n"
+        "STMIA r2, {r5}\n" // ms-word = 0
         //"SUBS %0, %0, 32\n"
+        // %0 is moved and %1 is shifted by the code, t[] is rewritten: both
+        // operands must be "+r" and "cc"/"memory" have to be clobbered
-        : : "r" (t), "r" (u) : "r2", "r3", "r4", "r5", "r6"
+        : "+r" (t), "+r" (u) : : "r2", "r3", "r4", "r5", "r6", "cc", "memory"
     );
@@ -173,13 +249,8 @@ void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
     for (int i=0; i<9; i++) a[i] = 0;
     for (int i=0; i<8; i++) {
         u = (a[0] + x[i] * y[0]) * 678152731;
-        mul_zxy(tmp, y, x[i]);
-        a[8] += tmp[8] + add_zxy(a, a, tmp);
-        // mul_zxy(tmp, p, u);
-        pu(tmp, u);
-        a[8] += tmp[8] + add_zxy(a, a, tmp); // A <- (A + xi y + u m) / b
-        for (int j=0; j<8; j++) a[j] = a[j+1];
-        a[8] = 0;
+        mul_add_zxy(a, y, x[i]); // A <- A + xi y
+        pu_add_shift(a, u); // A <- (A + u m) / b
     }
     uint32_t c = sub_zxy(tmp, a, p); // carry not set if negative
     for (int i=0; i<8; i++) z[i] = ret[c][i];
@@ -196,13 +267,8 @@ void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p) {
     for (int i=0; i<9; i++) a[i] = 0;
     for (int i=0; i<8; i++) {
         u = (a[0] + x[i] * y) * 678152731;
-        mul_zx0y0(tmp, x[i], y);
-        a[8] += tmp[8] + add_zxy(a, a, tmp);
-        // mul_zxy(tmp, p, u);
-        pu(tmp, u);
-        a[8] += tmp[8] + add_zxy(a, a, tmp); // A <- (A + xi y + u m) / b
-        for (int j=0; j<8; j++) a[j] = a[j+1];
-        a[8] = 0;
+        mul_add_zx0y0(a, y, x[i]); // A <- A + xi y
+        pu_add_shift(a, u); // A <- (A + u m) / b
     }
     uint32_t c = sub_zxy(tmp, a, p); // carry not set if negative
     for (int i=0; i<8; i++) z[i] = ret[c][i];