diff --git a/README.md b/README.md index ffb9676..cd1acf6 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,13 @@ Python script `x25519.py` Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A Test 1: X25519: q = d*u - Computation time: 26 ms + Computation time: 24 ms q [hex/dec] = 66c7fb0d9f7090f777fa8493081ce8a4f174dbbbf9a36f16ba571206d4ddd548 46489245826987382655505058740283756869827209462947799117248009944518788765000 Test 1 passed. Test 2: X25519 + y-coordinate recovery + transform to Edwards-curve (x, y) = Edward(q, r), (q, r) = d*(u, v) - Computation time: 27 ms + Computation time: 25 ms x [hex/dec] = 1ce7e6e3a747a25352df2d3155f06427ba389769e37755731dead2b54c5cef03 13074494971479542188989287385397236998770807488645203601973104535274459557635 y [hex/dec] = 4dd1c7c2001c147333ceedf77ebd48b1100e2a95f88cf1f40d1b74ec7279e657 35198739055214410372845858661063095427357109357427482712729161712065293444695 Test 2 passed. @@ -41,23 +41,23 @@ Python script `x25519.py` Python script `ed25519.py` Test 1: Length of message: 0 bytes - Computation time: 58 ms + Computation time: 53 ms Test 1 passed. Test 2: Length of message: 1 byte - Computation time: 58 ms + Computation time: 53 ms Test 2 passed. Test 3: Length of message: 2 bytes - Computation time: 58 ms + Computation time: 53 ms Test 3 passed. Test 4: Length of message: 1023 bytes - Computation time: 67 ms + Computation time: 61 ms Test 4 passed. Test 5: Length of message: 64 bytes - Computation time: 59 ms + Computation time: 53 ms Test 5 passed. ## Warning diff --git a/mpy-modules/curve25519/arithmetic.c b/mpy-modules/curve25519/arithmetic.c index 8fa15e3..a987bbb 100644 --- a/mpy-modules/curve25519/arithmetic.c +++ b/mpy-modules/curve25519/arithmetic.c @@ -2,7 +2,137 @@ #include "ec.h" -uint32_t add_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { +void shift_right(uint32_t *x) { + // for mod_inverse, operates on 9 words + __asm__ volatile ( + "LDMIA %0, {r1-r9}\n" + "AND r10, r9, 0x80000000\n" // store sign + "LSRS r9, r9, 1\n" + "RRXS r8, r8\n" + "RRXS r7, r7\n" + "RRXS r6, r6\n" + "RRXS r5, r5\n" + "RRXS r4, r4\n" + "RRXS r3, r3\n" + "RRXS r2, r2\n" + "RRXS r1, r1\n" + "ORR r9, r9, r10\n" // restore sign + "STMIA %0, {r1-r9}\n" + : : "r" (x) : "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + + +uint32_t add9_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { + // for mod_inverse, operates on 9 words + uint32_t carry; + __asm__ volatile ( + "LDMIA %1!, {r3-r6}\n" + "LDMIA %2!, {r7-r10}\n" + "ADDS r3, r3, r7\n" + "ADCS r4, r4, r8\n" + "ADCS r5, r5, r9\n" + "ADCS r6, r6, r10\n" + "STMIA %3!, {r3-r6}\n" + "LDMIA %1!, {r3-r6}\n" + "LDMIA %2!, {r7-r10}\n" + "ADCS r3, r3, r7\n" + "ADCS r4, r4, r8\n" + "ADCS r5, r5, r9\n" + "ADCS r6, r6, r10\n" + "STMIA %3!, {r3-r6}\n" + "LDMIA %1, {r3}\n" + "LDMIA %2, {r7}\n" + "ADCS r3, r3, r7\n" + "STMIA %3, {r3}\n" + "MOV %0, 0\n" + "ADCS %0, %0, 0\n" + : "=r" (carry) : "r" (x), "r" (y), "r" (z) : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return carry; +} + + +uint32_t sub9_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { + // for mod_inverse, operates on 9 words + uint32_t carry; + __asm__ volatile ( + "LDMIA %1!, {r3-r6}\n" + "LDMIA %2!, {r7-r10}\n" + "SUBS r3, r3, r7\n" + "SBCS r4, r4, r8\n" + "SBCS r5, r5, r9\n" + "SBCS r6, r6, r10\n" + "STMIA %3!, {r3-r6}\n" + "LDMIA %1!, {r3-r6}\n" + "LDMIA %2!, {r7-r10}\n" + "SBCS r3, r3, r7\n" + "SBCS r4, r4, r8\n" + "SBCS r5, r5, r9\n" + "SBCS r6, r6, r10\n" + "STMIA %3!, {r3-r6}\n" + "LDMIA %1, {r3}\n" + "LDMIA %2, {r7}\n" + "SBCS r3, r3, r7\n" + "STMIA %3, {r3}\n" + "MOV %0, 0\n" + "ADCS %0, %0, 0\n" + : "=r" (carry) : "r" (x), "r" (y), "r" (z) : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return carry; +} + + +void mod_inverse(const uint32_t *x, uint32_t *y) { + // HoAC Alg. 14.61 and Note 14.64 + // input: x=modulus, y=a, output y=a^-1 + // it is required that the modulus is odd + uint32_t u[9], v[9], B[9], D[9], m[9]; + + for (uint32_t i=0; i<8; i++) { + B[i] = D[i] = 0; + m[i] = u[i] = x[i]; + v[i] = y[i]; + } + D[8] = B[8] = m[8] = u[8] = v[8] = 0; + D[0] = 1; +STEP_4: + while ((u[0] & 1) == 0) { + shift_right(u); + if ((B[0] & 1) == 1) { + sub9_zxy(B, B, m); + } + shift_right(B); + } + while ((v[0] & 1) == 0) { + shift_right(v); + if ((D[0] & 1) == 1) { + sub9_zxy(D, D, m); + } + shift_right(D); + } + if (sub9_zxy(u, u, v) == 1) { // u >= v + sub9_zxy(B, B, D); + } + else { + add9_zxy(u, u, v); + sub9_zxy(v, v, u); + sub9_zxy(D, D, B); + } + uint32_t cmp = 0; + for (int i=0; i<9; i++) cmp |= u[i]; + if (cmp == 0) { + if (D[8] & 0x80000000) { + add9_zxy(D, D, m); // D < 0 + } + for (uint32_t i=0; i<8; i++) y[i] = D[i]; + return; + } + goto STEP_4; +} + + +uint32_t add_zxy(uint32_t *z, const uint32_t *x, const uint32_t *y) { uint32_t carry; __asm__ volatile ( "LDMIA %1!, {r3-r6}\n" @@ -27,7 +157,7 @@ uint32_t add_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { } -uint32_t sub_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { +uint32_t sub_zxy(uint32_t *z, const uint32_t *x, const uint32_t *y) { uint32_t carry; __asm__ volatile ( "LDMIA %1!, {r3-r6}\n" @@ -52,7 +182,7 @@ uint32_t sub_zxy(uint32_t *z, uint32_t *x, uint32_t *y) { } -void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { +void add_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) { uint32_t z1[8], z2[8]; uint32_t* ret[2] = {z1, z2}; add_zxy(z1, x, y); @@ -61,7 +191,7 @@ void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { } -void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { +void sub_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) { // x-y % p uint32_t z1[8], z2[8]; uint32_t* ret[2] = {z2, z1}; @@ -71,7 +201,7 @@ void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { } -void mul_add_zxy(uint32_t *z, uint32_t *x, uint32_t y) { +void mul_add_zxy(uint32_t *z, const uint32_t *x, const uint32_t y) { // z += x*y // Note, UMAAL is not available __asm__ volatile ( @@ -230,7 +360,7 @@ void pu_add_shift(uint32_t *t, uint32_t u) { } -void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { +void mont_mul_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) { // see Alg. 14.36 HoAC // -m^(-1) mod b is 678152731 for curve25519 uint32_t u; @@ -248,7 +378,7 @@ void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) { } -void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p) { +void mont_mul_zxy0_mod_p(uint32_t *z, const uint32_t *x, const uint32_t y, const uint32_t *p) { // see Alg. 14.36 HoAC // -m^(-1) mod b is 678152731 for curve25519 uint32_t u; diff --git a/mpy-modules/curve25519/arithmetic.h b/mpy-modules/curve25519/arithmetic.h index b7e4301..87a48fa 100644 --- a/mpy-modules/curve25519/arithmetic.h +++ b/mpy-modules/curve25519/arithmetic.h @@ -1,9 +1,10 @@ #ifndef __arithmetic__ #define __arithmetic__ -void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p); -void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p); -void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p); -void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p); +void add_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p); +void sub_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p); +void mont_mul_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p); +void mont_mul_zxy0_mod_p(uint32_t *z, const uint32_t *x, const uint32_t y, const uint32_t *p); +void mod_inverse(const uint32_t *x, uint32_t *y); #endif diff --git a/mpy-modules/curve25519/ec.c b/mpy-modules/curve25519/ec.c index ee9ad73..01352e9 100644 --- a/mpy-modules/curve25519/ec.c +++ b/mpy-modules/curve25519/ec.c @@ -2,12 +2,10 @@ #include "arithmetic.h" const uint32_t a24R = 0x468ba6; -const uint32_t p[8] = {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffed}; +const uint32_t Curve_p[8] = {0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff}; // little endian const uint32_t pR2 = 0x5a4; const uint32_t R = 38; -const uint32_t sqrt_minus_486664R[8] = {0x4038adb9, 0xa83f001e, 0xc1bcaf57, 0x688c332e, 0xa9fa8eee, 0xcb6e1095, 0xa7ab4e9e, 0x1baf4abd}; - -uint32_t Curve_p[8]; +const uint32_t sqrt_minus_486664R[8] = {0x1baf4abd, 0xa7ab4e9e, 0xcb6e1095, 0xa9fa8eee, 0x688c332e, 0xc1bcaf57, 0xa83f001e, 0x4038adb9}; // little endian void random_z(uint32_t *z) { // TODO: a true random source should be provided @@ -16,13 +14,6 @@ void random_z(uint32_t *z) { } -void ini_curve() { - for (int i=0; i<8; i++) { - Curve_p[i] = p[7-i]; - } -} - - void to_Montgomery(uint32_t *x) { uint32_t s[8]; mont_mul_zxy0_mod_p(s, x, pR2, Curve_p); @@ -37,90 +28,6 @@ void from_Montgomery(uint32_t *x) { } -void mod_inverse(uint32_t *z_inv, uint32_t *z) { - uint32_t z2[8]; - uint32_t z9[8]; - uint32_t z11[8]; - uint32_t z2_5_0[8]; - uint32_t z2_10_0[8]; - uint32_t z2_20_0[8]; - uint32_t z2_50_0[8]; - uint32_t z2_100_0[8]; - uint32_t t0[8]; - uint32_t t1[8]; - uint32_t i; - - mont_mul_zxy_mod_p(z2, z, z, Curve_p); // 2 - mont_mul_zxy_mod_p(t1, z2, z2, Curve_p); // 4 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 8 - mont_mul_zxy_mod_p(z9, t0, z, Curve_p); // 9 - mont_mul_zxy_mod_p(z11, z9, z2, Curve_p); // 11 - mont_mul_zxy_mod_p(t0, z11, z11, Curve_p); // 22 - mont_mul_zxy_mod_p(z2_5_0, t0, z9, Curve_p); // 31 = 2^5 - 2^0 - - mont_mul_zxy_mod_p(t0, z2_5_0, z2_5_0, Curve_p); // 2^6 - 2^1 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^7 - 2^2 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^8 - 2^3 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^9 - 2^4 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^10 - 2^5 - mont_mul_zxy_mod_p(z2_10_0, t0, z2_5_0, Curve_p); // 2^10 - 2^0 - - mont_mul_zxy_mod_p(t0, z2_10_0, z2_10_0, Curve_p); // 2^11 - 2^1 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^12 - 2^2 - for (i = 2; i < 10; i += 2) { - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - } // 2^20 - 2^10 - mont_mul_zxy_mod_p(z2_20_0, t1, z2_10_0, Curve_p); // 2^20 - 2^0 - - mont_mul_zxy_mod_p(t0, z2_20_0, z2_20_0, Curve_p); // 2^21 - 2^1 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^22 - 2^2 - for (i = 2; i < 20; i += 2) { - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - } // 2^40 - 2^20 - mont_mul_zxy_mod_p(t0, t1, z2_20_0, Curve_p); // 2^40 - 2^0 - - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^41 - 2^1 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^42 - 2^2 - for (i = 2; i < 10; i += 2) { - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - } // 2^50 - 2^10 - mont_mul_zxy_mod_p(z2_50_0, t0, z2_10_0, Curve_p); // 2^50 - 2^0 - - mont_mul_zxy_mod_p(t0, z2_50_0, z2_50_0, Curve_p); // 2^51 - 2^1 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^42 - 2^2 - for (i = 2; i < 50; i += 2) { - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - } // 2^100 - 2^50 - mont_mul_zxy_mod_p(z2_100_0, t1, z2_50_0, Curve_p); // 2^100 - 2^0 - - mont_mul_zxy_mod_p(t1, z2_100_0, z2_100_0, Curve_p); // 2^101 - 2^1 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^102 - 2^2 - for (i = 2; i < 100; i += 2) { - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - } // 2^200 - 2^100 - mont_mul_zxy_mod_p(t1, t0, z2_100_0, Curve_p); // 2^200 - 2^0 - - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^201 - 2^1 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^102 - 2^2 - for (i = 2; i < 50; i += 2) { - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); - } // 2^250 - 2^50 - mont_mul_zxy_mod_p(t0, t1, z2_50_0, Curve_p); // 2^250 - 2^0 - - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^251 - 2^1 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^252 - 2^2 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^253 - 2^3 - mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^254 - 2^4 - mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^255 - 2^5 - mont_mul_zxy_mod_p(z_inv, t1, z11, Curve_p); // 2^255 - 21 -} - void recover_y(uint32_t *x, uint32_t *y, uint32_t *z, uint32_t *xp, uint32_t *yp, uint32_t *xq, uint32_t *zq, uint32_t *xpq, uint32_t *zpq) { // https://eprint.iacr.org/2017/212.pdf uint32_t AR = 18493156; @@ -157,32 +64,34 @@ void montgomery2edward(uint32_t *x, uint32_t *y, uint32_t *u, uint32_t *v) { t[0] = R; // 1*R add_zxy_mod_p(s, u, t, Curve_p); // s = u+1 mont_mul_zxy_mod_p(w, s, v, Curve_p); - mod_inverse(w, w); // w = (u+1)^(-1) * v^(-1) + // mod_inverse(w, w); // w = (u+1)^(-1) * v^(-1) + mod_inverse(Curve_p, w); + mont_mul_zxy0_mod_p(w, w, pR2, Curve_p); + mont_mul_zxy0_mod_p(w, w, pR2, Curve_p); sub_zxy_mod_p(y, u, t, Curve_p); // y = u-1 mont_mul_zxy_mod_p(x, y, w, Curve_p); // x = (u-1)/((u+1) * v) mont_mul_zxy_mod_p(y, x, v, Curve_p); - for (uint32_t i=0; i<8; i++) t[i] = sqrt_minus_486664R[7-i]; - mont_mul_zxy_mod_p(x, t, w, Curve_p); + mont_mul_zxy_mod_p(x, sqrt_minus_486664R, w, Curve_p); mont_mul_zxy_mod_p(t, x, s, Curve_p); mont_mul_zxy_mod_p(x, t, u, Curve_p); } -void cswap(uint32_t swap, uint32_t **x, uint32_t **y) { - // https://tools.ietf.org/html/rfc7748 sec. 5 - __asm__ volatile ( - "LDMIA %1, {r3}\n" - "LDMIA %2, {r4}\n" - "RSB r5, %0, 0\n" - "EOR r6, r3, r4\n" - "AND r5, r5, r6\n" - "EOR r3, r3, r5\n" - "EOR r4, r4, r5\n" - "STMIA %1, {r3}\n" - "STMIA %2, {r4}\n" - : : "r" (swap), "r" (x), "r" (y) : "r3", "r4", "r5", "r6" +// https://tools.ietf.org/html/rfc7748 sec. 5 +#define cswap(swap, x, y) \ + __asm__ volatile ( \ + "LDMIA %1, {r3}\n" \ + "LDMIA %2, {r4}\n" \ + "RSB r5, %0, 0\n" \ + "EOR r6, r3, r4\n" \ + "AND r5, r5, r6\n" \ + "EOR r3, r3, r5\n" \ + "EOR r4, r4, r5\n" \ + "STMIA %1, {r3}\n" \ + "STMIA %2, {r4}\n" \ + : : "r" (swap), "r" (x), "r" (y) : "r3", "r4", "r5", "r6" \ ); -} + void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint8_t toEdward) { // cpmputes q = k * u @@ -192,8 +101,6 @@ void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint uint8_t swap, kt, kw; uint32_t A[8], AA[8], B[8], BB[8], C[8], D[8], E[8], DA[8], CB[8]; - ini_curve(); - // x1, z1 = u, 1 // x2, z2 = 1, 0 // x3, z3 = u, 1 @@ -249,23 +156,22 @@ void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint to_Montgomery(AA); to_Montgomery(BB); recover_y(A, B, C, AA, BB, x2, z2, x3, z3); - mod_inverse(C, C); + from_Montgomery(C); + mod_inverse(Curve_p, C); mont_mul_zxy_mod_p(q, A, C, Curve_p); mont_mul_zxy_mod_p(r, B, C, Curve_p); if (toEdward) { - for (uint32_t i=0; i<8; i++) { - A[i] = q[i]; - B[i] = r[i]; - } + mont_mul_zxy0_mod_p(A, q, pR2, Curve_p); + mont_mul_zxy0_mod_p(B, r, pR2, Curve_p); montgomery2edward(q, r, A, B); + from_Montgomery(q); + from_Montgomery(r); } - from_Montgomery(q); - from_Montgomery(r); } else { // compute x-only // compute z2^(-1) = z2^{p-2} - mod_inverse(A, z2); - mont_mul_zxy_mod_p(q, x2, A, Curve_p); - from_Montgomery(q); + from_Montgomery(z2); + mod_inverse(Curve_p, z2); + mont_mul_zxy_mod_p(q, x2, z2, Curve_p); } } diff --git a/x25519.py b/x25519.py index 6c7c45e..0954c29 100644 --- a/x25519.py +++ b/x25519.py @@ -3,7 +3,7 @@ import time # from urandom import randint d = b'\x70\x1f\xb4\x30\x86\x55\xb4\x76\xb6\x78\x9b\x73\x25\xf9\xea\x8c\xdd\xd1\x6a\x58\x53\x3f\xf6\xd9\xe6\x00\x09\x46\x4a\x5f\x9d\x54\x00\x00\x00\x00' -u = b'\x09' + bytes(35) +u = b'\x09' + bytes(31) v = b'\xd9\xd3\xce~\xa2\xc5\xe9)\xb2a|m~M=\x92L\xd1Hw,\xdd\x1e\xe0\xb4\x86\xa0\xb8\xa1\x19\xae \x00\x00\x00\x00' print('Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A')