changed inversion routine

master v0.4
pmvr 2020-06-14 09:43:35 +02:00
parent 7911f28208
commit 831e1776e4
5 changed files with 180 additions and 143 deletions

View File

@ -26,13 +26,13 @@ Python script `x25519.py`
Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A
Test 1: X25519: q = d*u
Computation time: 26 ms
Computation time: 24 ms
q [hex/dec] = 66c7fb0d9f7090f777fa8493081ce8a4f174dbbbf9a36f16ba571206d4ddd548 46489245826987382655505058740283756869827209462947799117248009944518788765000
Test 1 passed.
Test 2: X25519 + y-coordinate recovery + transform to Edwards-curve
(x, y) = Edward(q, r), (q, r) = d*(u, v)
Computation time: 27 ms
Computation time: 25 ms
x [hex/dec] = 1ce7e6e3a747a25352df2d3155f06427ba389769e37755731dead2b54c5cef03 13074494971479542188989287385397236998770807488645203601973104535274459557635
y [hex/dec] = 4dd1c7c2001c147333ceedf77ebd48b1100e2a95f88cf1f40d1b74ec7279e657 35198739055214410372845858661063095427357109357427482712729161712065293444695
Test 2 passed.
@ -41,23 +41,23 @@ Python script `x25519.py`
Python script `ed25519.py`
Test 1: Length of message: 0 bytes
Computation time: 58 ms
Computation time: 53 ms
Test 1 passed.
Test 2: Length of message: 1 byte
Computation time: 58 ms
Computation time: 53 ms
Test 2 passed.
Test 3: Length of message: 2 bytes
Computation time: 58 ms
Computation time: 53 ms
Test 3 passed.
Test 4: Length of message: 1023 bytes
Computation time: 67 ms
Computation time: 61 ms
Test 4 passed.
Test 5: Length of message: 64 bytes
Computation time: 59 ms
Computation time: 53 ms
Test 5 passed.
## Warning

View File

@ -2,7 +2,137 @@
#include "ec.h"
uint32_t add_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
void shift_right(uint32_t *x) {
// for mod_inverse, operates on 9 words
__asm__ volatile (
"LDMIA %0, {r1-r9}\n"
"AND r10, r9, 0x80000000\n" // store sign
"LSRS r9, r9, 1\n"
"RRXS r8, r8\n"
"RRXS r7, r7\n"
"RRXS r6, r6\n"
"RRXS r5, r5\n"
"RRXS r4, r4\n"
"RRXS r3, r3\n"
"RRXS r2, r2\n"
"RRXS r1, r1\n"
"ORR r9, r9, r10\n" // restore sign
"STMIA %0, {r1-r9}\n"
: : "r" (x) : "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
);
}
uint32_t add9_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
// for mod_inverse, operates on 9 words
uint32_t carry;
__asm__ volatile (
"LDMIA %1!, {r3-r6}\n"
"LDMIA %2!, {r7-r10}\n"
"ADDS r3, r3, r7\n"
"ADCS r4, r4, r8\n"
"ADCS r5, r5, r9\n"
"ADCS r6, r6, r10\n"
"STMIA %3!, {r3-r6}\n"
"LDMIA %1!, {r3-r6}\n"
"LDMIA %2!, {r7-r10}\n"
"ADCS r3, r3, r7\n"
"ADCS r4, r4, r8\n"
"ADCS r5, r5, r9\n"
"ADCS r6, r6, r10\n"
"STMIA %3!, {r3-r6}\n"
"LDMIA %1, {r3}\n"
"LDMIA %2, {r7}\n"
"ADCS r3, r3, r7\n"
"STMIA %3, {r3}\n"
"MOV %0, 0\n"
"ADCS %0, %0, 0\n"
: "=r" (carry) : "r" (x), "r" (y), "r" (z) : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
);
return carry;
}
uint32_t sub9_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
// for mod_inverse, operates on 9 words
uint32_t carry;
__asm__ volatile (
"LDMIA %1!, {r3-r6}\n"
"LDMIA %2!, {r7-r10}\n"
"SUBS r3, r3, r7\n"
"SBCS r4, r4, r8\n"
"SBCS r5, r5, r9\n"
"SBCS r6, r6, r10\n"
"STMIA %3!, {r3-r6}\n"
"LDMIA %1!, {r3-r6}\n"
"LDMIA %2!, {r7-r10}\n"
"SBCS r3, r3, r7\n"
"SBCS r4, r4, r8\n"
"SBCS r5, r5, r9\n"
"SBCS r6, r6, r10\n"
"STMIA %3!, {r3-r6}\n"
"LDMIA %1, {r3}\n"
"LDMIA %2, {r7}\n"
"SBCS r3, r3, r7\n"
"STMIA %3, {r3}\n"
"MOV %0, 0\n"
"ADCS %0, %0, 0\n"
: "=r" (carry) : "r" (x), "r" (y), "r" (z) : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
);
return carry;
}
void mod_inverse(const uint32_t *x, uint32_t *y) {
// HoAC Alg. 14.61 and Note 14.64
// input: x=modulus, y=a, output y=a^-1
// it is required that the modulus is odd
uint32_t u[9], v[9], B[9], D[9], m[9];
for (uint32_t i=0; i<8; i++) {
B[i] = D[i] = 0;
m[i] = u[i] = x[i];
v[i] = y[i];
}
D[8] = B[8] = m[8] = u[8] = v[8] = 0;
D[0] = 1;
STEP_4:
while ((u[0] & 1) == 0) {
shift_right(u);
if ((B[0] & 1) == 1) {
sub9_zxy(B, B, m);
}
shift_right(B);
}
while ((v[0] & 1) == 0) {
shift_right(v);
if ((D[0] & 1) == 1) {
sub9_zxy(D, D, m);
}
shift_right(D);
}
if (sub9_zxy(u, u, v) == 1) { // u >= v
sub9_zxy(B, B, D);
}
else {
add9_zxy(u, u, v);
sub9_zxy(v, v, u);
sub9_zxy(D, D, B);
}
uint32_t cmp = 0;
for (int i=0; i<9; i++) cmp |= u[i];
if (cmp == 0) {
if (D[8] & 0x80000000) {
add9_zxy(D, D, m); // D < 0
}
for (uint32_t i=0; i<8; i++) y[i] = D[i];
return;
}
goto STEP_4;
}
uint32_t add_zxy(uint32_t *z, const uint32_t *x, const uint32_t *y) {
uint32_t carry;
__asm__ volatile (
"LDMIA %1!, {r3-r6}\n"
@ -27,7 +157,7 @@ uint32_t add_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
}
uint32_t sub_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
uint32_t sub_zxy(uint32_t *z, const uint32_t *x, const uint32_t *y) {
uint32_t carry;
__asm__ volatile (
"LDMIA %1!, {r3-r6}\n"
@ -52,7 +182,7 @@ uint32_t sub_zxy(uint32_t *z, uint32_t *x, uint32_t *y) {
}
void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
void add_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) {
uint32_t z1[8], z2[8];
uint32_t* ret[2] = {z1, z2};
add_zxy(z1, x, y);
@ -61,7 +191,7 @@ void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
}
void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
void sub_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) {
// x-y % p
uint32_t z1[8], z2[8];
uint32_t* ret[2] = {z2, z1};
@ -71,7 +201,7 @@ void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
}
void mul_add_zxy(uint32_t *z, uint32_t *x, uint32_t y) {
void mul_add_zxy(uint32_t *z, const uint32_t *x, const uint32_t y) {
// z += x*y
// Note, UMAAL is not available
__asm__ volatile (
@ -230,7 +360,7 @@ void pu_add_shift(uint32_t *t, uint32_t u) {
}
void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
void mont_mul_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) {
// see Alg. 14.36 HoAC
// -m^(-1) mod b is 678152731 for curve25519
uint32_t u;
@ -248,7 +378,7 @@ void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
}
void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p) {
void mont_mul_zxy0_mod_p(uint32_t *z, const uint32_t *x, const uint32_t y, const uint32_t *p) {
// see Alg. 14.36 HoAC
// -m^(-1) mod b is 678152731 for curve25519
uint32_t u;

View File

@ -1,9 +1,10 @@
#ifndef __arithmetic__
#define __arithmetic__
void add_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p);
void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p);
void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p);
void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p);
void add_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p);
void sub_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p);
void mont_mul_zxy_mod_p(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p);
void mont_mul_zxy0_mod_p(uint32_t *z, const uint32_t *x, const uint32_t y, const uint32_t *p);
void mod_inverse(const uint32_t *x, uint32_t *y);
#endif

View File

@ -2,12 +2,10 @@
#include "arithmetic.h"
const uint32_t a24R = 0x468ba6;
const uint32_t p[8] = {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffed};
const uint32_t Curve_p[8] = {0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff}; // little endian
const uint32_t pR2 = 0x5a4;
const uint32_t R = 38;
const uint32_t sqrt_minus_486664R[8] = {0x4038adb9, 0xa83f001e, 0xc1bcaf57, 0x688c332e, 0xa9fa8eee, 0xcb6e1095, 0xa7ab4e9e, 0x1baf4abd};
uint32_t Curve_p[8];
const uint32_t sqrt_minus_486664R[8] = {0x1baf4abd, 0xa7ab4e9e, 0xcb6e1095, 0xa9fa8eee, 0x688c332e, 0xc1bcaf57, 0xa83f001e, 0x4038adb9}; // little endian
void random_z(uint32_t *z) {
// TODO: a true random source should be provided
@ -16,13 +14,6 @@ void random_z(uint32_t *z) {
}
void ini_curve() {
for (int i=0; i<8; i++) {
Curve_p[i] = p[7-i];
}
}
void to_Montgomery(uint32_t *x) {
uint32_t s[8];
mont_mul_zxy0_mod_p(s, x, pR2, Curve_p);
@ -37,90 +28,6 @@ void from_Montgomery(uint32_t *x) {
}
void mod_inverse(uint32_t *z_inv, uint32_t *z) {
uint32_t z2[8];
uint32_t z9[8];
uint32_t z11[8];
uint32_t z2_5_0[8];
uint32_t z2_10_0[8];
uint32_t z2_20_0[8];
uint32_t z2_50_0[8];
uint32_t z2_100_0[8];
uint32_t t0[8];
uint32_t t1[8];
uint32_t i;
mont_mul_zxy_mod_p(z2, z, z, Curve_p); // 2
mont_mul_zxy_mod_p(t1, z2, z2, Curve_p); // 4
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 8
mont_mul_zxy_mod_p(z9, t0, z, Curve_p); // 9
mont_mul_zxy_mod_p(z11, z9, z2, Curve_p); // 11
mont_mul_zxy_mod_p(t0, z11, z11, Curve_p); // 22
mont_mul_zxy_mod_p(z2_5_0, t0, z9, Curve_p); // 31 = 2^5 - 2^0
mont_mul_zxy_mod_p(t0, z2_5_0, z2_5_0, Curve_p); // 2^6 - 2^1
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^7 - 2^2
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^8 - 2^3
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^9 - 2^4
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^10 - 2^5
mont_mul_zxy_mod_p(z2_10_0, t0, z2_5_0, Curve_p); // 2^10 - 2^0
mont_mul_zxy_mod_p(t0, z2_10_0, z2_10_0, Curve_p); // 2^11 - 2^1
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^12 - 2^2
for (i = 2; i < 10; i += 2) {
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
} // 2^20 - 2^10
mont_mul_zxy_mod_p(z2_20_0, t1, z2_10_0, Curve_p); // 2^20 - 2^0
mont_mul_zxy_mod_p(t0, z2_20_0, z2_20_0, Curve_p); // 2^21 - 2^1
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^22 - 2^2
for (i = 2; i < 20; i += 2) {
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
} // 2^40 - 2^20
mont_mul_zxy_mod_p(t0, t1, z2_20_0, Curve_p); // 2^40 - 2^0
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^41 - 2^1
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^42 - 2^2
for (i = 2; i < 10; i += 2) {
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
} // 2^50 - 2^10
mont_mul_zxy_mod_p(z2_50_0, t0, z2_10_0, Curve_p); // 2^50 - 2^0
mont_mul_zxy_mod_p(t0, z2_50_0, z2_50_0, Curve_p); // 2^51 - 2^1
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^42 - 2^2
for (i = 2; i < 50; i += 2) {
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
} // 2^100 - 2^50
mont_mul_zxy_mod_p(z2_100_0, t1, z2_50_0, Curve_p); // 2^100 - 2^0
mont_mul_zxy_mod_p(t1, z2_100_0, z2_100_0, Curve_p); // 2^101 - 2^1
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^102 - 2^2
for (i = 2; i < 100; i += 2) {
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
} // 2^200 - 2^100
mont_mul_zxy_mod_p(t1, t0, z2_100_0, Curve_p); // 2^200 - 2^0
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^201 - 2^1
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^102 - 2^2
for (i = 2; i < 50; i += 2) {
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p);
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p);
} // 2^250 - 2^50
mont_mul_zxy_mod_p(t0, t1, z2_50_0, Curve_p); // 2^250 - 2^0
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^251 - 2^1
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^252 - 2^2
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^253 - 2^3
mont_mul_zxy_mod_p(t0, t1, t1, Curve_p); // 2^254 - 2^4
mont_mul_zxy_mod_p(t1, t0, t0, Curve_p); // 2^255 - 2^5
mont_mul_zxy_mod_p(z_inv, t1, z11, Curve_p); // 2^255 - 21
}
void recover_y(uint32_t *x, uint32_t *y, uint32_t *z, uint32_t *xp, uint32_t *yp, uint32_t *xq, uint32_t *zq, uint32_t *xpq, uint32_t *zpq) {
// https://eprint.iacr.org/2017/212.pdf
uint32_t AR = 18493156;
@ -157,32 +64,34 @@ void montgomery2edward(uint32_t *x, uint32_t *y, uint32_t *u, uint32_t *v) {
t[0] = R; // 1*R
add_zxy_mod_p(s, u, t, Curve_p); // s = u+1
mont_mul_zxy_mod_p(w, s, v, Curve_p);
mod_inverse(w, w); // w = (u+1)^(-1) * v^(-1)
// mod_inverse(w, w); // w = (u+1)^(-1) * v^(-1)
mod_inverse(Curve_p, w);
mont_mul_zxy0_mod_p(w, w, pR2, Curve_p);
mont_mul_zxy0_mod_p(w, w, pR2, Curve_p);
sub_zxy_mod_p(y, u, t, Curve_p); // y = u-1
mont_mul_zxy_mod_p(x, y, w, Curve_p); // x = (u-1)/((u+1) * v)
mont_mul_zxy_mod_p(y, x, v, Curve_p);
for (uint32_t i=0; i<8; i++) t[i] = sqrt_minus_486664R[7-i];
mont_mul_zxy_mod_p(x, t, w, Curve_p);
mont_mul_zxy_mod_p(x, sqrt_minus_486664R, w, Curve_p);
mont_mul_zxy_mod_p(t, x, s, Curve_p);
mont_mul_zxy_mod_p(x, t, u, Curve_p);
}
void cswap(uint32_t swap, uint32_t **x, uint32_t **y) {
// https://tools.ietf.org/html/rfc7748 sec. 5
__asm__ volatile (
"LDMIA %1, {r3}\n"
"LDMIA %2, {r4}\n"
"RSB r5, %0, 0\n"
"EOR r6, r3, r4\n"
"AND r5, r5, r6\n"
"EOR r3, r3, r5\n"
"EOR r4, r4, r5\n"
"STMIA %1, {r3}\n"
"STMIA %2, {r4}\n"
: : "r" (swap), "r" (x), "r" (y) : "r3", "r4", "r5", "r6"
// https://tools.ietf.org/html/rfc7748 sec. 5
#define cswap(swap, x, y) \
__asm__ volatile ( \
"LDMIA %1, {r3}\n" \
"LDMIA %2, {r4}\n" \
"RSB r5, %0, 0\n" \
"EOR r6, r3, r4\n" \
"AND r5, r5, r6\n" \
"EOR r3, r3, r5\n" \
"EOR r4, r4, r5\n" \
"STMIA %1, {r3}\n" \
"STMIA %2, {r4}\n" \
: : "r" (swap), "r" (x), "r" (y) : "r3", "r4", "r5", "r6" \
);
}
void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint8_t toEdward) {
// cpmputes q = k * u
@ -192,8 +101,6 @@ void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint
uint8_t swap, kt, kw;
uint32_t A[8], AA[8], B[8], BB[8], C[8], D[8], E[8], DA[8], CB[8];
ini_curve();
// x1, z1 = u, 1
// x2, z2 = 1, 0
// x3, z3 = u, 1
@ -249,23 +156,22 @@ void X25519(uint32_t *q, uint32_t *r, uint8_t *k, uint32_t *u, uint32_t *v, uint
to_Montgomery(AA);
to_Montgomery(BB);
recover_y(A, B, C, AA, BB, x2, z2, x3, z3);
mod_inverse(C, C);
from_Montgomery(C);
mod_inverse(Curve_p, C);
mont_mul_zxy_mod_p(q, A, C, Curve_p);
mont_mul_zxy_mod_p(r, B, C, Curve_p);
if (toEdward) {
for (uint32_t i=0; i<8; i++) {
A[i] = q[i];
B[i] = r[i];
}
mont_mul_zxy0_mod_p(A, q, pR2, Curve_p);
mont_mul_zxy0_mod_p(B, r, pR2, Curve_p);
montgomery2edward(q, r, A, B);
from_Montgomery(q);
from_Montgomery(r);
}
from_Montgomery(q);
from_Montgomery(r);
} else {
// compute x-only
// compute z2^(-1) = z2^{p-2}
mod_inverse(A, z2);
mont_mul_zxy_mod_p(q, x2, A, Curve_p);
from_Montgomery(q);
from_Montgomery(z2);
mod_inverse(Curve_p, z2);
mont_mul_zxy_mod_p(q, x2, z2, Curve_p);
}
}

View File

@ -3,7 +3,7 @@ import time
# from urandom import randint
d = b'\x70\x1f\xb4\x30\x86\x55\xb4\x76\xb6\x78\x9b\x73\x25\xf9\xea\x8c\xdd\xd1\x6a\x58\x53\x3f\xf6\xd9\xe6\x00\x09\x46\x4a\x5f\x9d\x54\x00\x00\x00\x00'
u = b'\x09' + bytes(35)
u = b'\x09' + bytes(31)
v = b'\xd9\xd3\xce~\xa2\xc5\xe9)\xb2a|m~M=\x92L\xd1Hw,\xdd\x1e\xe0\xb4\x86\xa0\xb8\xa1\x19\xae \x00\x00\x00\x00'
print('Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A')