combining mul+add in one call

master
pmvr 2020-06-09 19:31:53 +02:00
parent fed427c269
commit 1390b505ce
2 changed files with 107 additions and 51 deletions

View File

@ -26,13 +26,13 @@ Python script `x25519.py`
Test vectors from https://tools.ietf.org/html/rfc8031#appendix-A
Test 1: X25519: q = d*u
Computatation time: 39 ms
Computatation time: 26 ms
q [hex/dec] = 66c7fb0d9f7090f777fa8493081ce8a4f174dbbbf9a36f16ba571206d4ddd548 46489245826987382655505058740283756869827209462947799117248009944518788765000
Test 1 passed.
Test 2: X25519 + y-coordinate recovery + transform to Edwards-curve
(x, y) = Edward(q, r), (q, r) = d*(u, v)
Computatation time: 44 ms
Computatation time: 27 ms
x [hex/dec] = 1ce7e6e3a747a25352df2d3155f06427ba389769e37755731dead2b54c5cef03 13074494971479542188989287385397236998770807488645203601973104535274459557635
y [hex/dec] = 4dd1c7c2001c147333ceedf77ebd48b1100e2a95f88cf1f40d1b74ec7279e657 35198739055214410372845858661063095427357109357427482712729161712065293444695
Test 2 passed.
@ -41,24 +41,24 @@ Python script `x25519.py`
Python script `ed25519.py`
Test 1: Length of message: 0 bytes
Computatation time: 89 ms
Computatation time: 58 ms
Test 1 passed.
Test 2: Length of message: 1 byte
Computatation time: 90 ms
Computatation time: 58 ms
Test 2 passed.
Test 3: Length of message: 2 bytes
Computatation time: 90 ms
Computatation time: 58 ms
Test 3 passed.
Test 4: Length of message: 1023 bytes
Computatation time: 98 ms
Computatation time: 67 ms
Test 4 passed.
Test 5: Length of message: 64 bytes
Computatation time: 90 ms
Computatation time: 59 ms
Test 5 passed.
## Warning
The code is not ready for production, both for security reasons and because regression tests are missing.
The source code is not ready for production, both for security reasons and because regression tests are missing.

View File

@ -70,93 +70,159 @@ void sub_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
}
// Multiply-accumulate on multi-word integers: z += x * y.
//
//   z  in/out accumulator; nine consecutive 32-bit words are read and written
//      (z[0]..z[8]).
//   x  multiplicand; eight consecutive 32-bit words are read (x[0]..x[7]).
//   y  single 32-bit multiplier.
//
// Words are processed from index 0 upward. Any carry out of the final
// addition into z[8] is not stored anywhere.
//
// NOTE(review): two signature lines follow (mul_zxy and mul_add_zxy). This
// looks like before/after residue of a rename; only one of them can remain
// for the block to compile.
void mul_zxy(uint32_t *z, uint32_t *x, uint32_t y) {
void mul_add_zxy(uint32_t *z, uint32_t *x, uint32_t y) {
// z += x*y
// Note, UMAAL is not available
// Operand mapping (see the constraint line at the end): %0 = x, %1 = y, %2 = z.
// r3 is scratch for loaded words; r5 and r6 alternate between "low word of
// the running product" and "high word carried into the next step".
// The ADDS/ADCS chain keeps the carry of the z addition in the carry flag
// from one step to the next, so it relies on the intervening LDMIA, MOV,
// UMLAL and STMIA instructions leaving that flag untouched.
__asm__ volatile (
// 0: r6:r5 = x[0]*y, then z[0] = r5 + z[0] (starts the carry chain)
"LDMIA %0!, {r3}\n"
"UMULL r5, r6, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADDS r5, r5, r3\n"
"STMIA %2!, {r5}\n"
// 1: r5:r6 = r6 + x[1]*y (r5 cleared first), then z[1] = r6 + z[1] + carry
"LDMIA %0!, {r3}\n"
"MOV r5, 0\n"
"UMLAL r6, r5, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r6, r6, r3\n"
"STMIA %2!, {r6}\n"
// 2: same as step 1 with the roles of r5 and r6 swapped
"LDMIA %0!, {r3}\n"
"MOV r6, 0\n"
"UMLAL r5, r6, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r5, r5, r3\n"
"STMIA %2!, {r5}\n"
// 3
"LDMIA %0!, {r3}\n"
"MOV r5, 0\n"
"UMLAL r6, r5, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r6, r6, r3\n"
"STMIA %2!, {r6}\n"
// 4
"LDMIA %0!, {r3}\n"
"MOV r6, 0\n"
"UMLAL r5, r6, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r5, r5, r3\n"
"STMIA %2!, {r5}\n"
// 5
"LDMIA %0!, {r3}\n"
"MOV r5, 0\n"
"UMLAL r6, r5, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r6, r6, r3\n"
"STMIA %2!, {r6}\n"
// 6
"LDMIA %0!, {r3}\n"
"MOV r6, 0\n"
"UMLAL r5, r6, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r5, r5, r3\n"
"STMIA %2!, {r5}\n"
// 7
"LDMIA %0!, {r3}\n"
"MOV r5, 0\n"
"UMLAL r6, r5, r3, %1\n"
"LDMIA %2, {r3}\n"
"ADCS r6, r6, r3\n"
"STMIA %2!, {r6}\n"
// Top word: z[8] = r5 (high word left over from step 7) + z[8] + carry.
// No write-back on this last access, so %2 ends up pointing at z[8].
"LDMIA %2, {r3}\n"
"ADCS r5, r5, r3\n"
"STMIA %2, {r5}\n"
// NOTE(review): %0 and %2 are advanced by the "!" write-back forms, memory
// behind z is written and the condition flags are changed, yet all three
// operands are declared input-only and the clobber list names only r3, r5
// and r6 (no "cc" or "memory") -- confirm the compiler cannot rely on the
// old pointer values or on cached contents of z after this statement.
: : "r" (x), "r" (y), "r" (z) : "r3", "r5", "r6"
);
}
// Adds the 64-bit product of two single words into a multi-word value:
// z += x * y, with the carry propagated through the higher words of z.
//
//   z  in/out accumulator of 32-bit words, accessed through %2.
//   x  32-bit factor (%0).
//   y  32-bit factor (%1).
//
// NOTE(review): this block holds two signature lines (mul_zx0y0 and
// mul_add_zx0y0), two constraint lines, and both "plain store / zero fill"
// and "load, add with carry, store" instruction groups. It looks like the
// before/after versions of one function interleaved; as written it is not a
// single coherent sequence and cannot compile until one variant is removed.
void mul_zx0y0(uint32_t *z, uint32_t x, uint32_t y) {
void mul_add_zx0y0(uint32_t *z, uint32_t x, uint32_t y) {
// z += x0 * y0
__asm__ volatile (
// 0: r6:r5 = x * y (r5 = low word, r6 = high word)
"UMULL r5, r6, %0, %1\n"
// Plain stores of the low and high product words (overwrite, no addition).
"STMIA %2!, {r5}\n"
"STMIA %2!, {r6}\n"
// Accumulating form: add the low product word to the word at %2 ...
"LDMIA %2, {r3}\n"
"ADDS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
// ... then the high product word plus the carry to the next word.
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r6\n"
"STMIA %2!, {r3}\n"
// r5 = 0 from here on; the ADCS steps further down add only the carry.
"MOV r5, 0\n"
// Zero fill: r6 = 0, then seven zero words are stored through %2.
"MOV r6, 0\n"
"STMIA %2!, {r5, r6}\n"
"STMIA %2!, {r5, r6}\n"
"STMIA %2!, {r5, r6}\n"
"STMIA %2, {r5}\n"
: : "r" (x), "r" (y), "r" (z) : "r5", "r6"
// Carry propagation: seven further words, each word = word + 0 + carry.
// LDMIA and STMIA sit between the ADCS steps, so the chain relies on them
// leaving the carry flag untouched.
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2!, {r3}\n"
// Last word: no write-back, and a carry out of this addition is dropped.
"LDMIA %2, {r3}\n"
"ADCS r3, r3, r5\n"
"STMIA %2, {r3}\n"
//"SUBS %2, 32\n" //*
// NOTE(review): %2 is advanced by the "!" write-back forms, memory behind z
// is written and the condition flags are changed, yet every operand is
// declared input-only and no "cc" or "memory" clobber is listed. The
// disabled SUBS above is presumably an attempt to restore %2 -- confirm.
: : "r" (x), "r" (y), "r" (z) : "r3", "r5", "r6"
);
}
void pu(uint32_t *t, uint32_t u) {
// computes (2^255 -19) * u
void pu_add_shift(uint32_t *t, uint32_t u) {
// computes t = (t + (2^255 -19) * u) >> 32
__asm__ volatile (
"MOV r3, 19\n"
"UMULL r5, r6, r3, %1\n"
"UMULL r5, r6, r3, %1\n"
"MOV r2, 0\n"
"LSRS %1, %1, 1\n"
"RRXS r2, r2\n"
"MOV r3, 0\n"
"SUBS r4, r3, r5\n"
"STMIA %0!, {r4}\n"
"SBCS r4, r3, r6\n"
"STMIA %0!, {r4}\n"
"SBCS r4, r3, r3\n"
"STMIA %0!, {r4}\n"
"STMIA %0!, {r4}\n"
"STMIA %0!, {r4}\n"
"STMIA %0!, {r4}\n"
"STMIA %0!, {r4}\n"
"SBCS r4, r2, r3\n"
"STMIA %0!, {r4}\n"
"SBCS r4, %1, r3\n"
"STMIA %0, {r4}\n"
// t[8]:t[7] + (u << 255)
"ADDS %0, %0, 28\n"
"LDMIA %0, {r3}\n"
"ADDS r3, r3, r2\n"
"STMIA %0!, {r3}\n"
"LDMIA %0, {r3}\n"
"ADCS r3, r3, %1\n"
"STMIA %0, {r3}\n"
"SUBS %0, %0, 32\n"
// t - 19*u
"MOV r2, %0\n"
"LDMIA %0!, {r4}\n"
"SUBS r4, r4, r5\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r6\n"
"STMIA r2!, {r4}\n"
"MOV r5, 0\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0!, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"LDMIA %0, {r4}\n"
"SBCS r4, r4, r5\n"
"STMIA r2!, {r4}\n"
"STMIA r2, {r5}\n" // ms-word = 0
//"SUBS %0, %0, 32\n"
: : "r" (t), "r" (u) : "r2", "r3", "r4", "r5", "r6"
);
@ -173,13 +239,8 @@ void mont_mul_zxy_mod_p(uint32_t *z, uint32_t *x, uint32_t *y, uint32_t *p) {
for (int i=0; i<9; i++) a[i] = 0;
for (int i=0; i<8; i++) {
u = (a[0] + x[i] * y[0]) * 678152731;
mul_zxy(tmp, y, x[i]);
a[8] += tmp[8] + add_zxy(a, a, tmp);
// mul_zxy(tmp, p, u);
pu(tmp, u);
a[8] += tmp[8] + add_zxy(a, a, tmp); // A <- (A + xi y + u m) / b
for (int j=0; j<8; j++) a[j] = a[j+1];
a[8] = 0;
mul_add_zxy(a, y, x[i]); // A <- A + xi y
pu_add_shift(a, u); // A <- (A + u m) / b
}
uint32_t c = sub_zxy(tmp, a, p); // carry not set if negative
for (int i=0; i<8; i++) z[i] = ret[c][i];
@ -196,13 +257,8 @@ void mont_mul_zxy0_mod_p(uint32_t *z, uint32_t *x, uint32_t y, uint32_t *p) {
for (int i=0; i<9; i++) a[i] = 0;
for (int i=0; i<8; i++) {
u = (a[0] + x[i] * y) * 678152731;
mul_zx0y0(tmp, x[i], y);
a[8] += tmp[8] + add_zxy(a, a, tmp);
// mul_zxy(tmp, p, u);
pu(tmp, u);
a[8] += tmp[8] + add_zxy(a, a, tmp); // A <- (A + xi y + u m) / b
for (int j=0; j<8; j++) a[j] = a[j+1];
a[8] = 0;
mul_add_zx0y0(a, y, x[i]); // A <- A + xi y
pu_add_shift(a, u); // A <- (A + u m) / b
}
uint32_t c = sub_zxy(tmp, a, p); // carry not set if negative
for (int i=0; i<8; i++) z[i] = ret[c][i];