From 5bcf54b9e200ea651fb11c5b26eaf8ff66305123 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 27 Sep 2019 09:41:54 +1000 Subject: [PATCH] Refix stopping use of x18 register --- wolfcrypt/src/sp_arm64.c | 806 +++++++++++++++++++-------------------- 1 file changed, 403 insertions(+), 403 deletions(-) diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 814cfc964..ab64926db 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -194,9 +194,9 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "ldp x12, x13, [%[a], 32]\n\t" "ldp x14, x15, [%[a], 48]\n\t" "ldp x16, x17, [%[b], 0]\n\t" - "ldp x18, x19, [%[b], 16]\n\t" - "ldp x20, x21, [%[b], 32]\n\t" - "ldp x22, x23, [%[b], 48]\n\t" + "ldp x19, x20, [%[b], 16]\n\t" + "ldp x21, x22, [%[b], 32]\n\t" + "ldp x23, x24, [%[b], 48]\n\t" "# A[0] * B[0]\n\t" "mul x3, x8, x16\n\t" "umulh x4, x8, x16\n\t" @@ -214,8 +214,8 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, xzr, xzr\n\t" "str x4, [%[tmp], 8]\n\t" "# A[0] * B[2]\n\t" - "mul x6, x8, x18\n\t" - "umulh x7, x8, x18\n\t" + "mul x6, x8, x19\n\t" + "umulh x7, x8, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" @@ -233,14 +233,14 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[tmp], 16]\n\t" "# A[0] * B[3]\n\t" - "mul x6, x8, x19\n\t" - "umulh x7, x8, x19\n\t" + "mul x6, x8, x20\n\t" + "umulh x7, x8, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[1] * B[2]\n\t" - "mul x6, x9, x18\n\t" - "umulh x7, x9, x18\n\t" + "mul x6, x9, x19\n\t" + "umulh x7, x9, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" @@ -258,20 +258,20 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x5, x5, xzr\n\t" "str x3, [%[tmp], 24]\n\t" "# A[0] * B[4]\n\t" - "mul x6, x8, x20\n\t" - "umulh x7, x8, x20\n\t" + "mul x6, x8, x21\n\t" + "umulh x7, x8, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[1] * B[3]\n\t" - "mul x6, x9, x19\n\t" - "umulh x7, x9, x19\n\t" + "mul x6, x9, x20\n\t" + "umulh x7, x9, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[2] * B[2]\n\t" - "mul x6, x10, x18\n\t" - "umulh x7, x10, x18\n\t" + "mul x6, x10, x19\n\t" + "umulh x7, x10, x19\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" @@ -289,26 +289,26 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, x3, xzr\n\t" "str x4, [%[tmp], 32]\n\t" "# A[0] * B[5]\n\t" - "mul x6, x8, x21\n\t" - "umulh x7, x8, x21\n\t" + "mul x6, x8, x22\n\t" + "umulh x7, x8, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[1] * B[4]\n\t" - "mul x6, x9, x20\n\t" - "umulh x7, x9, x20\n\t" + "mul x6, x9, x21\n\t" + "umulh x7, x9, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[2] * B[3]\n\t" - "mul x6, x10, x19\n\t" - "umulh x7, x10, x19\n\t" + "mul x6, x10, x20\n\t" + "umulh x7, x10, x20\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[3] * B[2]\n\t" - "mul x6, x11, x18\n\t" - "umulh x7, x11, x18\n\t" + "mul x6, x11, x19\n\t" + "umulh x7, x11, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" @@ -326,32 +326,32 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[tmp], 40]\n\t" "# A[0] * B[6]\n\t" - "mul x6, x8, x22\n\t" - "umulh x7, x8, x22\n\t" + "mul x6, x8, x23\n\t" + "umulh x7, x8, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[1] * B[5]\n\t" - "mul x6, x9, x21\n\t" - "umulh x7, x9, x21\n\t" + "mul x6, x9, x22\n\t" + "umulh x7, x9, x22\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[2] * B[4]\n\t" - "mul x6, x10, x20\n\t" - "umulh x7, x10, x20\n\t" + "mul x6, x10, x21\n\t" + "umulh x7, x10, x21\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[3] * B[3]\n\t" - "mul x6, x11, x19\n\t" - "umulh x7, x11, x19\n\t" + "mul x6, x11, x20\n\t" + "umulh x7, x11, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[4] * B[2]\n\t" - "mul x6, x12, x18\n\t" - "umulh x7, x12, x18\n\t" + "mul x6, x12, x19\n\t" + "umulh x7, x12, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" @@ -369,38 +369,38 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x5, x5, xzr\n\t" "str x3, [%[tmp], 48]\n\t" "# A[0] * B[7]\n\t" - "mul x6, x8, x23\n\t" - "umulh x7, x8, x23\n\t" + "mul x6, x8, x24\n\t" + "umulh x7, x8, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[1] * B[6]\n\t" - "mul x6, x9, x22\n\t" - "umulh x7, x9, x22\n\t" + "mul x6, x9, x23\n\t" + "umulh x7, x9, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[2] * B[5]\n\t" - "mul x6, x10, x21\n\t" - "umulh x7, x10, x21\n\t" + "mul x6, x10, x22\n\t" + "umulh x7, x10, x22\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[3] * B[4]\n\t" - "mul x6, x11, x20\n\t" - "umulh x7, x11, x20\n\t" + "mul x6, x11, x21\n\t" + "umulh x7, x11, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[4] * B[3]\n\t" - "mul x6, x12, x19\n\t" - "umulh x7, x12, x19\n\t" + "mul x6, x12, x20\n\t" + "umulh x7, x12, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[5] * B[2]\n\t" - "mul x6, x13, x18\n\t" - "umulh x7, x13, x18\n\t" + "mul x6, x13, x19\n\t" + "umulh x7, x13, x19\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" @@ -418,38 +418,38 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x3, x3, xzr\n\t" "str x4, [%[tmp], 56]\n\t" "# A[1] * B[7]\n\t" - "mul x6, x9, x23\n\t" - "umulh x7, x9, x23\n\t" + "mul x6, x9, x24\n\t" + "umulh x7, x9, x24\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[2] * B[6]\n\t" - "mul x6, x10, x22\n\t" - "umulh x7, x10, x22\n\t" + "mul x6, x10, x23\n\t" + "umulh x7, x10, x23\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[3] * B[5]\n\t" - "mul x6, x11, x21\n\t" - "umulh x7, x11, x21\n\t" + "mul x6, x11, x22\n\t" + "umulh x7, x11, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[4] * B[4]\n\t" - "mul x6, x12, x20\n\t" - "umulh x7, x12, x20\n\t" + "mul x6, x12, x21\n\t" + "umulh x7, x12, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[5] * B[3]\n\t" - "mul x6, x13, x19\n\t" - "umulh x7, x13, x19\n\t" + "mul x6, x13, x20\n\t" + "umulh x7, x13, x20\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[6] * B[2]\n\t" - "mul x6, x14, x18\n\t" - "umulh x7, x14, x18\n\t" + "mul x6, x14, x19\n\t" + "umulh x7, x14, x19\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" @@ -461,139 +461,139 @@ static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "adc x4, x4, xzr\n\t" "str x5, [%[r], 64]\n\t" "# A[2] * B[7]\n\t" - "mul x6, x10, x23\n\t" - "umulh x7, x10, x23\n\t" + "mul x6, x10, x24\n\t" + "umulh x7, x10, x24\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[3] * B[6]\n\t" - "mul x6, x11, x22\n\t" - "umulh x7, x11, x22\n\t" + "mul x6, x11, x23\n\t" + "umulh x7, x11, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[4] * B[5]\n\t" - "mul x6, x12, x21\n\t" - "umulh x7, x12, x21\n\t" + "mul x6, x12, x22\n\t" + "umulh x7, x12, x22\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[5] * B[4]\n\t" - "mul x6, x13, x20\n\t" - "umulh x7, x13, x20\n\t" + "mul x6, x13, x21\n\t" + "umulh x7, x13, x21\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[6] * B[3]\n\t" - "mul x6, x14, x19\n\t" - "umulh x7, x14, x19\n\t" + "mul x6, x14, x20\n\t" + "umulh x7, x14, x20\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[7] * B[2]\n\t" - "mul x6, x15, x18\n\t" - "umulh x7, x15, x18\n\t" + "mul x6, x15, x19\n\t" + "umulh x7, x15, x19\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "str x3, [%[r], 72]\n\t" "# A[3] * B[7]\n\t" - "mul x6, x11, x23\n\t" - "umulh x7, x11, x23\n\t" + "mul x6, x11, x24\n\t" + "umulh x7, x11, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[4] * B[6]\n\t" - "mul x6, x12, x22\n\t" - "umulh x7, x12, x22\n\t" + "mul x6, x12, x23\n\t" + "umulh x7, x12, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[5] * B[5]\n\t" - "mul x6, x13, x21\n\t" - "umulh x7, x13, x21\n\t" + "mul x6, x13, x22\n\t" + "umulh x7, x13, x22\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[6] * B[4]\n\t" - "mul x6, x14, x20\n\t" - "umulh x7, x14, x20\n\t" + "mul x6, x14, x21\n\t" + "umulh x7, x14, x21\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "# A[7] * B[3]\n\t" - "mul x6, x15, x19\n\t" - "umulh x7, x15, x19\n\t" + "mul x6, x15, x20\n\t" + "umulh x7, x15, x20\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 80]\n\t" "# A[4] * B[7]\n\t" - "mul x6, x12, x23\n\t" - "umulh x7, x12, x23\n\t" + "mul x6, x12, x24\n\t" + "umulh x7, x12, x24\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, xzr, xzr\n\t" "# A[5] * B[6]\n\t" - "mul x6, x13, x22\n\t" - "umulh x7, x13, x22\n\t" + "mul x6, x13, x23\n\t" + "umulh x7, x13, x23\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[6] * B[5]\n\t" - "mul x6, x14, x21\n\t" - "umulh x7, x14, x21\n\t" + "mul x6, x14, x22\n\t" + "umulh x7, x14, x22\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "# A[7] * B[4]\n\t" - "mul x6, x15, x20\n\t" - "umulh x7, x15, x20\n\t" + "mul x6, x15, x21\n\t" + "umulh x7, x15, x21\n\t" "adds x5, x5, x6\n\t" "adcs x3, x3, x7\n\t" "adc x4, x4, xzr\n\t" "str x5, [%[r], 88]\n\t" "# A[5] * B[7]\n\t" - "mul x6, x13, x23\n\t" - "umulh x7, x13, x23\n\t" + "mul x6, x13, x24\n\t" + "umulh x7, x13, x24\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, xzr, xzr\n\t" "# A[6] * B[6]\n\t" - "mul x6, x14, x22\n\t" - "umulh x7, x14, x22\n\t" + "mul x6, x14, x23\n\t" + "umulh x7, x14, x23\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "# A[7] * B[5]\n\t" - "mul x6, x15, x21\n\t" - "umulh x7, x15, x21\n\t" + "mul x6, x15, x22\n\t" + "umulh x7, x15, x22\n\t" "adds x3, x3, x6\n\t" "adcs x4, x4, x7\n\t" "adc x5, x5, xzr\n\t" "str x3, [%[r], 96]\n\t" "# A[6] * B[7]\n\t" - "mul x6, x14, x23\n\t" - "umulh x7, x14, x23\n\t" + "mul x6, x14, x24\n\t" + "umulh x7, x14, x24\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, xzr, xzr\n\t" "# A[7] * B[6]\n\t" - "mul x6, x15, x22\n\t" - "umulh x7, x15, x22\n\t" + "mul x6, x15, x23\n\t" + "umulh x7, x15, x23\n\t" "adds x4, x4, x6\n\t" "adcs x5, x5, x7\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 104]\n\t" "# A[7] * B[7]\n\t" - "mul x6, x15, x23\n\t" - "umulh x7, x15, x23\n\t" + "mul x6, x15, x24\n\t" + "umulh x7, x15, x24\n\t" "adds x5, x5, x6\n\t" "adc x3, x3, x7\n\t" "stp x5, x3, [%[r], 112]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24" ); XMEMCPY(r, tmp, sizeof(tmp)); @@ -2352,11 +2352,11 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -2414,8 +2414,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2423,8 +2423,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2432,8 +2432,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2441,8 +2441,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2450,8 +2450,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2459,8 +2459,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2468,8 +2468,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2477,8 +2477,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -2486,8 +2486,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -2495,8 +2495,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, %[ca]\n\t" "cset %[ca], cs\n\t" @@ -2515,7 +2515,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca); @@ -3537,11 +3537,11 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -3599,8 +3599,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3608,8 +3608,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3617,8 +3617,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3626,8 +3626,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3635,8 +3635,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3644,8 +3644,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3653,8 +3653,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3662,8 +3662,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3671,8 +3671,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -3680,8 +3680,8 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -3860,7 +3860,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca); @@ -6770,8 +6770,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "ldp x12, x13, [%[a], 16]\n\t" "ldp x14, x15, [%[a], 32]\n\t" "ldp x16, x17, [%[a], 48]\n\t" - "ldp x18, x19, [%[a], 64]\n\t" - "ldp x20, x21, [%[a], 80]\n\t" + "ldp x19, x20, [%[a], 64]\n\t" + "ldp x21, x22, [%[a], 80]\n\t" "# A[0] * A[0]\n\t" "mul x2, x10, x10\n\t" "umulh x3, x10, x10\n\t" @@ -6932,8 +6932,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[tmp], 56]\n\t" "# A[0] * A[8]\n\t" - "mul x5, x10, x18\n\t" - "umulh x6, x10, x18\n\t" + "mul x5, x10, x19\n\t" + "umulh x6, x10, x19\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[7]\n\t" @@ -6968,13 +6968,13 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[tmp], 64]\n\t" "# A[0] * A[9]\n\t" - "mul x5, x10, x19\n\t" - "umulh x6, x10, x19\n\t" + "mul x5, x10, x20\n\t" + "umulh x6, x10, x20\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[8]\n\t" - "mul x8, x11, x18\n\t" - "umulh x9, x11, x18\n\t" + "mul x8, x11, x19\n\t" + "umulh x9, x11, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7004,19 +7004,19 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[tmp], 72]\n\t" "# A[0] * A[10]\n\t" - "mul x5, x10, x20\n\t" - "umulh x6, x10, x20\n\t" + "mul x5, x10, x21\n\t" + "umulh x6, x10, x21\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[9]\n\t" - "mul x8, x11, x19\n\t" - "umulh x9, x11, x19\n\t" + "mul x8, x11, x20\n\t" + "umulh x9, x11, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[2] * A[8]\n\t" - "mul x8, x12, x18\n\t" - "umulh x9, x12, x18\n\t" + "mul x8, x12, x19\n\t" + "umulh x9, x12, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7046,25 +7046,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[tmp], 80]\n\t" "# A[0] * A[11]\n\t" - "mul x5, x10, x21\n\t" - "umulh x6, x10, x21\n\t" + "mul x5, x10, x22\n\t" + "umulh x6, x10, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[1] * A[10]\n\t" - "mul x8, x11, x20\n\t" - "umulh x9, x11, x20\n\t" + "mul x8, x11, x21\n\t" + "umulh x9, x11, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[2] * A[9]\n\t" - "mul x8, x12, x19\n\t" - "umulh x9, x12, x19\n\t" + "mul x8, x12, x20\n\t" + "umulh x9, x12, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[3] * A[8]\n\t" - "mul x8, x13, x18\n\t" - "umulh x9, x13, x18\n\t" + "mul x8, x13, x19\n\t" + "umulh x9, x13, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7088,25 +7088,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[tmp], 88]\n\t" "# A[1] * A[11]\n\t" - "mul x5, x11, x21\n\t" - "umulh x6, x11, x21\n\t" + "mul x5, x11, x22\n\t" + "umulh x6, x11, x22\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[2] * A[10]\n\t" - "mul x8, x12, x20\n\t" - "umulh x9, x12, x20\n\t" + "mul x8, x12, x21\n\t" + "umulh x9, x12, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[3] * A[9]\n\t" - "mul x8, x13, x19\n\t" - "umulh x9, x13, x19\n\t" + "mul x8, x13, x20\n\t" + "umulh x9, x13, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[4] * A[8]\n\t" - "mul x8, x14, x18\n\t" - "umulh x9, x14, x18\n\t" + "mul x8, x14, x19\n\t" + "umulh x9, x14, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7130,25 +7130,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[r], 96]\n\t" "# A[2] * A[11]\n\t" - "mul x5, x12, x21\n\t" - "umulh x6, x12, x21\n\t" + "mul x5, x12, x22\n\t" + "umulh x6, x12, x22\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[3] * A[10]\n\t" - "mul x8, x13, x20\n\t" - "umulh x9, x13, x20\n\t" + "mul x8, x13, x21\n\t" + "umulh x9, x13, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[4] * A[9]\n\t" - "mul x8, x14, x19\n\t" - "umulh x9, x14, x19\n\t" + "mul x8, x14, x20\n\t" + "umulh x9, x14, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[5] * A[8]\n\t" - "mul x8, x15, x18\n\t" - "umulh x9, x15, x18\n\t" + "mul x8, x15, x19\n\t" + "umulh x9, x15, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7166,25 +7166,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[r], 104]\n\t" "# A[3] * A[11]\n\t" - "mul x5, x13, x21\n\t" - "umulh x6, x13, x21\n\t" + "mul x5, x13, x22\n\t" + "umulh x6, x13, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[4] * A[10]\n\t" - "mul x8, x14, x20\n\t" - "umulh x9, x14, x20\n\t" + "mul x8, x14, x21\n\t" + "umulh x9, x14, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[5] * A[9]\n\t" - "mul x8, x15, x19\n\t" - "umulh x9, x15, x19\n\t" + "mul x8, x15, x20\n\t" + "umulh x9, x15, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[6] * A[8]\n\t" - "mul x8, x16, x18\n\t" - "umulh x9, x16, x18\n\t" + "mul x8, x16, x19\n\t" + "umulh x9, x16, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7202,25 +7202,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[r], 112]\n\t" "# A[4] * A[11]\n\t" - "mul x5, x14, x21\n\t" - "umulh x6, x14, x21\n\t" + "mul x5, x14, x22\n\t" + "umulh x6, x14, x22\n\t" "mov x4, 0\n\t" "mov x7, 0\n\t" "# A[5] * A[10]\n\t" - "mul x8, x15, x20\n\t" - "umulh x9, x15, x20\n\t" + "mul x8, x15, x21\n\t" + "umulh x9, x15, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[6] * A[9]\n\t" - "mul x8, x16, x19\n\t" - "umulh x9, x16, x19\n\t" + "mul x8, x16, x20\n\t" + "umulh x9, x16, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[7] * A[8]\n\t" - "mul x8, x17, x18\n\t" - "umulh x9, x17, x18\n\t" + "mul x8, x17, x19\n\t" + "umulh x9, x17, x19\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7232,25 +7232,25 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, x7\n\t" "str x2, [%[r], 120]\n\t" "# A[5] * A[11]\n\t" - "mul x5, x15, x21\n\t" - "umulh x6, x15, x21\n\t" + "mul x5, x15, x22\n\t" + "umulh x6, x15, x22\n\t" "mov x2, 0\n\t" "mov x7, 0\n\t" "# A[6] * A[10]\n\t" - "mul x8, x16, x20\n\t" - "umulh x9, x16, x20\n\t" + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[7] * A[9]\n\t" - "mul x8, x17, x19\n\t" - "umulh x9, x17, x19\n\t" + "mul x8, x17, x20\n\t" + "umulh x9, x17, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[8] * A[8]\n\t" - "mul x8, x18, x18\n\t" - "umulh x9, x18, x18\n\t" + "mul x8, x19, x19\n\t" + "umulh x9, x19, x19\n\t" "adds x5, x5, x5\n\t" "adcs x6, x6, x6\n\t" "adc x7, x7, x7\n\t" @@ -7262,19 +7262,19 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, x7\n\t" "str x3, [%[r], 128]\n\t" "# A[6] * A[11]\n\t" - "mul x5, x16, x21\n\t" - "umulh x6, x16, x21\n\t" + "mul x5, x16, x22\n\t" + "umulh x6, x16, x22\n\t" "mov x3, 0\n\t" "mov x7, 0\n\t" "# A[7] * A[10]\n\t" - "mul x8, x17, x20\n\t" - "umulh x9, x17, x20\n\t" + "mul x8, x17, x21\n\t" + "umulh x9, x17, x21\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" "# A[8] * A[9]\n\t" - "mul x8, x18, x19\n\t" - "umulh x9, x18, x19\n\t" + "mul x8, x19, x20\n\t" + "umulh x9, x19, x20\n\t" "adds x5, x5, x8\n\t" "adcs x6, x6, x9\n\t" "adc x7, x7, xzr\n\t" @@ -7286,8 +7286,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x3, x3, x7\n\t" "str x4, [%[r], 136]\n\t" "# A[7] * A[11]\n\t" - "mul x8, x17, x21\n\t" - "umulh x9, x17, x21\n\t" + "mul x8, x17, x22\n\t" + "umulh x9, x17, x22\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, xzr, xzr\n\t" @@ -7295,8 +7295,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "# A[8] * A[10]\n\t" - "mul x8, x18, x20\n\t" - "umulh x9, x18, x20\n\t" + "mul x8, x19, x21\n\t" + "umulh x9, x19, x21\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" @@ -7304,15 +7304,15 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "# A[9] * A[9]\n\t" - "mul x8, x19, x19\n\t" - "umulh x9, x19, x19\n\t" + "mul x8, x20, x20\n\t" + "umulh x9, x20, x20\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, x4, xzr\n\t" "str x2, [%[r], 144]\n\t" "# A[8] * A[11]\n\t" - "mul x8, x18, x21\n\t" - "umulh x9, x18, x21\n\t" + "mul x8, x19, x22\n\t" + "umulh x9, x19, x22\n\t" "adds x3, x3, x8\n\t" "adcs x4, x4, x9\n\t" "adc x2, xzr, xzr\n\t" @@ -7320,8 +7320,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x4, x4, x9\n\t" "adc x2, x2, xzr\n\t" "# A[9] * A[10]\n\t" - "mul x8, x19, x20\n\t" - "umulh x9, x19, x20\n\t" + "mul x8, x20, x21\n\t" + "umulh x9, x20, x21\n\t" "adds x3, x3, x8\n\t" "adcs x4, x4, x9\n\t" "adc x2, x2, xzr\n\t" @@ -7330,8 +7330,8 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x2, x2, xzr\n\t" "str x3, [%[r], 152]\n\t" "# A[9] * A[11]\n\t" - "mul x8, x19, x21\n\t" - "umulh x9, x19, x21\n\t" + "mul x8, x20, x22\n\t" + "umulh x9, x20, x22\n\t" "adds x4, x4, x8\n\t" "adcs x2, x2, x9\n\t" "adc x3, xzr, xzr\n\t" @@ -7339,15 +7339,15 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adcs x2, x2, x9\n\t" "adc x3, x3, xzr\n\t" "# A[10] * A[10]\n\t" - "mul x8, x20, x20\n\t" - "umulh x9, x20, x20\n\t" + "mul x8, x21, x21\n\t" + "umulh x9, x21, x21\n\t" "adds x4, x4, x8\n\t" "adcs x2, x2, x9\n\t" "adc x3, x3, xzr\n\t" "str x4, [%[r], 160]\n\t" "# A[10] * A[11]\n\t" - "mul x8, x20, x21\n\t" - "umulh x9, x20, x21\n\t" + "mul x8, x21, x22\n\t" + "umulh x9, x21, x22\n\t" "adds x2, x2, x8\n\t" "adcs x3, x3, x9\n\t" "adc x4, xzr, xzr\n\t" @@ -7356,14 +7356,14 @@ static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) "adc x4, x4, xzr\n\t" "str x2, [%[r], 168]\n\t" "# A[11] * A[11]\n\t" - "mul x8, x21, x21\n\t" - "umulh x9, x21, x21\n\t" + "mul x8, x22, x22\n\t" + "umulh x9, x22, x22\n\t" "adds x3, x3, x8\n\t" "adc x4, x4, x9\n\t" "stp x3, x4, [%[r], 176]\n\t" : : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21" + : "memory", "x2", "x3", "x4", "x8", "x9", "x10", "x5", "x6", "x7", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" ); XMEMCPY(r, tmp, sizeof(tmp)); @@ -9129,11 +9129,11 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -9191,8 +9191,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9200,8 +9200,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9209,8 +9209,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9218,8 +9218,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9227,8 +9227,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9236,8 +9236,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9245,8 +9245,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9254,8 +9254,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9263,8 +9263,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -9272,8 +9272,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -9372,7 +9372,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca); @@ -10610,11 +10610,11 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "ldp x12, x13, [%[m], 0]\n\t" "ldp x14, x15, [%[m], 16]\n\t" "ldp x16, x17, [%[m], 32]\n\t" - "ldp x18, x19, [%[m], 48]\n\t" - "ldp x20, x21, [%[m], 64]\n\t" - "ldp x22, x23, [%[m], 80]\n\t" - "ldp x24, x25, [%[m], 96]\n\t" - "ldp x26, x27, [%[m], 112]\n\t" + "ldp x19, x20, [%[m], 48]\n\t" + "ldp x21, x22, [%[m], 64]\n\t" + "ldp x23, x24, [%[m], 80]\n\t" + "ldp x25, x26, [%[m], 96]\n\t" + "ldp x27, x28, [%[m], 112]\n\t" "# i = 0\n\t" "mov x3, 0\n\t" "ldp x10, x11, [%[a], 0]\n\t" @@ -10672,8 +10672,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+6] += m[6] * mu\n\t" "ldr x9, [%[a], 48]\n\t" - "mul x6, x18, x8\n\t" - "umulh x7, x18, x8\n\t" + "mul x6, x19, x8\n\t" + "umulh x7, x19, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10681,8 +10681,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+7] += m[7] * mu\n\t" "ldr x9, [%[a], 56]\n\t" - "mul x6, x19, x8\n\t" - "umulh x7, x19, x8\n\t" + "mul x6, x20, x8\n\t" + "umulh x7, x20, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10690,8 +10690,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+8] += m[8] * mu\n\t" "ldr x9, [%[a], 64]\n\t" - "mul x6, x20, x8\n\t" - "umulh x7, x20, x8\n\t" + "mul x6, x21, x8\n\t" + "umulh x7, x21, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10699,8 +10699,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+9] += m[9] * mu\n\t" "ldr x9, [%[a], 72]\n\t" - "mul x6, x21, x8\n\t" - "umulh x7, x21, x8\n\t" + "mul x6, x22, x8\n\t" + "umulh x7, x22, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10708,8 +10708,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+10] += m[10] * mu\n\t" "ldr x9, [%[a], 80]\n\t" - "mul x6, x22, x8\n\t" - "umulh x7, x22, x8\n\t" + "mul x6, x23, x8\n\t" + "umulh x7, x23, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10717,8 +10717,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+11] += m[11] * mu\n\t" "ldr x9, [%[a], 88]\n\t" - "mul x6, x23, x8\n\t" - "umulh x7, x23, x8\n\t" + "mul x6, x24, x8\n\t" + "umulh x7, x24, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10726,8 +10726,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+12] += m[12] * mu\n\t" "ldr x9, [%[a], 96]\n\t" - "mul x6, x24, x8\n\t" - "umulh x7, x24, x8\n\t" + "mul x6, x25, x8\n\t" + "umulh x7, x25, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10735,8 +10735,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+13] += m[13] * mu\n\t" "ldr x9, [%[a], 104]\n\t" - "mul x6, x25, x8\n\t" - "umulh x7, x25, x8\n\t" + "mul x6, x26, x8\n\t" + "umulh x7, x26, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -10744,8 +10744,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x4, x4, xzr\n\t" "# a[i+14] += m[14] * mu\n\t" "ldr x9, [%[a], 112]\n\t" - "mul x6, x26, x8\n\t" - "umulh x7, x26, x8\n\t" + "mul x6, x27, x8\n\t" + "umulh x7, x27, x8\n\t" "adds x9, x9, x6\n\t" "adc x5, x7, xzr\n\t" "adds x9, x9, x4\n\t" @@ -10753,8 +10753,8 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "adc x5, x5, xzr\n\t" "# a[i+15] += m[15] * mu\n\t" "ldr x9, [%[a], 120]\n\t" - "mul x6, x27, x8\n\t" - "umulh x7, x27, x8\n\t" + "mul x6, x28, x8\n\t" + "umulh x7, x28, x8\n\t" "adds x9, x9, x6\n\t" "adc x4, x7, xzr\n\t" "adds x9, x9, x5\n\t" @@ -11093,7 +11093,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "str x11, [%[a], 8]\n\t" : [ca] "+r" (ca), [a] "+r" (a) : [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca); @@ -13535,102 +13535,102 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const (void)mp; __asm__ __volatile__ ( - "ldr x18, [%[a], 0]\n\t" - "ldr x19, [%[a], 8]\n\t" - "ldr x20, [%[a], 16]\n\t" - "ldr x21, [%[a], 24]\n\t" - "ldr x22, [%[b], 0]\n\t" - "ldr x23, [%[b], 8]\n\t" - "ldr x24, [%[b], 16]\n\t" - "ldr x25, [%[b], 24]\n\t" + "ldr x19, [%[a], 0]\n\t" + "ldr x20, [%[a], 8]\n\t" + "ldr x21, [%[a], 16]\n\t" + "ldr x22, [%[a], 24]\n\t" + "ldr x23, [%[b], 0]\n\t" + "ldr x24, [%[b], 8]\n\t" + "ldr x25, [%[b], 16]\n\t" + "ldr x26, [%[b], 24]\n\t" "# A[0] * B[0]\n\t" - "mul x10, x18, x22\n\t" - "umulh x11, x18, x22\n\t" + "mul x10, x19, x23\n\t" + "umulh x11, x19, x23\n\t" "# A[0] * B[1]\n\t" - "mul x5, x18, x23\n\t" - "umulh x6, x18, x23\n\t" + "mul x5, x19, x24\n\t" + "umulh x6, x19, x24\n\t" "adds x11, x11, x5\n\t" "adc x12, xzr, x6\n\t" "# A[1] * B[0]\n\t" - "mul x5, x19, x22\n\t" - "umulh x6, x19, x22\n\t" + "mul x5, x20, x23\n\t" + "umulh x6, x20, x23\n\t" "adds x11, x11, x5\n\t" "adcs x12, x12, x6\n\t" "adc x13, xzr, xzr\n\t" "# A[0] * B[2]\n\t" - "mul x5, x18, x24\n\t" - "umulh x6, x18, x24\n\t" + "mul x5, x19, x25\n\t" + "umulh x6, x19, x25\n\t" "adds x12, x12, x5\n\t" "adc x13, x13, x6\n\t" "# A[1] * B[1]\n\t" - "mul x5, x19, x23\n\t" - "umulh x6, x19, x23\n\t" + "mul x5, x20, x24\n\t" + "umulh x6, x20, x24\n\t" "adds x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adc x14, xzr, xzr\n\t" "# A[2] * B[0]\n\t" - "mul x5, x20, x22\n\t" - "umulh x6, x20, x22\n\t" + "mul x5, x21, x23\n\t" + "umulh x6, x21, x23\n\t" "adds x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adc x14, x14, xzr\n\t" "# A[0] * B[3]\n\t" - "mul x5, x18, x25\n\t" - "umulh x6, x18, x25\n\t" + "mul x5, x19, x26\n\t" + "umulh x6, x19, x26\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, xzr, xzr\n\t" "# A[1] * B[2]\n\t" - "mul x5, x19, x24\n\t" - "umulh x6, x19, x24\n\t" + "mul x5, x20, x25\n\t" + "umulh x6, x20, x25\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[2] * B[1]\n\t" - "mul x5, x20, x23\n\t" - "umulh x6, x20, x23\n\t" + "mul x5, x21, x24\n\t" + "umulh x6, x21, x24\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[3] * B[0]\n\t" - "mul x5, x21, x22\n\t" - "umulh x6, x21, x22\n\t" + "mul x5, x22, x23\n\t" + "umulh x6, x22, x23\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, x15, xzr\n\t" "# A[1] * B[3]\n\t" - "mul x5, x19, x25\n\t" - "umulh x6, x19, x25\n\t" + "mul x5, x20, x26\n\t" + "umulh x6, x20, x26\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, xzr, xzr\n\t" "# A[2] * B[2]\n\t" - "mul x5, x20, x24\n\t" - "umulh x6, x20, x24\n\t" + "mul x5, x21, x25\n\t" + "umulh x6, x21, x25\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, x16, xzr\n\t" "# A[3] * B[1]\n\t" - "mul x5, x21, x23\n\t" - "umulh x6, x21, x23\n\t" + "mul x5, x22, x24\n\t" + "umulh x6, x22, x24\n\t" "adds x14, x14, x5\n\t" "adcs x15, x15, x6\n\t" "adc x16, x16, xzr\n\t" "# A[2] * B[3]\n\t" - "mul x5, x20, x25\n\t" - "umulh x6, x20, x25\n\t" + "mul x5, x21, x26\n\t" + "umulh x6, x21, x26\n\t" "adds x15, x15, x5\n\t" "adcs x16, x16, x6\n\t" "adc x17, xzr, xzr\n\t" "# A[3] * B[2]\n\t" - "mul x5, x21, x24\n\t" - "umulh x6, x21, x24\n\t" + "mul x5, x22, x25\n\t" + "umulh x6, x22, x25\n\t" "adds x15, x15, x5\n\t" "adcs x16, x16, x6\n\t" "adc x17, x17, xzr\n\t" "# A[3] * B[3]\n\t" - "mul x5, x21, x25\n\t" - "umulh x6, x21, x25\n\t" + "mul x5, x22, x26\n\t" + "umulh x6, x22, x26\n\t" "adds x16, x16, x5\n\t" "adc x17, x17, x6\n\t" "# Start Reduction\n\t" @@ -13645,12 +13645,12 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const "add x8, x8, x10\n\t" "# a[0]-a[2] << 32\n\t" "lsl x10, x10, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x11, x6, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x12, x7, 32\n\t" - "eor x11, x11, x18\n\t" - "eor x12, x12, x19\n\t" + "eor x11, x11, x19\n\t" + "eor x12, x12, x20\n\t" "# - a[0] << 32 << 192\n\t" "sub x8, x8, x10\n\t" "# + a[0]-a[2] << 32 << 64\n\t" @@ -13670,47 +13670,47 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const "adcs x15, x15, x7\n\t" "adcs x16, x16, x8\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# mu <<= 32\n\t" "lsr x9, x8, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x5, x5, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x6, x6, 32\n\t" - "lsr x20, x7, 32\n\t" + "lsr x21, x7, 32\n\t" "lsl x7, x7, 32\n\t" "lsl x8, x8, 32\n\t" - "eor x6, x6, x18\n\t" - "eor x7, x7, x19\n\t" - "eor x8, x8, x20\n\t" + "eor x6, x6, x19\n\t" + "eor x7, x7, x20\n\t" + "eor x8, x8, x21\n\t" "# a += (mu << 32) << 64\n\t" "adds x13, x13, x7\n\t" "adcs x14, x14, x8\n\t" "adcs x15, x15, x9\n\t" "adcs x16, x16, xzr\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# a -= (mu << 32) << 192\n\t" "subs x13, x13, x5\n\t" - "mov x18, 0xffffffff\n\t" + "mov x19, 0xffffffff\n\t" "sbcs x14, x14, x6\n\t" - "mov x19, 0xffffffff00000001\n\t" + "mov x20, 0xffffffff00000001\n\t" "sbcs x15, x15, x7\n\t" "sbcs x16, x16, x8\n\t" "sbcs x17, x17, x9\n\t" - "cset x20, cc\n\t" - "add x10, x10, x20\n\t" + "cset x21, cc\n\t" + "add x10, x10, x21\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "and x18, x18, x10\n\t" - "# m[2] = 0 & mask = 0\n\t" "and x19, x19, x10\n\t" + "# m[2] = 0 & mask = 0\n\t" + "and x20, x20, x10\n\t" "subs x14, x14, x10\n\t" - "sbcs x15, x15, x18\n\t" + "sbcs x15, x15, x19\n\t" "sbcs x16, x16, xzr\n\t" - "sbc x17, x17, x19\n\t" + "sbc x17, x17, x20\n\t" "str x14, [%[r], 0]\n\t" "str x15, [%[r], 8]\n\t" "str x16, [%[r], 16]\n\t" @@ -13718,8 +13718,8 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const : [m] "+r" (m), [a] "+r" (a), [b] "+r" (b) : [r] "r" (r) : "memory", "x5", "x6", "x7", "x8", "x9", - "x18", "x19", "x20", "x21", - "x22", "x23", "x24", "x25", + "x19", "x20", "x21", "x22", + "x23", "x24", "x25", "x26", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); } @@ -13735,37 +13735,37 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit mp) { __asm__ __volatile__ ( - "ldr x18, [%[a], 0]\n\t" - "ldr x19, [%[a], 8]\n\t" - "ldr x20, [%[a], 16]\n\t" - "ldr x21, [%[a], 24]\n\t" + "ldr x19, [%[a], 0]\n\t" + "ldr x20, [%[a], 8]\n\t" + "ldr x21, [%[a], 16]\n\t" + "ldr x22, [%[a], 24]\n\t" "# A[0] * A[1]\n\t" - "mul x11, x18, x19\n\t" - "umulh x12, x18, x19\n\t" + "mul x11, x19, x20\n\t" + "umulh x12, x19, x20\n\t" "# A[0] * A[2]\n\t" - "mul x5, x18, x20\n\t" - "umulh x6, x18, x20\n\t" + "mul x5, x19, x21\n\t" + "umulh x6, x19, x21\n\t" "adds x12, x12, x5\n\t" "adc x13, xzr, x6\n\t" "# A[0] * A[3]\n\t" - "mul x5, x18, x21\n\t" - "umulh x6, x18, x21\n\t" + "mul x5, x19, x22\n\t" + "umulh x6, x19, x22\n\t" "adds x13, x13, x5\n\t" "adc x14, xzr, x6\n\t" "# A[1] * A[2]\n\t" - "mul x5, x19, x20\n\t" - "umulh x6, x19, x20\n\t" + "mul x5, x20, x21\n\t" + "umulh x6, x20, x21\n\t" "adds x13, x13, x5\n\t" "adcs x14, x14, x6\n\t" "adc x15, xzr, xzr\n\t" "# A[1] * A[3]\n\t" - "mul x5, x19, x21\n\t" - "umulh x6, x19, x21\n\t" + "mul x5, x20, x22\n\t" + "umulh x6, x20, x22\n\t" "adds x14, x14, x5\n\t" "adc x15, x15, x6\n\t" "# A[2] * A[3]\n\t" - "mul x5, x20, x21\n\t" - "umulh x6, x20, x21\n\t" + "mul x5, x21, x22\n\t" + "umulh x6, x21, x22\n\t" "adds x15, x15, x5\n\t" "adc x16, xzr, x6\n\t" "# Double\n\t" @@ -13777,24 +13777,24 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const "adcs x16, x16, x16\n\t" "cset x17, cs\n\t" "# A[0] * A[0]\n\t" - "mul x10, x18, x18\n\t" - "umulh x4, x18, x18\n\t" + "mul x10, x19, x19\n\t" + "umulh x4, x19, x19\n\t" "# A[1] * A[1]\n\t" - "mul x5, x19, x19\n\t" - "umulh x6, x19, x19\n\t" + "mul x5, x20, x20\n\t" + "umulh x6, x20, x20\n\t" "# A[2] * A[2]\n\t" - "mul x7, x20, x20\n\t" - "umulh x8, x20, x20\n\t" + "mul x7, x21, x21\n\t" + "umulh x8, x21, x21\n\t" "# A[3] * A[3]\n\t" - "mul x9, x21, x21\n\t" - "umulh x18, x21, x21\n\t" + "mul x9, x22, x22\n\t" + "umulh x19, x22, x22\n\t" "adds x11, x11, x4\n\t" "adcs x12, x12, x5\n\t" "adcs x13, x13, x6\n\t" "adcs x14, x14, x7\n\t" "adcs x15, x15, x8\n\t" "adcs x16, x16, x9\n\t" - "adc x17, x17, x18\n\t" + "adc x17, x17, x19\n\t" "# Start Reduction\n\t" "mov x5, x10\n\t" "mov x6, x11\n\t" @@ -13807,12 +13807,12 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const "add x8, x8, x10\n\t" "# a[0]-a[2] << 32\n\t" "lsl x10, x10, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x11, x6, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x12, x7, 32\n\t" - "eor x11, x11, x18\n\t" - "eor x12, x12, x19\n\t" + "eor x11, x11, x19\n\t" + "eor x12, x12, x20\n\t" "# - a[0] << 32 << 192\n\t" "sub x8, x8, x10\n\t" "# + a[0]-a[2] << 32 << 64\n\t" @@ -13832,47 +13832,47 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const "adcs x15, x15, x7\n\t" "adcs x16, x16, x8\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# mu <<= 32\n\t" "lsr x9, x8, 32\n\t" - "lsr x18, x5, 32\n\t" + "lsr x19, x5, 32\n\t" "lsl x5, x5, 32\n\t" - "lsr x19, x6, 32\n\t" + "lsr x20, x6, 32\n\t" "lsl x6, x6, 32\n\t" - "lsr x20, x7, 32\n\t" + "lsr x21, x7, 32\n\t" "lsl x7, x7, 32\n\t" "lsl x8, x8, 32\n\t" - "eor x6, x6, x18\n\t" - "eor x7, x7, x19\n\t" - "eor x8, x8, x20\n\t" + "eor x6, x6, x19\n\t" + "eor x7, x7, x20\n\t" + "eor x8, x8, x21\n\t" "# a += (mu << 32) << 64\n\t" "adds x13, x13, x7\n\t" "adcs x14, x14, x8\n\t" "adcs x15, x15, x9\n\t" "adcs x16, x16, xzr\n\t" "adcs x17, x17, xzr\n\t" - "csetm x20, cs\n\t" - "add x10, x10, x20\n\t" + "csetm x21, cs\n\t" + "add x10, x10, x21\n\t" "# a -= (mu << 32) << 192\n\t" "subs x13, x13, x5\n\t" - "mov x18, 0xffffffff\n\t" + "mov x19, 0xffffffff\n\t" "sbcs x14, x14, x6\n\t" - "mov x19, 0xffffffff00000001\n\t" + "mov x20, 0xffffffff00000001\n\t" "sbcs x15, x15, x7\n\t" "sbcs x16, x16, x8\n\t" "sbcs x17, x17, x9\n\t" - "cset x20, cc\n\t" - "add x10, x10, x20\n\t" + "cset x21, cc\n\t" + "add x10, x10, x21\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "and x18, x18, x10\n\t" - "# m[2] = 0 & mask = 0\n\t" "and x19, x19, x10\n\t" + "# m[2] = 0 & mask = 0\n\t" + "and x20, x20, x10\n\t" "subs x14, x14, x10\n\t" - "sbcs x15, x15, x18\n\t" + "sbcs x15, x15, x19\n\t" "sbcs x16, x16, xzr\n\t" - "sbc x17, x17, x19\n\t" + "sbc x17, x17, x20\n\t" "str x14, [%[r], 0]\n\t" "str x15, [%[r], 8]\n\t" "str x16, [%[r], 16]\n\t" @@ -13880,7 +13880,7 @@ SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const : [m] "+r" (m), [a] "+r" (a), [mp] "+r" (mp) : [r] "r" (r) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", - "x18", "x19", "x20", "x21", + "x19", "x20", "x21", "x22", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); }