mirror of https://github.com/wolfSSL/wolfssl.git
SP ARM64 - use fewer registers in mont_reduces
parent
ebb490204a
commit
da5d9a923b
|
@ -2503,7 +2503,6 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
|
|||
"ldp x23, x24, [%[m], 64]\n\t"
|
||||
"ldp x25, x26, [%[m], 80]\n\t"
|
||||
"ldp x27, x28, [%[m], 96]\n\t"
|
||||
- "ldp x29, x30, [%[m], 112]\n\t"
|
||||
"# i = 16\n\t"
|
||||
"mov x4, 16\n\t"
|
||||
"ldp x12, x13, [%[a], 0]\n\t"
|
||||
|
@ -2628,19 +2627,21 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
|
|||
"# a[i+14] += m[14] * mu\n\t"
|
||||
"ldr x11, [%[a], 112]\n\t"
|
||||
"adc x5, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 112]\n\t"
|
||||
"adds x10, x10, x6\n\t"
|
||||
- "mul x7, x29, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x5, x5, xzr\n\t"
|
||||
- "umulh x8, x29, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x10, [%[a], 104]\n\t"
|
||||
"adds x11, x11, x7\n\t"
|
||||
"# a[i+15] += m[15] * mu\n\t"
|
||||
"ldr x10, [%[a], 120]\n\t"
|
||||
"adc x6, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 120]\n\t"
|
||||
"adds x11, x11, x5\n\t"
|
||||
- "mul x7, x30, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x6, x6, xzr\n\t"
|
||||
- "umulh x8, x30, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"adds x6, x6, x7\n\t"
|
||||
"adcs x8, x8, %[ca]\n\t"
|
||||
"str x11, [%[a], 112]\n\t"
|
||||
|
@ -2657,7 +2658,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
|
|||
"stp x12, x13, [%[a], 0]\n\t"
|
||||
: [ca] "+r" (ca), [a] "+r" (a)
|
||||
: [m] "r" (m), [mp] "r" (mp)
|
||||
- : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
|
||||
+ : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
|
||||
);
|
||||
|
||||
sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
|
||||
|
@ -3616,7 +3617,6 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
|
|||
"ldp x23, x24, [%[m], 64]\n\t"
|
||||
"ldp x25, x26, [%[m], 80]\n\t"
|
||||
"ldp x27, x28, [%[m], 96]\n\t"
|
||||
- "ldp x29, x30, [%[m], 112]\n\t"
|
||||
"# i = 32\n\t"
|
||||
"mov x4, 32\n\t"
|
||||
"ldp x12, x13, [%[a], 0]\n\t"
|
||||
|
@ -3741,19 +3741,21 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
|
|||
"# a[i+14] += m[14] * mu\n\t"
|
||||
"ldr x11, [%[a], 112]\n\t"
|
||||
"adc x5, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 112]\n\t"
|
||||
"adds x10, x10, x6\n\t"
|
||||
- "mul x7, x29, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x5, x5, xzr\n\t"
|
||||
- "umulh x8, x29, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x10, [%[a], 104]\n\t"
|
||||
"adds x11, x11, x7\n\t"
|
||||
"# a[i+15] += m[15] * mu\n\t"
|
||||
"ldr x10, [%[a], 120]\n\t"
|
||||
"adc x6, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 120]\n\t"
|
||||
"adds x11, x11, x5\n\t"
|
||||
- "mul x7, x30, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x6, x6, xzr\n\t"
|
||||
- "umulh x8, x30, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x11, [%[a], 112]\n\t"
|
||||
"adds x10, x10, x7\n\t"
|
||||
"# a[i+16] += m[16] * mu\n\t"
|
||||
|
@ -3930,7 +3932,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
|
|||
"stp x12, x13, [%[a], 0]\n\t"
|
||||
: [ca] "+r" (ca), [a] "+r" (a)
|
||||
: [m] "r" (m), [mp] "r" (mp)
|
||||
- : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
|
||||
+ : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
|
||||
);
|
||||
|
||||
sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
|
||||
|
@ -9633,7 +9635,6 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
|
|||
"ldp x23, x24, [%[m], 64]\n\t"
|
||||
"ldp x25, x26, [%[m], 80]\n\t"
|
||||
"ldp x27, x28, [%[m], 96]\n\t"
|
||||
- "ldp x29, x30, [%[m], 112]\n\t"
|
||||
"# i = 24\n\t"
|
||||
"mov x4, 24\n\t"
|
||||
"ldp x12, x13, [%[a], 0]\n\t"
|
||||
|
@ -9758,19 +9759,21 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
|
|||
"# a[i+14] += m[14] * mu\n\t"
|
||||
"ldr x11, [%[a], 112]\n\t"
|
||||
"adc x5, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 112]\n\t"
|
||||
"adds x10, x10, x6\n\t"
|
||||
- "mul x7, x29, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x5, x5, xzr\n\t"
|
||||
- "umulh x8, x29, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x10, [%[a], 104]\n\t"
|
||||
"adds x11, x11, x7\n\t"
|
||||
"# a[i+15] += m[15] * mu\n\t"
|
||||
"ldr x10, [%[a], 120]\n\t"
|
||||
"adc x6, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 120]\n\t"
|
||||
"adds x11, x11, x5\n\t"
|
||||
- "mul x7, x30, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x6, x6, xzr\n\t"
|
||||
- "umulh x8, x30, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x11, [%[a], 112]\n\t"
|
||||
"adds x10, x10, x7\n\t"
|
||||
"# a[i+16] += m[16] * mu\n\t"
|
||||
|
@ -9867,7 +9870,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
|
|||
"stp x12, x13, [%[a], 0]\n\t"
|
||||
: [ca] "+r" (ca), [a] "+r" (a)
|
||||
: [m] "r" (m), [mp] "r" (mp)
|
||||
- : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
|
||||
+ : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
|
||||
);
|
||||
|
||||
sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca);
|
||||
|
@ -11006,7 +11009,6 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
|
|||
"ldp x23, x24, [%[m], 64]\n\t"
|
||||
"ldp x25, x26, [%[m], 80]\n\t"
|
||||
"ldp x27, x28, [%[m], 96]\n\t"
|
||||
- "ldp x29, x30, [%[m], 112]\n\t"
|
||||
"# i = 48\n\t"
|
||||
"mov x4, 48\n\t"
|
||||
"ldp x12, x13, [%[a], 0]\n\t"
|
||||
|
@ -11131,19 +11133,21 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
|
|||
"# a[i+14] += m[14] * mu\n\t"
|
||||
"ldr x11, [%[a], 112]\n\t"
|
||||
"adc x5, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 112]\n\t"
|
||||
"adds x10, x10, x6\n\t"
|
||||
- "mul x7, x29, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x5, x5, xzr\n\t"
|
||||
- "umulh x8, x29, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x10, [%[a], 104]\n\t"
|
||||
"adds x11, x11, x7\n\t"
|
||||
"# a[i+15] += m[15] * mu\n\t"
|
||||
"ldr x10, [%[a], 120]\n\t"
|
||||
"adc x6, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 120]\n\t"
|
||||
"adds x11, x11, x5\n\t"
|
||||
- "mul x7, x30, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x6, x6, xzr\n\t"
|
||||
- "umulh x8, x30, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x11, [%[a], 112]\n\t"
|
||||
"adds x10, x10, x7\n\t"
|
||||
"# a[i+16] += m[16] * mu\n\t"
|
||||
|
@ -11480,7 +11484,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
|
|||
"stp x12, x13, [%[a], 0]\n\t"
|
||||
: [ca] "+r" (ca), [a] "+r" (a)
|
||||
: [m] "r" (m), [mp] "r" (mp)
|
||||
- : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
|
||||
+ : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
|
||||
);
|
||||
|
||||
sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca);
|
||||
|
@ -15623,7 +15627,6 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
|
|||
"ldp x23, x24, [%[m], 64]\n\t"
|
||||
"ldp x25, x26, [%[m], 80]\n\t"
|
||||
"ldp x27, x28, [%[m], 96]\n\t"
|
||||
- "ldp x29, x30, [%[m], 112]\n\t"
|
||||
"# i = 64\n\t"
|
||||
"mov x4, 64\n\t"
|
||||
"ldp x12, x13, [%[a], 0]\n\t"
|
||||
|
@ -15748,19 +15751,21 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
|
|||
"# a[i+14] += m[14] * mu\n\t"
|
||||
"ldr x11, [%[a], 112]\n\t"
|
||||
"adc x5, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 112]\n\t"
|
||||
"adds x10, x10, x6\n\t"
|
||||
- "mul x7, x29, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x5, x5, xzr\n\t"
|
||||
- "umulh x8, x29, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x10, [%[a], 104]\n\t"
|
||||
"adds x11, x11, x7\n\t"
|
||||
"# a[i+15] += m[15] * mu\n\t"
|
||||
"ldr x10, [%[a], 120]\n\t"
|
||||
"adc x6, x8, xzr\n\t"
|
||||
+ "ldr x8, [%[m], 120]\n\t"
|
||||
"adds x11, x11, x5\n\t"
|
||||
- "mul x7, x30, x9\n\t"
|
||||
+ "mul x7, x8, x9\n\t"
|
||||
"adc x6, x6, xzr\n\t"
|
||||
- "umulh x8, x30, x9\n\t"
|
||||
+ "umulh x8, x8, x9\n\t"
|
||||
"str x11, [%[a], 112]\n\t"
|
||||
"adds x10, x10, x7\n\t"
|
||||
"# a[i+16] += m[16] * mu\n\t"
|
||||
|
@ -16257,7 +16262,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
|
|||
"stp x12, x13, [%[a], 0]\n\t"
|
||||
: [ca] "+r" (ca), [a] "+r" (a)
|
||||
: [m] "r" (m), [mp] "r" (mp)
|
||||
- : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
|
||||
+ : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
|
||||
);
|
||||
|
||||
sp_4096_cond_sub_64(a - 64, a, m, (sp_digit)0 - ca);
|
||||
|
|
Loading…
Reference in New Issue