SP ARM64 - use fewer registers in the mont_reduce functions (x29 and x30 are no longer used; m[14] and m[15] are loaded into x8 on demand)

pull/2907/head
Sean Parkinson 2020-04-16 09:20:04 +10:00
parent ebb490204a
commit da5d9a923b
1 changed file with 35 additions and 30 deletions

View File

@ -2503,7 +2503,6 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
"ldp x23, x24, [%[m], 64]\n\t"
"ldp x25, x26, [%[m], 80]\n\t"
"ldp x27, x28, [%[m], 96]\n\t"
"ldp x29, x30, [%[m], 112]\n\t"
"# i = 16\n\t"
"mov x4, 16\n\t"
"ldp x12, x13, [%[a], 0]\n\t"
@ -2628,19 +2627,21 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
"# a[i+14] += m[14] * mu\n\t"
"ldr x11, [%[a], 112]\n\t"
"adc x5, x8, xzr\n\t"
"ldr x8, [%[m], 112]\n\t"
"adds x10, x10, x6\n\t"
"mul x7, x29, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x5, x5, xzr\n\t"
"umulh x8, x29, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x10, [%[a], 104]\n\t"
"adds x11, x11, x7\n\t"
"# a[i+15] += m[15] * mu\n\t"
"ldr x10, [%[a], 120]\n\t"
"adc x6, x8, xzr\n\t"
"ldr x8, [%[m], 120]\n\t"
"adds x11, x11, x5\n\t"
"mul x7, x30, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x6, x6, xzr\n\t"
"umulh x8, x30, x9\n\t"
"umulh x8, x8, x9\n\t"
"adds x6, x6, x7\n\t"
"adcs x8, x8, %[ca]\n\t"
"str x11, [%[a], 112]\n\t"
@ -2657,7 +2658,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
"stp x12, x13, [%[a], 0]\n\t"
: [ca] "+r" (ca), [a] "+r" (a)
: [m] "r" (m), [mp] "r" (mp)
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
@ -3616,7 +3617,6 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
"ldp x23, x24, [%[m], 64]\n\t"
"ldp x25, x26, [%[m], 80]\n\t"
"ldp x27, x28, [%[m], 96]\n\t"
"ldp x29, x30, [%[m], 112]\n\t"
"# i = 32\n\t"
"mov x4, 32\n\t"
"ldp x12, x13, [%[a], 0]\n\t"
@ -3741,19 +3741,21 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
"# a[i+14] += m[14] * mu\n\t"
"ldr x11, [%[a], 112]\n\t"
"adc x5, x8, xzr\n\t"
"ldr x8, [%[m], 112]\n\t"
"adds x10, x10, x6\n\t"
"mul x7, x29, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x5, x5, xzr\n\t"
"umulh x8, x29, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x10, [%[a], 104]\n\t"
"adds x11, x11, x7\n\t"
"# a[i+15] += m[15] * mu\n\t"
"ldr x10, [%[a], 120]\n\t"
"adc x6, x8, xzr\n\t"
"ldr x8, [%[m], 120]\n\t"
"adds x11, x11, x5\n\t"
"mul x7, x30, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x6, x6, xzr\n\t"
"umulh x8, x30, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x11, [%[a], 112]\n\t"
"adds x10, x10, x7\n\t"
"# a[i+16] += m[16] * mu\n\t"
@ -3930,7 +3932,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m,
"stp x12, x13, [%[a], 0]\n\t"
: [ca] "+r" (ca), [a] "+r" (a)
: [m] "r" (m), [mp] "r" (mp)
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
@ -9633,7 +9635,6 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
"ldp x23, x24, [%[m], 64]\n\t"
"ldp x25, x26, [%[m], 80]\n\t"
"ldp x27, x28, [%[m], 96]\n\t"
"ldp x29, x30, [%[m], 112]\n\t"
"# i = 24\n\t"
"mov x4, 24\n\t"
"ldp x12, x13, [%[a], 0]\n\t"
@ -9758,19 +9759,21 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
"# a[i+14] += m[14] * mu\n\t"
"ldr x11, [%[a], 112]\n\t"
"adc x5, x8, xzr\n\t"
"ldr x8, [%[m], 112]\n\t"
"adds x10, x10, x6\n\t"
"mul x7, x29, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x5, x5, xzr\n\t"
"umulh x8, x29, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x10, [%[a], 104]\n\t"
"adds x11, x11, x7\n\t"
"# a[i+15] += m[15] * mu\n\t"
"ldr x10, [%[a], 120]\n\t"
"adc x6, x8, xzr\n\t"
"ldr x8, [%[m], 120]\n\t"
"adds x11, x11, x5\n\t"
"mul x7, x30, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x6, x6, xzr\n\t"
"umulh x8, x30, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x11, [%[a], 112]\n\t"
"adds x10, x10, x7\n\t"
"# a[i+16] += m[16] * mu\n\t"
@ -9867,7 +9870,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m,
"stp x12, x13, [%[a], 0]\n\t"
: [ca] "+r" (ca), [a] "+r" (a)
: [m] "r" (m), [mp] "r" (mp)
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca);
@ -11006,7 +11009,6 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
"ldp x23, x24, [%[m], 64]\n\t"
"ldp x25, x26, [%[m], 80]\n\t"
"ldp x27, x28, [%[m], 96]\n\t"
"ldp x29, x30, [%[m], 112]\n\t"
"# i = 48\n\t"
"mov x4, 48\n\t"
"ldp x12, x13, [%[a], 0]\n\t"
@ -11131,19 +11133,21 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
"# a[i+14] += m[14] * mu\n\t"
"ldr x11, [%[a], 112]\n\t"
"adc x5, x8, xzr\n\t"
"ldr x8, [%[m], 112]\n\t"
"adds x10, x10, x6\n\t"
"mul x7, x29, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x5, x5, xzr\n\t"
"umulh x8, x29, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x10, [%[a], 104]\n\t"
"adds x11, x11, x7\n\t"
"# a[i+15] += m[15] * mu\n\t"
"ldr x10, [%[a], 120]\n\t"
"adc x6, x8, xzr\n\t"
"ldr x8, [%[m], 120]\n\t"
"adds x11, x11, x5\n\t"
"mul x7, x30, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x6, x6, xzr\n\t"
"umulh x8, x30, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x11, [%[a], 112]\n\t"
"adds x10, x10, x7\n\t"
"# a[i+16] += m[16] * mu\n\t"
@ -11480,7 +11484,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m,
"stp x12, x13, [%[a], 0]\n\t"
: [ca] "+r" (ca), [a] "+r" (a)
: [m] "r" (m), [mp] "r" (mp)
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca);
@ -15623,7 +15627,6 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
"ldp x23, x24, [%[m], 64]\n\t"
"ldp x25, x26, [%[m], 80]\n\t"
"ldp x27, x28, [%[m], 96]\n\t"
"ldp x29, x30, [%[m], 112]\n\t"
"# i = 64\n\t"
"mov x4, 64\n\t"
"ldp x12, x13, [%[a], 0]\n\t"
@ -15748,19 +15751,21 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
"# a[i+14] += m[14] * mu\n\t"
"ldr x11, [%[a], 112]\n\t"
"adc x5, x8, xzr\n\t"
"ldr x8, [%[m], 112]\n\t"
"adds x10, x10, x6\n\t"
"mul x7, x29, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x5, x5, xzr\n\t"
"umulh x8, x29, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x10, [%[a], 104]\n\t"
"adds x11, x11, x7\n\t"
"# a[i+15] += m[15] * mu\n\t"
"ldr x10, [%[a], 120]\n\t"
"adc x6, x8, xzr\n\t"
"ldr x8, [%[m], 120]\n\t"
"adds x11, x11, x5\n\t"
"mul x7, x30, x9\n\t"
"mul x7, x8, x9\n\t"
"adc x6, x6, xzr\n\t"
"umulh x8, x30, x9\n\t"
"umulh x8, x8, x9\n\t"
"str x11, [%[a], 112]\n\t"
"adds x10, x10, x7\n\t"
"# a[i+16] += m[16] * mu\n\t"
@ -16257,7 +16262,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m,
"stp x12, x13, [%[a], 0]\n\t"
: [ca] "+r" (ca), [a] "+r" (a)
: [m] "r" (m), [mp] "r" (mp)
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
: "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
sp_4096_cond_sub_64(a - 64, a, m, (sp_digit)0 - ca);