Armv8 (AArch64) ASM fixes for Green Hills compiler

Change conditional branch instructions to the proper A64 form (e.g. BEQ -> B.EQ, BNE -> B.NE, BLT -> B.LT), which the Green Hills assembler requires.
Use literal immediate values (e.g. #16, #64) in inline assembly rather than the POLY1305_BLOCK_SIZE symbolic operand, which Green Hills does not accept.
Remove duplicate clobber registers: the clobber lists named both the w and x forms of the same physical registers, so the redundant w entries are dropped.
Make clamp unconditionally compiled.
pull/8774/head
Sean Parkinson 2025-05-15 08:58:40 +10:00
parent 978a29da0b
commit f8bb889712
7 changed files with 117 additions and 122 deletions

View File

@ -978,9 +978,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP w11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP w11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"MOV v0.16b, v13.16b \n"
"AESE v0.16b, v1.16b \n"
@ -1129,9 +1129,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP w11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP w11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"MOV v0.16b, v15.16b \n"
"AESE v0.16b, v1.16b \n"
@ -1295,9 +1295,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP w11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP w11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"MOV v0.16b, v17.16b \n"
"AESE v0.16b, v1.16b \n"
@ -22274,7 +22274,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"MOV r12, %[R] \n"
"CMP r12, #10 \n"
"BEQ 1f \n"
"B.EQ 1f \n"
"VLD1.32 {q1}, [%[Key]]! \n"
"AESMC.8 q0, q0\n"
"VLD1.32 {q2}, [%[Key]]! \n"
@ -22283,7 +22283,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"AESE.8 q0, q2\n"
"CMP r12, #12 \n"
"BEQ 1f \n"
"B.EQ 1f \n"
"VLD1.32 {q1}, [%[Key]]! \n"
"AESMC.8 q0, q0\n"
"VLD1.32 {q2}, [%[Key]]! \n"
@ -22350,7 +22350,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"MOV r12, %[R] \n"
"CMP r12, #10 \n"
"BEQ 1f \n"
"B.EQ 1f \n"
"VLD1.32 {q1}, [%[Key]]! \n"
"AESIMC.8 q0, q0\n"
"VLD1.32 {q2}, [%[Key]]! \n"
@ -22359,7 +22359,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"AESD.8 q0, q2\n"
"CMP r12, #12 \n"
"BEQ 1f \n"
"B.EQ 1f \n"
"VLD1.32 {q1}, [%[Key]]! \n"
"AESIMC.8 q0, q0\n"
"VLD1.32 {q2}, [%[Key]]! \n"
@ -22462,7 +22462,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"VST1.32 {q0}, [%[out]]! \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B 1b \n"
@ -22529,7 +22529,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"VST1.32 {q0}, [%[out]]! \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B 1b \n"
@ -22603,7 +22603,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"SUB %[Key], %[Key], #16 \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B 1b \n"
@ -22701,7 +22701,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"VMOV.32 q13, q12 \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q0}, [%[input]]! \n"
"B 1b \n"
@ -22770,7 +22770,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"VMOV.32 q14, q15 \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q0}, [%[input]]! \n"
"B 1b \n"
@ -22846,7 +22846,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
"VMOV.32 q14, q15 \n"
"CMP r11, #0 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"VLD1.32 {q0}, [%[input]]! \n"
"B 1b \n"
@ -22902,9 +22902,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP r11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP r11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
@ -23066,9 +23066,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP r11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP r11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"VMOV.32 q0, q13\n"
"AESE.8 q0, q1\n"
@ -23252,9 +23252,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
/* double block */
"1: \n"
"CMP r11, #1 \n"
"BEQ 2f \n"
"B.EQ 2f \n"
"CMP r11, #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
@ -25017,11 +25017,11 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"# Put last 2 blocks of keys based on rounds into v14, v15\n"
"SUBS WZR, %w[rounds], #14 \n"
"BEQ 40f \n"
"B.EQ 40f \n"
"SUBS WZR, %w[rounds], #12 \n"
"MOV v14.16b, v12.16b \n"
"MOV v15.16b, v13.16b \n"
"BEQ 40f \n"
"B.EQ 40f \n"
"MOV v14.16b, v10.16b \n"
"MOV v15.16b, v11.16b \n"
"40: \n"
@ -25041,17 +25041,17 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"# Put last 2 blocks of keys based on rounds into v14, v15\n"
"SUBS WZR, %w[rounds], #14 \n"
"BEQ 41f \n"
"B.EQ 41f \n"
"SUBS WZR, %w[rounds], #10 \n"
"MOV v14.16b, v10.16b \n"
"MOV v15.16b, v11.16b \n"
"BEQ 41f \n"
"B.EQ 41f \n"
"MOV v14.16b, v12.16b \n"
"MOV v15.16b, v13.16b \n"
"41: \n"
"SUBS WZR, %w[blocks], #4 \n"
"BLT 1f \n"
"B.LT 1f \n"
"AND %w[sz], %w[sz], 0x3f \n"
@ -25174,7 +25174,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"AESMC v19.16b, v19.16b \n"
"SUBS WZR, %w[rounds], #10 \n"
"BEQ 21f \n"
"B.EQ 21f \n"
"AESE v16.16b, v10.16b \n"
"AESMC v16.16b, v16.16b \n"
"AESE v17.16b, v10.16b \n"
@ -25193,7 +25193,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"AESMC v19.16b, v19.16b \n"
"SUBS WZR, %w[rounds], #12 \n"
"BEQ 21f \n"
"B.EQ 21f \n"
"AESE v16.16b, v12.16b \n"
"AESMC v16.16b, v16.16b \n"
"AESE v17.16b, v12.16b \n"
@ -25231,7 +25231,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"ST1 {v16.16b-v19.16b}, [%[out]], #64 \n"
"SUBS %w[blocks], %w[blocks], #4 \n"
"BGE 20b \n"
"B.GE 20b \n"
"ADD %w[blocks], %w[blocks], #4 \n"
"CBZ %w[sz], 3f \n"
@ -25340,11 +25340,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"LD1 {v13.2d-v15.2d}, [x10] \n"
"SUBS WZR, %w[rounds], #14 \n"
"BEQ 40f \n"
"B.EQ 40f \n"
"SUBS WZR, %w[rounds], #12 \n"
"MOV v14.16b, v12.16b \n"
"MOV v15.16b, v13.16b \n"
"BEQ 40f \n"
"B.EQ 40f \n"
"MOV v14.16b, v10.16b \n"
"MOV v15.16b, v11.16b \n"
"40: \n"
@ -25362,11 +25362,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"LD1 {v13.2d-v15.2d}, [x11] \n"
"SUBS WZR, %w[rounds], #14 \n"
"BEQ 41f \n"
"B.EQ 41f \n"
"SUBS WZR, %w[rounds], #12 \n"
"MOV v14.16b, v12.16b \n"
"MOV v15.16b, v13.16b \n"
"BEQ 41f \n"
"B.EQ 41f \n"
"MOV v14.16b, v10.16b \n"
"MOV v15.16b, v11.16b \n"
"41: \n"
@ -25374,7 +25374,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"CBZ %w[blocks], 3f \n"
"SUBS WZR, %w[blocks], #4 \n"
"BLT 1f \n"
"B.LT 1f \n"
"AND x17, x19, x10, ASR #63\n"
"EXTR x12, x10, x9, #63 \n"
@ -25495,7 +25495,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"AESIMC v19.16b, v19.16b \n"
"SUBS WZR, %w[rounds], #10 \n"
"BEQ 21f \n"
"B.EQ 21f \n"
"AESD v16.16b, v10.16b \n"
"AESIMC v16.16b, v16.16b \n"
"AESD v17.16b, v10.16b \n"
@ -25514,7 +25514,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"AESIMC v19.16b, v19.16b \n"
"SUBS WZR, %w[rounds], #12 \n"
"BEQ 21f \n"
"B.EQ 21f \n"
"AESD v16.16b, v12.16b \n"
"AESIMC v16.16b, v16.16b \n"
"AESD v17.16b, v12.16b \n"
@ -25553,7 +25553,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"SUBS %w[blocks], %w[blocks], #4 \n"
"SUB %w[sz], %w[sz], #64 \n"
"BGE 20b \n"
"B.GE 20b \n"
"ADD %w[blocks], %w[blocks], #4 \n"
"CBZ %w[sz], 4f \n"
@ -25914,7 +25914,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"BGT 1b \n"
"CMP %[sz], #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"30: \n"
"#Partial block \n"
@ -26026,7 +26026,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"VLD1.32 {d18, d19}, [%[key2]]! \n"
"CMP %[blocks], #0 \n"
"BEQ 3f \n"
"B.EQ 3f \n"
"1: \n"
"VLD1.32 {q0}, [%[in]]! \n"
@ -26050,7 +26050,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
"BGT 1b \n"
"CMP %[sz], #0 \n"
"BEQ 4f \n"
"B.EQ 4f \n"
"3: \n"

View File

@ -868,7 +868,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert1_%=\n\t"
"b.ne L_fe_invert1_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -969,7 +969,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert2_%=\n\t"
"b.ne L_fe_invert2_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -1070,7 +1070,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert3_%=\n\t"
"b.ne L_fe_invert3_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -1171,7 +1171,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert4_%=\n\t"
"b.ne L_fe_invert4_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -1270,7 +1270,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert5_%=\n\t"
"b.ne L_fe_invert5_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -1371,7 +1371,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert6_%=\n\t"
"b.ne L_fe_invert6_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -1472,7 +1472,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert7_%=\n\t"
"b.ne L_fe_invert7_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -1571,7 +1571,7 @@ void fe_invert(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x20, x20, #1\n\t"
"bne L_fe_invert8_%=\n\t"
"b.ne L_fe_invert8_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"
@ -2830,7 +2830,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_1_%=\n\t"
"b.ne L_curve25519_inv_1_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -2931,7 +2931,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_2_%=\n\t"
"b.ne L_curve25519_inv_2_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -3032,7 +3032,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_3_%=\n\t"
"b.ne L_curve25519_inv_3_%=\n\t"
/* Store */
"stp x6, x7, [x29, #144]\n\t"
"stp x8, x9, [x29, #160]\n\t"
@ -3133,7 +3133,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_4_%=\n\t"
"b.ne L_curve25519_inv_4_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -3232,7 +3232,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_5_%=\n\t"
"b.ne L_curve25519_inv_5_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -3333,7 +3333,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_6_%=\n\t"
"b.ne L_curve25519_inv_6_%=\n\t"
/* Store */
"stp x6, x7, [x29, #144]\n\t"
"stp x8, x9, [x29, #160]\n\t"
@ -3434,7 +3434,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_7_%=\n\t"
"b.ne L_curve25519_inv_7_%=\n\t"
/* Store */
"stp x6, x7, [x29, #112]\n\t"
"stp x8, x9, [x29, #128]\n\t"
@ -3533,7 +3533,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x24, x24, #1\n\t"
"bne L_curve25519_inv_8_%=\n\t"
"b.ne L_curve25519_inv_8_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -3854,7 +3854,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_1_%=\n\t"
"b.ne L_fe_pow22523_1_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"
@ -3957,7 +3957,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_2_%=\n\t"
"b.ne L_fe_pow22523_2_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"
@ -4058,7 +4058,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_3_%=\n\t"
"b.ne L_fe_pow22523_3_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -4159,7 +4159,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_4_%=\n\t"
"b.ne L_fe_pow22523_4_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"
@ -4258,7 +4258,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_5_%=\n\t"
"b.ne L_fe_pow22523_5_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"
@ -4359,7 +4359,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_6_%=\n\t"
"b.ne L_fe_pow22523_6_%=\n\t"
/* Store */
"stp x6, x7, [x29, #80]\n\t"
"stp x8, x9, [x29, #96]\n\t"
@ -4460,7 +4460,7 @@ void fe_pow22523(fe r, const fe a)
"adcs x8, x12, x15\n\t"
"adc x9, x13, x16\n\t"
"subs x23, x23, #1\n\t"
"bne L_fe_pow22523_7_%=\n\t"
"b.ne L_fe_pow22523_7_%=\n\t"
/* Store */
"stp x6, x7, [x29, #48]\n\t"
"stp x8, x9, [x29, #64]\n\t"

View File

@ -8553,7 +8553,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
"orr v10.16b, v10.16b, v2.16b\n\t"
"orr v11.16b, v11.16b, v3.16b\n\t"
"subs %w[sz], %w[sz], #0x300\n\t"
"beq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
"b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t"
"ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t"
"eor v0.16b, v0.16b, v4.16b\n\t"
@ -8605,7 +8605,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
"orr v10.16b, v10.16b, v2.16b\n\t"
"orr v11.16b, v11.16b, v3.16b\n\t"
"subs %w[sz], %w[sz], #0x140\n\t"
"beq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
"b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t"
"ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t"
"eor v0.16b, v0.16b, v4.16b\n\t"
@ -9278,9 +9278,9 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
"ldr q3, [x5]\n\t"
"ldr q2, [x6]\n\t"
"subs wzr, %w[len], #0\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"subs wzr, %w[len], #16\n\t"
"blt L_mlkem_rej_uniform_loop_4_%=\n\t"
"b.lt L_mlkem_rej_uniform_loop_4_%=\n\t"
"\n"
"L_mlkem_rej_uniform_loop_16_%=: \n\t"
"ld3 {v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t"
@ -9323,17 +9323,17 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
"add %x[p], %x[p], x11, lsl 1\n\t"
"add x12, x12, x11\n\t"
"subs %w[rLen], %w[rLen], #24\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"sub w10, %w[len], w12\n\t"
"subs x10, x10, #16\n\t"
"blt L_mlkem_rej_uniform_loop_4_%=\n\t"
"b.lt L_mlkem_rej_uniform_loop_4_%=\n\t"
"b L_mlkem_rej_uniform_loop_16_%=\n\t"
"\n"
"L_mlkem_rej_uniform_loop_4_%=: \n\t"
"subs w10, %w[len], w12\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"subs x10, x10, #4\n\t"
"blt L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
"b.lt L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
"ldr x4, [%x[r]], #6\n\t"
"lsr x5, x4, #12\n\t"
"lsr x6, x4, #24\n\t"
@ -9363,7 +9363,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
"cinc %x[p], %x[p], lt\n\t"
"cinc x12, x12, lt\n\t"
"subs %w[rLen], %w[rLen], #6\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"b L_mlkem_rej_uniform_loop_4_%=\n\t"
"\n"
"L_mlkem_rej_uniform_loop_lt_4_%=: \n\t"
@ -9381,30 +9381,30 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
"cinc %x[p], %x[p], lt\n\t"
"cinc x12, x12, lt\n\t"
"subs wzr, %w[len], w12\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"strh w5, [%x[p]]\n\t"
"subs xzr, x5, x13\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc x12, x12, lt\n\t"
"subs wzr, %w[len], w12\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"strh w6, [%x[p]]\n\t"
"subs xzr, x6, x13\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc x12, x12, lt\n\t"
"subs wzr, %w[len], w12\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"strh w7, [%x[p]]\n\t"
"subs xzr, x7, x13\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc %x[p], %x[p], lt\n\t"
"cinc x12, x12, lt\n\t"
"subs wzr, %w[len], w12\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"subs %w[rLen], %w[rLen], #6\n\t"
"beq L_mlkem_rej_uniform_done_%=\n\t"
"b.eq L_mlkem_rej_uniform_done_%=\n\t"
"b L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
"\n"
"L_mlkem_rej_uniform_done_%=: \n\t"
@ -9695,7 +9695,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
"mov v30.d[1], %x[state]\n\t"
"eor x1, x1, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
"b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@ -10037,7 +10037,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"mov v30.d[1], %x[state]\n\t"
"eor x2, x2, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
"b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@ -10379,7 +10379,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"mov v30.d[1], %x[state]\n\t"
"eor x2, x2, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
"b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@ -10785,7 +10785,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
"mov v30.d[1], %x[state]\n\t"
"eor x1, x1, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
"b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@ -11212,7 +11212,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"mov v30.d[1], %x[state]\n\t"
"eor x2, x2, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
"b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@ -11639,7 +11639,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"mov v30.d[1], %x[state]\n\t"
"eor x2, x2, %x[state]\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
"b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"

View File

@ -48,7 +48,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
{
__asm__ __volatile__ (
/* Check for zero bytes to do. */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
"CMP %[bytes], #16 \n\t"
"BLO L_poly1305_aarch64_16_done_%= \n\t"
"MOV x12, #1 \n\t"
@ -127,8 +127,8 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
"ADCS x5, x5, x15\n\t"
"ADC x6, x6, xzr\n\t"
"SUBS %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]\n\t"
"ADD %[m], %[m], %[POLY1305_BLOCK_SIZE]\n\t"
"SUBS %[bytes], %[bytes], #16\n\t"
"ADD %[m], %[m], #16\n\t"
"BGT L_poly1305_aarch64_16_loop_%=\n\t"
/* Base 64 -> Base 26 */
@ -160,7 +160,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
{
__asm__ __volatile__ (
/* If less than 4 blocks to process then use regular method */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
"CMP %[bytes], #64 \n\t"
"BLO L_poly1305_aarch64_64_done_%= \n\t"
"MOV x9, #0x3ffffff \n\t"
/* Load h */
@ -188,7 +188,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v26.D[0], x9 \n\t"
"MOV v26.D[1], x9 \n\t"
"DUP v30.4S, v26.S[0] \n\t"
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
"CMP %[bytes], #96 \n\t"
"BLO L_poly1305_aarch64_64_start_block_size_64_%= \n\t"
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
"LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t"
@ -229,7 +229,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
/* Load m */
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
"SUB %[bytes], %[bytes], #64 \n\t"
"USHR v14.4S, v13.4S, #8 \n\t"
"ORR v14.16B, v14.16B, v30.16B \n\t"
"SHL v13.4S, v13.4S, #18 \n\t"
@ -362,12 +362,12 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"UMLAL2 v24.2D, v14.4S, v9.4S \n\t"
"UMLAL2 v25.2D, v14.4S, v0.4S \n\t"
/* If less than six message blocks left then leave loop */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
"CMP %[bytes], #96 \n\t"
"BLS L_poly1305_aarch64_64_loop_128_final_%= \n\t"
/* Load m */
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
"SUB %[bytes], %[bytes], #64 \n\t"
"USHR v14.4S, v13.4S, #8 \n\t"
"ORR v14.16B, v14.16B, v30.16B \n\t"
"SHL v13.4S, v13.4S, #18 \n\t"
@ -424,7 +424,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
/* Copy r^2 to lower half of registers */
"MOV v0.D[0], v0.D[1] \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"SUB %[bytes], %[bytes], #32 \n\t"
"MOV v5.D[0], v5.D[1] \n\t"
"USHR v14.2D, v11.2D, #40 \n\t"
"MOV v1.D[0], v1.D[1] \n\t"
@ -492,7 +492,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v18.S[1], v18.S[2] \n\t"
"MOV v19.S[1], v19.S[2] \n\t"
/* If less than 2 blocks left go straight to final multiplication. */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"CMP %[bytes], #32 \n\t"
"BLO L_poly1305_aarch64_64_last_mult_%= \n\t"
/* Else go to one loop of L_poly1305_aarch64_64_loop_64 */
"B L_poly1305_aarch64_64_loop_64_%= \n\t"
@ -524,7 +524,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
/* Load m */
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"SUB %[bytes], %[bytes], #32 \n\t"
"USHR v14.2D, v11.2D, #40 \n\t"
"ORR v14.16B, v14.16B, v26.16B \n\t"
"USHR v13.2D, v11.2D, #14 \n\t"
@ -616,7 +616,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
/* Reduce h % P */
"MOV x14, #5 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"SUB %[bytes], %[bytes], #32 \n\t"
"ADD x10, x10, x9, LSR #26 \n\t"
"USHR v14.2D, v11.2D, #40 \n\t"
"ADD x13, x13, x12, LSR #26 \n\t"
@ -676,7 +676,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v18.S[1], v18.S[2] \n\t"
"MOV v19.S[1], v19.S[2] \n\t"
/* If at least two message blocks left then loop_64 */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"CMP %[bytes], #32 \n\t"
"BHS L_poly1305_aarch64_64_loop_64_%= \n\t"
"\n"
".align 2 \n\t"
@ -831,11 +831,9 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17",
"w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28",
"w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
"x28", "x30"
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
"x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28",
"x30"
);
poly1305_blocks_aarch64_16(ctx, m, bytes);
}
@ -845,12 +843,10 @@ void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m)
poly1305_blocks_aarch64_16(ctx, m, POLY1305_BLOCK_SIZE);
}
#if defined(POLY130564)
static word64 clamp[] = {
0x0ffffffc0fffffff,
0x0ffffffc0ffffffc,
};
#endif /* POLY130564 */
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
@ -1112,7 +1108,6 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
[ctx_r64] "r" (ctx->r64), [ctx_r] "r" (ctx->r),
[ctx_r_2] "r" (ctx->r_2), [ctx_r_4] "r" (ctx->r_4)
: "memory", "cc",
"w4", "w5", "w6", "w7", "w8",
"x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"
);

View File

@ -153,7 +153,7 @@ void BlockSha3_crypto(word64* state)
"ld1r {v30.2d}, [x1], #8\n\t"
"subs x2, x2, #1\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_sha3_crypto_begin_%=\n\t"
"b.ne L_sha3_crypto_begin_%=\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
"st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t"
@ -352,7 +352,7 @@ void BlockSha3_base(word64* state)
"ldr %x[state], [x27], #8\n\t"
"subs x28, x28, #1\n\t"
"eor x1, x1, %x[state]\n\t"
"bne L_SHA3_transform_base_begin_%=\n\t"
"b.ne L_SHA3_transform_base_begin_%=\n\t"
"ldr %x[state], [x29, #40]\n\t"
"stp x1, x2, [%x[state]]\n\t"
"stp x3, x4, [%x[state], #16]\n\t"

View File

@ -629,7 +629,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
"add x8, x8, x4\n\t"
"add x4, x4, x14\n\t"
"subs x27, x27, #1\n\t"
"bne L_sha512_len_neon_start_%=\n\t"
"b.ne L_sha512_len_neon_start_%=\n\t"
/* Round 0 */
"mov x13, v0.d[0]\n\t"
"ldr x15, [x3], #8\n\t"
@ -998,7 +998,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
"add x3, x3, %[L_SHA512_transform_neon_len_k]@PAGEOFF\n\t"
#endif /* __APPLE__ */
"subs %w[len], %w[len], #0x80\n\t"
"bne L_sha512_len_neon_begin_%=\n\t"
"b.ne L_sha512_len_neon_begin_%=\n\t"
"stp x4, x5, [%x[sha512]]\n\t"
"stp x6, x7, [%x[sha512], #16]\n\t"
"stp x8, x9, [%x[sha512], #32]\n\t"
@ -1576,7 +1576,7 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
"add v25.2d, v25.2d, v29.2d\n\t"
"add v24.2d, v24.2d, v28.2d\n\t"
"subs %w[len], %w[len], #0x80\n\t"
"bne L_sha512_len_crypto_begin_%=\n\t"
"b.ne L_sha512_len_crypto_begin_%=\n\t"
/* Store digest back */
"st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)

View File

@ -93,7 +93,7 @@ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -3351,7 +3351,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
"adc x3, x3, xzr\n\t"
"subs x4, x4, 1\n\t"
"add %[a], %[a], 8\n\t"
"bne 1b\n\t"
"b.ne 1b\n\t"
"# Create mask\n\t"
"neg x3, x3\n\t"
"mov x9, %[a]\n\t"
@ -6980,7 +6980,7 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -16577,7 +16577,7 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -39659,7 +39659,7 @@ static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -43865,7 +43865,7 @@ SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit*
"adc x3, x3, xzr\n\t"
"subs x4, x4, 1\n\t"
"add %[a], %[a], 8\n\t"
"bne 1b\n\t"
"b.ne 1b\n\t"
"# Create mask\n\t"
"neg x3, x3\n\t"
"mov x9, %[a]\n\t"
@ -66408,7 +66408,7 @@ static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -72238,7 +72238,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m,
"# mu = a[i] * mp\n\t"
"mul x9, %[mp], x13\n\t"
"cmp x4, #1\n\t"
"bne L_521_mont_reduce_9_nomask\n\t"
"b.ne L_521_mont_reduce_9_nomask\n\t"
"and x9, x9, #0x1ff\n\t"
"L_521_mont_reduce_9_nomask:\n\t"
"# a[i+0] += m[0] * mu\n\t"
@ -72312,7 +72312,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m,
"adc x3, x3, xzr\n\t"
"subs x4, x4, 1\n\t"
"add %[a], %[a], 8\n\t"
"bne 1b\n\t"
"b.ne 1b\n\t"
"extr x12, x13, x12, 9\n\t"
"extr x13, x14, x13, 9\n\t"
"extr x14, x15, x14, 9\n\t"
@ -111555,7 +111555,7 @@ static void sp_521_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"
@ -115993,7 +115993,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m,
"adc x3, x3, xzr\n\t"
"subs x4, x4, 1\n\t"
"add %[a], %[a], 8\n\t"
"bne 1b\n\t"
"b.ne 1b\n\t"
"# Create mask\n\t"
"subs x11, x10, x28\n\t"
"neg x3, x3\n\t"
@ -125143,7 +125143,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n)
"sub x4, x4, 8\n\t"
"subs x6, %[n], 8\n\t"
"mov x7, xzr\n\t"
"blt 2f\n\t"
"b.lt 2f\n\t"
/* Put in multiples of 8 bytes. */
"1:\n\t"
"ldr x8, [x4], -8\n\t"