Armv8 (AArch64) ASM fixes for Green Hills compiler

Change branch instructions to the explicit condition-suffix form
(e.g. BEQ -> B.EQ) that the Green Hills assembler accepts.
Use a literal constant value rather than POLY1305_BLOCK_SIZE as an
inline-asm operand.
Remove duplicate clobber registers: the w and x names refer to the same
registers, so only the x forms are kept.
Make clamp unconditionally compiled.
(A short sketch of these patterns follows the file stats below.)
pull/8774/head
Sean Parkinson 2025-05-15 08:58:40 +10:00
parent 978a29da0b
commit f8bb889712
7 changed files with 117 additions and 122 deletions
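A minimal, hypothetical sketch (not code from this patch) of the inline-asm shape the fixes standardize on: the explicit condition-suffix branch form, a literal immediate in place of a macro operand such as %[POLY1305_BLOCK_SIZE], and only the x form of each scratch register in the clobber list, since wN and xN name the same physical register.

    #include <stdint.h>

    /* Hypothetical helper: count whole 16-byte blocks in 'bytes'. */
    static uint64_t count_blocks(uint64_t bytes)
    {
        uint64_t blocks;
        __asm__ __volatile__ (
            "MOV %[blocks], xzr          \n\t"
            "MOV x12, #16                \n\t"
            "1:                          \n\t"
            "CMP %[bytes], #16           \n\t" /* literal, not a macro operand */
            "B.LT 2f                     \n\t" /* explicit-suffix form, not BLT */
            "SUB %[bytes], %[bytes], x12 \n\t"
            "ADD %[blocks], %[blocks], #1\n\t"
            "B 1b                        \n\t"
            "2:                          \n\t"
            : [bytes] "+r" (bytes), [blocks] "=&r" (blocks)
            :
            : "x12", "cc" /* x12 listed once; adding "w12" would duplicate it */
        );
        return blocks;
    }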


@@ -978,9 +978,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP w11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP w11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "MOV v0.16b, v13.16b \n"
 "AESE v0.16b, v1.16b \n"
@@ -1129,9 +1129,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP w11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP w11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "MOV v0.16b, v15.16b \n"
 "AESE v0.16b, v1.16b \n"
@@ -1295,9 +1295,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP w11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP w11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "MOV v0.16b, v17.16b \n"
 "AESE v0.16b, v1.16b \n"
@@ -22274,7 +22274,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "MOV r12, %[R] \n"
 "CMP r12, #10 \n"
-"BEQ 1f \n"
+"B.EQ 1f \n"
 "VLD1.32 {q1}, [%[Key]]! \n"
 "AESMC.8 q0, q0\n"
 "VLD1.32 {q2}, [%[Key]]! \n"
@@ -22283,7 +22283,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "AESE.8 q0, q2\n"
 "CMP r12, #12 \n"
-"BEQ 1f \n"
+"B.EQ 1f \n"
 "VLD1.32 {q1}, [%[Key]]! \n"
 "AESMC.8 q0, q0\n"
 "VLD1.32 {q2}, [%[Key]]! \n"
@@ -22350,7 +22350,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "MOV r12, %[R] \n"
 "CMP r12, #10 \n"
-"BEQ 1f \n"
+"B.EQ 1f \n"
 "VLD1.32 {q1}, [%[Key]]! \n"
 "AESIMC.8 q0, q0\n"
 "VLD1.32 {q2}, [%[Key]]! \n"
@@ -22359,7 +22359,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "AESD.8 q0, q2\n"
 "CMP r12, #12 \n"
-"BEQ 1f \n"
+"B.EQ 1f \n"
 "VLD1.32 {q1}, [%[Key]]! \n"
 "AESIMC.8 q0, q0\n"
 "VLD1.32 {q2}, [%[Key]]! \n"
@@ -22462,7 +22462,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "VST1.32 {q0}, [%[out]]! \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q12}, [%[input]]! \n"
 "B 1b \n"
@@ -22529,7 +22529,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "VST1.32 {q0}, [%[out]]! \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q12}, [%[input]]! \n"
 "B 1b \n"
@@ -22603,7 +22603,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "SUB %[Key], %[Key], #16 \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q12}, [%[input]]! \n"
 "B 1b \n"
@@ -22701,7 +22701,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "VMOV.32 q13, q12 \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q0}, [%[input]]! \n"
 "B 1b \n"
@@ -22770,7 +22770,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "VMOV.32 q14, q15 \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q0}, [%[input]]! \n"
 "B 1b \n"
@@ -22846,7 +22846,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz,
 "VMOV.32 q14, q15 \n"
 "CMP r11, #0 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "VLD1.32 {q0}, [%[input]]! \n"
 "B 1b \n"
@@ -22902,9 +22902,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP r11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP r11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "VMOV.32 q0, q13 \n"
 "AESE.8 q0, q1\n"
@@ -23066,9 +23066,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP r11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP r11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "VMOV.32 q0, q13\n"
 "AESE.8 q0, q1\n"
@@ -23252,9 +23252,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in,
 /* double block */
 "1: \n"
 "CMP r11, #1 \n"
-"BEQ 2f \n"
+"B.EQ 2f \n"
 "CMP r11, #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "VMOV.32 q0, q13 \n"
 "AESE.8 q0, q1\n"
@@ -25017,11 +25017,11 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "# Put last 2 blocks of keys based on rounds into v14, v15\n"
 "SUBS WZR, %w[rounds], #14 \n"
-"BEQ 40f \n"
+"B.EQ 40f \n"
 "SUBS WZR, %w[rounds], #12 \n"
 "MOV v14.16b, v12.16b \n"
 "MOV v15.16b, v13.16b \n"
-"BEQ 40f \n"
+"B.EQ 40f \n"
 "MOV v14.16b, v10.16b \n"
 "MOV v15.16b, v11.16b \n"
 "40: \n"
@@ -25041,17 +25041,17 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "# Put last 2 blocks of keys based on rounds into v14, v15\n"
 "SUBS WZR, %w[rounds], #14 \n"
-"BEQ 41f \n"
+"B.EQ 41f \n"
 "SUBS WZR, %w[rounds], #10 \n"
 "MOV v14.16b, v10.16b \n"
 "MOV v15.16b, v11.16b \n"
-"BEQ 41f \n"
+"B.EQ 41f \n"
 "MOV v14.16b, v12.16b \n"
 "MOV v15.16b, v13.16b \n"
 "41: \n"
 "SUBS WZR, %w[blocks], #4 \n"
-"BLT 1f \n"
+"B.LT 1f \n"
 "AND %w[sz], %w[sz], 0x3f \n"
@@ -25174,7 +25174,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "AESMC v19.16b, v19.16b \n"
 "SUBS WZR, %w[rounds], #10 \n"
-"BEQ 21f \n"
+"B.EQ 21f \n"
 "AESE v16.16b, v10.16b \n"
 "AESMC v16.16b, v16.16b \n"
 "AESE v17.16b, v10.16b \n"
@@ -25193,7 +25193,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "AESMC v19.16b, v19.16b \n"
 "SUBS WZR, %w[rounds], #12 \n"
-"BEQ 21f \n"
+"B.EQ 21f \n"
 "AESE v16.16b, v12.16b \n"
 "AESMC v16.16b, v16.16b \n"
 "AESE v17.16b, v12.16b \n"
@@ -25231,7 +25231,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "ST1 {v16.16b-v19.16b}, [%[out]], #64 \n"
 "SUBS %w[blocks], %w[blocks], #4 \n"
-"BGE 20b \n"
+"B.GE 20b \n"
 "ADD %w[blocks], %w[blocks], #4 \n"
 "CBZ %w[sz], 3f \n"
@@ -25340,11 +25340,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "LD1 {v13.2d-v15.2d}, [x10] \n"
 "SUBS WZR, %w[rounds], #14 \n"
-"BEQ 40f \n"
+"B.EQ 40f \n"
 "SUBS WZR, %w[rounds], #12 \n"
 "MOV v14.16b, v12.16b \n"
 "MOV v15.16b, v13.16b \n"
-"BEQ 40f \n"
+"B.EQ 40f \n"
 "MOV v14.16b, v10.16b \n"
 "MOV v15.16b, v11.16b \n"
 "40: \n"
@@ -25362,11 +25362,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "LD1 {v13.2d-v15.2d}, [x11] \n"
 "SUBS WZR, %w[rounds], #14 \n"
-"BEQ 41f \n"
+"B.EQ 41f \n"
 "SUBS WZR, %w[rounds], #12 \n"
 "MOV v14.16b, v12.16b \n"
 "MOV v15.16b, v13.16b \n"
-"BEQ 41f \n"
+"B.EQ 41f \n"
 "MOV v14.16b, v10.16b \n"
 "MOV v15.16b, v11.16b \n"
 "41: \n"
@@ -25374,7 +25374,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "CBZ %w[blocks], 3f \n"
 "SUBS WZR, %w[blocks], #4 \n"
-"BLT 1f \n"
+"B.LT 1f \n"
 "AND x17, x19, x10, ASR #63\n"
 "EXTR x12, x10, x9, #63 \n"
@@ -25495,7 +25495,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "AESIMC v19.16b, v19.16b \n"
 "SUBS WZR, %w[rounds], #10 \n"
-"BEQ 21f \n"
+"B.EQ 21f \n"
 "AESD v16.16b, v10.16b \n"
 "AESIMC v16.16b, v16.16b \n"
 "AESD v17.16b, v10.16b \n"
@@ -25514,7 +25514,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "AESIMC v19.16b, v19.16b \n"
 "SUBS WZR, %w[rounds], #12 \n"
-"BEQ 21f \n"
+"B.EQ 21f \n"
 "AESD v16.16b, v12.16b \n"
 "AESIMC v16.16b, v16.16b \n"
 "AESD v17.16b, v12.16b \n"
@@ -25553,7 +25553,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "SUBS %w[blocks], %w[blocks], #4 \n"
 "SUB %w[sz], %w[sz], #64 \n"
-"BGE 20b \n"
+"B.GE 20b \n"
 "ADD %w[blocks], %w[blocks], #4 \n"
 "CBZ %w[sz], 4f \n"
@@ -25914,7 +25914,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "BGT 1b \n"
 "CMP %[sz], #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "30: \n"
 "#Partial block \n"
@@ -26026,7 +26026,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "VLD1.32 {d18, d19}, [%[key2]]! \n"
 "CMP %[blocks], #0 \n"
-"BEQ 3f \n"
+"B.EQ 3f \n"
 "1: \n"
 "VLD1.32 {q0}, [%[in]]! \n"
@@ -26050,7 +26050,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
 "BGT 1b \n"
 "CMP %[sz], #0 \n"
-"BEQ 4f \n"
+"B.EQ 4f \n"
 "3: \n"


@@ -868,7 +868,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert1_%=\n\t"
+"b.ne L_fe_invert1_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -969,7 +969,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert2_%=\n\t"
+"b.ne L_fe_invert2_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -1070,7 +1070,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert3_%=\n\t"
+"b.ne L_fe_invert3_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -1171,7 +1171,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert4_%=\n\t"
+"b.ne L_fe_invert4_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -1270,7 +1270,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert5_%=\n\t"
+"b.ne L_fe_invert5_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -1371,7 +1371,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert6_%=\n\t"
+"b.ne L_fe_invert6_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -1472,7 +1472,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert7_%=\n\t"
+"b.ne L_fe_invert7_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -1571,7 +1571,7 @@ void fe_invert(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x20, x20, #1\n\t"
-"bne L_fe_invert8_%=\n\t"
+"b.ne L_fe_invert8_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"
@@ -2830,7 +2830,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_1_%=\n\t"
+"b.ne L_curve25519_inv_1_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -2931,7 +2931,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_2_%=\n\t"
+"b.ne L_curve25519_inv_2_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -3032,7 +3032,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_3_%=\n\t"
+"b.ne L_curve25519_inv_3_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #144]\n\t"
 "stp x8, x9, [x29, #160]\n\t"
@@ -3133,7 +3133,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_4_%=\n\t"
+"b.ne L_curve25519_inv_4_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -3232,7 +3232,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_5_%=\n\t"
+"b.ne L_curve25519_inv_5_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -3333,7 +3333,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_6_%=\n\t"
+"b.ne L_curve25519_inv_6_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #144]\n\t"
 "stp x8, x9, [x29, #160]\n\t"
@@ -3434,7 +3434,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_7_%=\n\t"
+"b.ne L_curve25519_inv_7_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #112]\n\t"
 "stp x8, x9, [x29, #128]\n\t"
@@ -3533,7 +3533,7 @@ int curve25519(byte* r, const byte* n, const byte* a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x24, x24, #1\n\t"
-"bne L_curve25519_inv_8_%=\n\t"
+"b.ne L_curve25519_inv_8_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -3854,7 +3854,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_1_%=\n\t"
+"b.ne L_fe_pow22523_1_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"
@@ -3957,7 +3957,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_2_%=\n\t"
+"b.ne L_fe_pow22523_2_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"
@@ -4058,7 +4058,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_3_%=\n\t"
+"b.ne L_fe_pow22523_3_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -4159,7 +4159,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_4_%=\n\t"
+"b.ne L_fe_pow22523_4_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"
@@ -4258,7 +4258,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_5_%=\n\t"
+"b.ne L_fe_pow22523_5_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"
@@ -4359,7 +4359,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_6_%=\n\t"
+"b.ne L_fe_pow22523_6_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #80]\n\t"
 "stp x8, x9, [x29, #96]\n\t"
@@ -4460,7 +4460,7 @@ void fe_pow22523(fe r, const fe a)
 "adcs x8, x12, x15\n\t"
 "adc x9, x13, x16\n\t"
 "subs x23, x23, #1\n\t"
-"bne L_fe_pow22523_7_%=\n\t"
+"b.ne L_fe_pow22523_7_%=\n\t"
 /* Store */
 "stp x6, x7, [x29, #48]\n\t"
 "stp x8, x9, [x29, #64]\n\t"

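The loop labels above end in %=, the GCC/Clang extended-asm sequence that expands to a number unique to each asm statement, so labels such as L_fe_invert1_%= cannot collide if the compiler emits the block more than once. A minimal sketch of the pattern (hypothetical helper, not from this patch; assumes n >= 1):

    /* Zero n 64-bit words starting at p. */
    static void zero_words(unsigned long long *p, unsigned long long n)
    {
        __asm__ __volatile__ (
            "L_zero_words_%=:     \n\t" /* %= keeps the label unique */
            "str xzr, [%[p]], #8  \n\t"
            "subs %[n], %[n], #1  \n\t"
            "b.ne L_zero_words_%= \n\t" /* explicit-suffix branch form */
            : [p] "+r" (p), [n] "+r" (n)
            :
            : "cc", "memory"
        );
    }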

@@ -8553,7 +8553,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
 "orr v10.16b, v10.16b, v2.16b\n\t"
 "orr v11.16b, v11.16b, v3.16b\n\t"
 "subs %w[sz], %w[sz], #0x300\n\t"
-"beq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
+"b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
 "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t"
 "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t"
 "eor v0.16b, v0.16b, v4.16b\n\t"
@@ -8605,7 +8605,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
 "orr v10.16b, v10.16b, v2.16b\n\t"
 "orr v11.16b, v11.16b, v3.16b\n\t"
 "subs %w[sz], %w[sz], #0x140\n\t"
-"beq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
+"b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t"
 "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t"
 "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t"
 "eor v0.16b, v0.16b, v4.16b\n\t"
@@ -9278,9 +9278,9 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
 "ldr q3, [x5]\n\t"
 "ldr q2, [x6]\n\t"
 "subs wzr, %w[len], #0\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "subs wzr, %w[len], #16\n\t"
-"blt L_mlkem_rej_uniform_loop_4_%=\n\t"
+"b.lt L_mlkem_rej_uniform_loop_4_%=\n\t"
 "\n"
 "L_mlkem_rej_uniform_loop_16_%=: \n\t"
 "ld3 {v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t"
@@ -9323,17 +9323,17 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
 "add %x[p], %x[p], x11, lsl 1\n\t"
 "add x12, x12, x11\n\t"
 "subs %w[rLen], %w[rLen], #24\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "sub w10, %w[len], w12\n\t"
 "subs x10, x10, #16\n\t"
-"blt L_mlkem_rej_uniform_loop_4_%=\n\t"
+"b.lt L_mlkem_rej_uniform_loop_4_%=\n\t"
 "b L_mlkem_rej_uniform_loop_16_%=\n\t"
 "\n"
 "L_mlkem_rej_uniform_loop_4_%=: \n\t"
 "subs w10, %w[len], w12\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "subs x10, x10, #4\n\t"
-"blt L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
+"b.lt L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
 "ldr x4, [%x[r]], #6\n\t"
 "lsr x5, x4, #12\n\t"
 "lsr x6, x4, #24\n\t"
@@ -9363,7 +9363,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
 "cinc %x[p], %x[p], lt\n\t"
 "cinc x12, x12, lt\n\t"
 "subs %w[rLen], %w[rLen], #6\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "b L_mlkem_rej_uniform_loop_4_%=\n\t"
 "\n"
 "L_mlkem_rej_uniform_loop_lt_4_%=: \n\t"
@@ -9381,30 +9381,30 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
 "cinc %x[p], %x[p], lt\n\t"
 "cinc x12, x12, lt\n\t"
 "subs wzr, %w[len], w12\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "strh w5, [%x[p]]\n\t"
 "subs xzr, x5, x13\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc x12, x12, lt\n\t"
 "subs wzr, %w[len], w12\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "strh w6, [%x[p]]\n\t"
 "subs xzr, x6, x13\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc x12, x12, lt\n\t"
 "subs wzr, %w[len], w12\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "strh w7, [%x[p]]\n\t"
 "subs xzr, x7, x13\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc %x[p], %x[p], lt\n\t"
 "cinc x12, x12, lt\n\t"
 "subs wzr, %w[len], w12\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "subs %w[rLen], %w[rLen], #6\n\t"
-"beq L_mlkem_rej_uniform_done_%=\n\t"
+"b.eq L_mlkem_rej_uniform_done_%=\n\t"
 "b L_mlkem_rej_uniform_loop_lt_4_%=\n\t"
 "\n"
 "L_mlkem_rej_uniform_done_%=: \n\t"
@@ -9695,7 +9695,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
 "mov v30.d[1], %x[state]\n\t"
 "eor x1, x1, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
+"b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@@ -10037,7 +10037,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
 "mov v30.d[1], %x[state]\n\t"
 "eor x2, x2, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
+"b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@@ -10379,7 +10379,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
 "mov v30.d[1], %x[state]\n\t"
 "eor x2, x2, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
+"b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@@ -10785,7 +10785,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
 "mov v30.d[1], %x[state]\n\t"
 "eor x1, x1, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
+"b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@@ -11212,7 +11212,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
 "mov v30.d[1], %x[state]\n\t"
 "eor x2, x2, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
+"b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
@@ -11639,7 +11639,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
 "mov v30.d[1], %x[state]\n\t"
 "eor x2, x2, %x[state]\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
+"b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"

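A note on the comparisons above: on AArch64, CMP is an alias for SUBS with the zero register as destination, so "subs wzr, %w[len], #16" and "cmp %w[len], #16" set the NZCV flags identically while discarding the arithmetic result. A small hypothetical helper showing the equivalence:

    static int len_is_16(unsigned int len)
    {
        int r;
        __asm__ (
            "subs wzr, %w[len], #16 \n\t" /* same as: cmp %w[len], #16 */
            "cset %w[r], eq         \n\t" /* r = (len == 16) */
            : [r] "=r" (r)
            : [len] "r" (len)
            : "cc"
        );
        return r;
    }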

@@ -48,7 +48,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
 {
 __asm__ __volatile__ (
 /* Check for zero bytes to do. */
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
+"CMP %[bytes], #16 \n\t"
 "BLO L_poly1305_aarch64_16_done_%= \n\t"
 "MOV x12, #1 \n\t"
@@ -127,8 +127,8 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
 "ADCS x5, x5, x15\n\t"
 "ADC x6, x6, xzr\n\t"
-"SUBS %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]\n\t"
-"ADD %[m], %[m], %[POLY1305_BLOCK_SIZE]\n\t"
+"SUBS %[bytes], %[bytes], #16\n\t"
+"ADD %[m], %[m], #16\n\t"
 "BGT L_poly1305_aarch64_16_loop_%=\n\t"
 /* Base 64 -> Base 26 */
@@ -160,7 +160,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 {
 __asm__ __volatile__ (
 /* If less than 4 blocks to process then use regular method */
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+"CMP %[bytes], #64 \n\t"
 "BLO L_poly1305_aarch64_64_done_%= \n\t"
 "MOV x9, #0x3ffffff \n\t"
 /* Load h */
@@ -188,7 +188,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "MOV v26.D[0], x9 \n\t"
 "MOV v26.D[1], x9 \n\t"
 "DUP v30.4S, v26.S[0] \n\t"
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
+"CMP %[bytes], #96 \n\t"
 "BLO L_poly1305_aarch64_64_start_block_size_64_%= \n\t"
 /* Load r^2 to NEON v0, v1, v2, v3, v4 */
 "LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t"
@@ -229,7 +229,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 /* Load m */
 /* Load four message blocks to NEON v10, v11, v12, v13, v14 */
 "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
-"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+"SUB %[bytes], %[bytes], #64 \n\t"
 "USHR v14.4S, v13.4S, #8 \n\t"
 "ORR v14.16B, v14.16B, v30.16B \n\t"
 "SHL v13.4S, v13.4S, #18 \n\t"
@@ -362,12 +362,12 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "UMLAL2 v24.2D, v14.4S, v9.4S \n\t"
 "UMLAL2 v25.2D, v14.4S, v0.4S \n\t"
 /* If less than six message blocks left then leave loop */
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
+"CMP %[bytes], #96 \n\t"
 "BLS L_poly1305_aarch64_64_loop_128_final_%= \n\t"
 /* Load m */
 /* Load four message blocks to NEON v10, v11, v12, v13, v14 */
 "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
-"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+"SUB %[bytes], %[bytes], #64 \n\t"
 "USHR v14.4S, v13.4S, #8 \n\t"
 "ORR v14.16B, v14.16B, v30.16B \n\t"
 "SHL v13.4S, v13.4S, #18 \n\t"
@@ -424,7 +424,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
 /* Copy r^2 to lower half of registers */
 "MOV v0.D[0], v0.D[1] \n\t"
-"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+"SUB %[bytes], %[bytes], #32 \n\t"
 "MOV v5.D[0], v5.D[1] \n\t"
 "USHR v14.2D, v11.2D, #40 \n\t"
 "MOV v1.D[0], v1.D[1] \n\t"
@@ -492,7 +492,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "MOV v18.S[1], v18.S[2] \n\t"
 "MOV v19.S[1], v19.S[2] \n\t"
 /* If less than 2 blocks left go straight to final multiplication. */
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+"CMP %[bytes], #32 \n\t"
 "BLO L_poly1305_aarch64_64_last_mult_%= \n\t"
 /* Else go to one loop of L_poly1305_aarch64_64_loop_64 */
 "B L_poly1305_aarch64_64_loop_64_%= \n\t"
@@ -524,7 +524,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 /* Load m */
 /* Load two message blocks to NEON v10, v11, v12, v13, v14 */
 "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
-"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+"SUB %[bytes], %[bytes], #32 \n\t"
 "USHR v14.2D, v11.2D, #40 \n\t"
 "ORR v14.16B, v14.16B, v26.16B \n\t"
 "USHR v13.2D, v11.2D, #14 \n\t"
@@ -616,7 +616,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
 /* Reduce h % P */
 "MOV x14, #5 \n\t"
-"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+"SUB %[bytes], %[bytes], #32 \n\t"
 "ADD x10, x10, x9, LSR #26 \n\t"
 "USHR v14.2D, v11.2D, #40 \n\t"
 "ADD x13, x13, x12, LSR #26 \n\t"
@@ -676,7 +676,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "MOV v18.S[1], v18.S[2] \n\t"
 "MOV v19.S[1], v19.S[2] \n\t"
 /* If at least two message blocks left then loop_64 */
-"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+"CMP %[bytes], #32 \n\t"
 "BHS L_poly1305_aarch64_64_loop_64_%= \n\t"
 "\n"
 ".align 2 \n\t"
@@ -831,11 +831,9 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
-"w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17",
-"w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28",
-"w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
-"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
-"x28", "x30"
+"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
+"x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28",
+"x30"
 );
 poly1305_blocks_aarch64_16(ctx, m, bytes);
 }
@@ -845,12 +845,10 @@ void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m)
 poly1305_blocks_aarch64_16(ctx, m, POLY1305_BLOCK_SIZE);
 }
-#if defined(POLY130564)
 static word64 clamp[] = {
 0x0ffffffc0fffffff,
 0x0ffffffc0ffffffc,
 };
-#endif /* POLY130564 */
 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
@@ -1112,7 +1108,6 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
 [ctx_r64] "r" (ctx->r64), [ctx_r] "r" (ctx->r),
 [ctx_r_2] "r" (ctx->r_2), [ctx_r_4] "r" (ctx->r_4)
 : "memory", "cc",
-"w4", "w5", "w6", "w7", "w8",
 "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"
 );

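The clamp table made unconditional above holds the standard Poly1305 mask for the multiplier r (per RFC 8439: the top four bits of r's bytes 3, 7, 11, 15 and the bottom two bits of bytes 4, 8, 12 are cleared). A sketch of the assumed usage, since the body of wc_Poly1305SetKey is not shown in this diff:

    #include <stdint.h>
    #include <string.h>

    static const uint64_t clamp[] = {
        0x0ffffffc0fffffffULL, /* mask for r bits  0..63  */
        0x0ffffffc0ffffffcULL, /* mask for r bits 64..127 */
    };

    /* Hypothetical helper: derive the clamped multiplier r from the
     * first 16 key bytes, as Poly1305 key setup requires. */
    static void poly1305_load_r(uint64_t r[2], const unsigned char key[16])
    {
        memcpy(r, key, 16); /* little-endian load on AArch64 */
        r[0] &= clamp[0];
        r[1] &= clamp[1];
    }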

@@ -153,7 +153,7 @@ void BlockSha3_crypto(word64* state)
 "ld1r {v30.2d}, [x1], #8\n\t"
 "subs x2, x2, #1\n\t"
 "eor v0.16b, v0.16b, v30.16b\n\t"
-"bne L_sha3_crypto_begin_%=\n\t"
+"b.ne L_sha3_crypto_begin_%=\n\t"
 "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
 "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
 "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t"
@@ -352,7 +352,7 @@ void BlockSha3_base(word64* state)
 "ldr %x[state], [x27], #8\n\t"
 "subs x28, x28, #1\n\t"
 "eor x1, x1, %x[state]\n\t"
-"bne L_SHA3_transform_base_begin_%=\n\t"
+"b.ne L_SHA3_transform_base_begin_%=\n\t"
 "ldr %x[state], [x29, #40]\n\t"
 "stp x1, x2, [%x[state]]\n\t"
 "stp x3, x4, [%x[state], #16]\n\t"


@@ -629,7 +629,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
 "add x8, x8, x4\n\t"
 "add x4, x4, x14\n\t"
 "subs x27, x27, #1\n\t"
-"bne L_sha512_len_neon_start_%=\n\t"
+"b.ne L_sha512_len_neon_start_%=\n\t"
 /* Round 0 */
 "mov x13, v0.d[0]\n\t"
 "ldr x15, [x3], #8\n\t"
@@ -998,7 +998,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
 "add x3, x3, %[L_SHA512_transform_neon_len_k]@PAGEOFF\n\t"
 #endif /* __APPLE__ */
 "subs %w[len], %w[len], #0x80\n\t"
-"bne L_sha512_len_neon_begin_%=\n\t"
+"b.ne L_sha512_len_neon_begin_%=\n\t"
 "stp x4, x5, [%x[sha512]]\n\t"
 "stp x6, x7, [%x[sha512], #16]\n\t"
 "stp x8, x9, [%x[sha512], #32]\n\t"
@@ -1576,7 +1576,7 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
 "add v25.2d, v25.2d, v29.2d\n\t"
 "add v24.2d, v24.2d, v28.2d\n\t"
 "subs %w[len], %w[len], #0x80\n\t"
-"bne L_sha512_len_crypto_begin_%=\n\t"
+"b.ne L_sha512_len_crypto_begin_%=\n\t"
 /* Store digest back */
 "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t"
 : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)


@@ -93,7 +93,7 @@ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -3351,7 +3351,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m,
 "adc x3, x3, xzr\n\t"
 "subs x4, x4, 1\n\t"
 "add %[a], %[a], 8\n\t"
-"bne 1b\n\t"
+"b.ne 1b\n\t"
 "# Create mask\n\t"
 "neg x3, x3\n\t"
 "mov x9, %[a]\n\t"
@@ -6980,7 +6980,7 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -16577,7 +16577,7 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -39659,7 +39659,7 @@ static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -43865,7 +43865,7 @@ SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit*
 "adc x3, x3, xzr\n\t"
 "subs x4, x4, 1\n\t"
 "add %[a], %[a], 8\n\t"
-"bne 1b\n\t"
+"b.ne 1b\n\t"
 "# Create mask\n\t"
 "neg x3, x3\n\t"
 "mov x9, %[a]\n\t"
@@ -66408,7 +66408,7 @@ static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -72238,7 +72238,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m,
 "# mu = a[i] * mp\n\t"
 "mul x9, %[mp], x13\n\t"
 "cmp x4, #1\n\t"
-"bne L_521_mont_reduce_9_nomask\n\t"
+"b.ne L_521_mont_reduce_9_nomask\n\t"
 "and x9, x9, #0x1ff\n\t"
 "L_521_mont_reduce_9_nomask:\n\t"
 "# a[i+0] += m[0] * mu\n\t"
@@ -72312,7 +72312,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m,
 "adc x3, x3, xzr\n\t"
 "subs x4, x4, 1\n\t"
 "add %[a], %[a], 8\n\t"
-"bne 1b\n\t"
+"b.ne 1b\n\t"
 "extr x12, x13, x12, 9\n\t"
 "extr x13, x14, x13, 9\n\t"
 "extr x14, x15, x14, 9\n\t"
@@ -111555,7 +111555,7 @@ static void sp_521_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"
@@ -115993,7 +115993,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m,
 "adc x3, x3, xzr\n\t"
 "subs x4, x4, 1\n\t"
 "add %[a], %[a], 8\n\t"
-"bne 1b\n\t"
+"b.ne 1b\n\t"
 "# Create mask\n\t"
 "subs x11, x10, x28\n\t"
 "neg x3, x3\n\t"
@@ -125143,7 +125143,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n)
 "sub x4, x4, 8\n\t"
 "subs x6, %[n], 8\n\t"
 "mov x7, xzr\n\t"
-"blt 2f\n\t"
+"b.lt 2f\n\t"
 /* Put in multiples of 8 bytes. */
 "1:\n\t"
 "ldr x8, [x4], -8\n\t"