Green Hills compiler fixes

internal.c: Move non-enumeration value out of switch.
ssl.c: Only declare globalRNGMutex when required.
x509.c: initialize ret

armv8-aes.c, armv8-chacha.c: fix branch instructions
armv8-mlkem*: ensure only required constants are input operands and move
constants closer to first use.
armv8-poly1305.c: remove POLY1305_BLOCK_SIZE from input operands.
armv8-sha3-asm_c.c, armv8-sha512-asm_c.c: use constraint 'i' instead of
'S'.
armv8-sha512.c: initialize initfp. It is always used.
pull/8774/head
Sean Parkinson 2025-05-20 10:55:26 +10:00
parent f8bb889712
commit fc1d281268
11 changed files with 377 additions and 584 deletions

View File

@ -26488,7 +26488,7 @@ const char* wolfSSL_ERR_reason_error_string(unsigned long e)
return "peer ip address mismatch";
case WANT_READ :
case -WOLFSSL_ERROR_WANT_READ :
case WOLFSSL_ERROR_WANT_READ_E :
return "non-blocking socket wants data to be read";
case NOT_READY_ERROR :
@ -26498,17 +26498,17 @@ const char* wolfSSL_ERR_reason_error_string(unsigned long e)
return "record layer version error";
case WANT_WRITE :
case -WOLFSSL_ERROR_WANT_WRITE :
case WOLFSSL_ERROR_WANT_WRITE_E :
return "non-blocking socket write buffer full";
case -WOLFSSL_ERROR_WANT_CONNECT:
case -WOLFSSL_ERROR_WANT_ACCEPT:
case WOLFSSL_ERROR_WANT_CONNECT_E :
case WOLFSSL_ERROR_WANT_ACCEPT_E :
return "The underlying BIO was not yet connected";
case -WOLFSSL_ERROR_SYSCALL:
case WOLFSSL_ERROR_SYSCALL_E :
return "fatal I/O error in TLS layer";
case -WOLFSSL_ERROR_WANT_X509_LOOKUP:
case WOLFSSL_ERROR_WANT_X509_LOOKUP_E :
return "application client cert callback asked to be called again";
case BUFFER_ERROR :
@ -26548,7 +26548,7 @@ const char* wolfSSL_ERR_reason_error_string(unsigned long e)
return "can't decode peer key";
case ZERO_RETURN:
case -WOLFSSL_ERROR_ZERO_RETURN:
case WOLFSSL_ERROR_ZERO_RETURN_E :
return "peer sent close notify alert";
case ECC_CURVETYPE_ERROR:

View File

@ -234,8 +234,10 @@ static struct SystemCryptoPolicy crypto_policy;
static WC_RNG globalRNG;
static volatile int initGlobalRNG = 0;
#if defined(OPENSSL_EXTRA) || !defined(WOLFSSL_MUTEX_INITIALIZER)
static WC_MAYBE_UNUSED wolfSSL_Mutex globalRNGMutex
WOLFSSL_MUTEX_INITIALIZER_CLAUSE(globalRNGMutex);
#endif
#ifndef WOLFSSL_MUTEX_INITIALIZER
static int globalRNGMutex_valid = 0;
#endif

View File

@ -5507,7 +5507,7 @@ int wolfSSL_X509_NAME_get_text_by_NID(WOLFSSL_X509_NAME* name,
WOLFSSL_EVP_PKEY* wolfSSL_X509_get_pubkey(WOLFSSL_X509* x509)
{
WOLFSSL_EVP_PKEY* key = NULL;
int ret;
int ret = 0;
(void)ret;

File diff suppressed because it is too large Load Diff

View File

@ -556,7 +556,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"SRI v6.4s, v17.4s, #25 \n\t"
"SRI v7.4s, v18.4s, #25 \n\t"
"SRI v4.4s, v19.4s, #25 \n\t"
"BNE L_chacha20_arm64_inner_%= \n\t"
"B.NE L_chacha20_arm64_inner_%= \n\t"
/* Add counter now rather than after transposed */
"ADD v12.4s, v12.4s, v28.4s \n\t"
"ADD w16, w16, w21 \n\t"
@ -666,7 +666,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
"SUBS %[bytes], %[bytes], #320 \n\t"
"ADD v28.4s, v28.4s, v29.4s \n\t"
"BNE L_chacha20_arm64_outer_%= \n\t"
"B.NE L_chacha20_arm64_outer_%= \n\t"
: [input] "+r" (input), [m] "+r" (m), [c] "+r" (c),
[bytes] "+r" (bytes64)
: [L_chacha20_neon_add_all_cntrs] "r" (L_chacha20_neon_add_all_cntrs),
@ -959,7 +959,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(
"EXT v9.16B, v9.16B, v9.16B, #12 \n\t"
"EXT v10.16B, v10.16B, v10.16B, #8 \n\t"
"EXT v11.16B, v11.16B, v11.16B, #4 \n\t"
"BNE L_chacha20_arm64_256_loop_%= \n\t"
"B.NE L_chacha20_arm64_256_loop_%= \n\t"
/* Load message */
"LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t"
/* Add one (2 added during calculating vector results) */
@ -1364,7 +1364,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(
"ROR r4, r4, #25 \n\t" // 4 4
"VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one
"BNE L_chacha20_arm32_256_loop_%= \n\t"
"B.NE L_chacha20_arm32_256_loop_%= \n\t"
// r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
// 0 1 2 3 4 5 6 7 8 9 12 13 14
@ -1583,7 +1583,7 @@ static WC_INLINE int wc_Chacha_encrypt_128(
"EXT v5.16B, v5.16B, v5.16B, #12 \n\t"
"EXT v6.16B, v6.16B, v6.16B, #8 \n\t"
"EXT v7.16B, v7.16B, v7.16B, #4 \n\t"
"BNE L_chacha20_arm64_128_loop_%= \n\t"
"B.NE L_chacha20_arm64_128_loop_%= \n\t"
/* Add back state, XOR in message and store (load next block) */
"ADD v0.4S, v0.4S, v18.4S \n\t"
"ADD v1.4S, v1.4S, v19.4S \n\t"
@ -1736,7 +1736,7 @@ static WC_INLINE int wc_Chacha_encrypt_128(
"VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
"VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one
"BNE L_chacha20_arm32_128_loop_%= \n\t"
"B.NE L_chacha20_arm32_128_loop_%= \n\t"
"VMOV.I32 q8, #0 \n\t"
"VADD.I32 q0, q0, q10 \n\t"
@ -2251,7 +2251,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"ADD v2.4S, v2.4S, v10.4S \n\t"
"ADD v3.4S, v3.4S, v11.4S \n\t"
"CMP %[bytes], #64 \n\t"
"BLT L_chacha20_arm64_64_lt_64_%= \n\t"
"B.LT L_chacha20_arm64_64_lt_64_%= \n\t"
"LD1 {v4.4S-v7.4S}, [%[m]], #64 \n\t"
"EOR v4.16B, v4.16B, v0.16B \n\t"
"EOR v5.16B, v5.16B, v1.16B \n\t"
@ -2260,13 +2260,13 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t"
"SUBS %[bytes], %[bytes], #64 \n\t"
"ADD v11.4S, v11.4S, v14.4S \n\t"
"BNE L_chacha20_arm64_64_loop_%= \n\t"
"B.NE L_chacha20_arm64_64_loop_%= \n\t"
"B L_chacha20_arm64_64_done_%= \n\t"
"\n"
"L_chacha20_arm64_64_lt_64_%=: \n\t"
"ST1 {v0.4s-v3.4s}, [%[over]]\n\t"
"CMP %[bytes], #32 \n\t"
"BLT L_chacha20_arm64_64_lt_32_%= \n\t"
"B.LT L_chacha20_arm64_64_lt_32_%= \n\t"
"LD1 {v4.4S, v5.4S}, [%[m]], #32 \n\t"
"EOR v4.16B, v4.16B, v0.16B \n\t"
"EOR v5.16B, v5.16B, v1.16B \n\t"
@ -2274,27 +2274,27 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"SUBS %[bytes], %[bytes], #32 \n\t"
"MOV v0.16B, v2.16B \n\t"
"MOV v1.16B, v3.16B \n\t"
"BEQ L_chacha20_arm64_64_done_%= \n\t"
"B.EQ L_chacha20_arm64_64_done_%= \n\t"
"\n"
"L_chacha20_arm64_64_lt_32_%=: \n\t"
"CMP %[bytes], #16 \n\t"
"BLT L_chacha20_arm64_64_lt_16_%= \n\t"
"B.LT L_chacha20_arm64_64_lt_16_%= \n\t"
"LD1 {v4.4S}, [%[m]], #16 \n\t"
"EOR v4.16B, v4.16B, v0.16B \n\t"
"ST1 {v4.4S}, [%[c]], #16 \n\t"
"SUBS %[bytes], %[bytes], #16 \n\t"
"MOV v0.16B, v1.16B \n\t"
"BEQ L_chacha20_arm64_64_done_%= \n\t"
"B.EQ L_chacha20_arm64_64_done_%= \n\t"
"\n"
"L_chacha20_arm64_64_lt_16_%=: \n\t"
"CMP %[bytes], #8 \n\t"
"BLT L_chacha20_arm64_64_lt_8_%= \n\t"
"B.LT L_chacha20_arm64_64_lt_8_%= \n\t"
"LD1 {v4.2S}, [%[m]], #8 \n\t"
"EOR v4.8B, v4.8B, v0.8B \n\t"
"ST1 {v4.2S}, [%[c]], #8 \n\t"
"SUBS %[bytes], %[bytes], #8 \n\t"
"MOV v0.D[0], v0.D[1] \n\t"
"BEQ L_chacha20_arm64_64_done_%= \n\t"
"B.EQ L_chacha20_arm64_64_done_%= \n\t"
"\n"
"L_chacha20_arm64_64_lt_8_%=: \n\t"
"MOV x4, v0.D[0] \n\t"
@ -2305,7 +2305,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"STRB w6, [%[c]], #1 \n\t"
"SUBS %[bytes], %[bytes], #1 \n\t"
"LSR x4, x4, #8 \n\t"
"BGT L_chacha20_arm64_64_loop_lt_8_%= \n\t"
"B.GT L_chacha20_arm64_64_loop_lt_8_%= \n\t"
"\n"
"L_chacha20_arm64_64_done_%=: \n\t"
: [input] "+r" (input), [m] "+r" (m), [c] "+r" (c),
@ -2816,7 +2816,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"VADD.I32 q2, q2, q10 \n\t"
"VADD.I32 q3, q3, q11 \n\t"
"CMP %[bytes], #64 \n\t"
"BLT L_chacha20_arm32_64_lt_64_%= \n\t"
"B.LT L_chacha20_arm32_64_lt_64_%= \n\t"
/* XOR full 64 byte block */
"VLD1.8 { q4, q5 }, [%[m]]! \n\t"
"VLD1.8 { q6, q7 }, [%[m]]! \n\t"
@ -2828,14 +2828,14 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"VST1.8 { q2, q3 }, [%[c]]! \n\t"
"SUBS %[bytes], %[bytes], #64 \n\t"
"VADD.I32 q11, q11, q14 \n\t"
"BNE L_chacha20_arm32_64_outer_loop_%= \n\t"
"B.NE L_chacha20_arm32_64_outer_loop_%= \n\t"
"B L_chacha20_arm32_64_done_%= \n\t"
"\n"
"L_chacha20_arm32_64_lt_64_%=: \n\t"
"VSTM %[over], {q0-q3} \n\t"
/* XOR 32 bytes */
"CMP %[bytes], #32 \n\t"
"BLT L_chacha20_arm32_64_lt_32_%= \n\t"
"B.LT L_chacha20_arm32_64_lt_32_%= \n\t"
"VLD1.8 { q4, q5 }, [%[m]]! \n\t"
"VEOR q4, q4, q0 \n\t"
"VEOR q5, q5, q1 \n\t"
@ -2843,41 +2843,41 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"SUBS %[bytes], %[bytes], #32 \n\t"
"VMOV q0, q2 \n\t"
"VMOV q1, q3 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t"
"B.EQ L_chacha20_arm32_64_done_%= \n\t"
"\n"
"L_chacha20_arm32_64_lt_32_%=: \n\t"
/* XOR 16 bytes */
"CMP %[bytes], #16 \n\t"
"BLT L_chacha20_arm32_64_lt_16_%= \n\t"
"B.LT L_chacha20_arm32_64_lt_16_%= \n\t"
"VLD1.8 { q4 }, [%[m]]! \n\t"
"VEOR q4, q4, q0 \n\t"
"VST1.8 { q4 }, [%[c]]! \n\t"
"SUBS %[bytes], %[bytes], #16 \n\t"
"VMOV q0, q1 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t"
"B.EQ L_chacha20_arm32_64_done_%= \n\t"
"\n"
"L_chacha20_arm32_64_lt_16_%=: \n\t"
/* XOR 8 bytes */
"CMP %[bytes], #8 \n\t"
"BLT L_chacha20_arm32_64_lt_8_%= \n\t"
"B.LT L_chacha20_arm32_64_lt_8_%= \n\t"
"VLD1.8 { d8 }, [%[m]]! \n\t"
"VEOR d8, d8, d0 \n\t"
"VST1.8 { d8 }, [%[c]]! \n\t"
"SUBS %[bytes], %[bytes], #8 \n\t"
"VMOV d0, d1 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t"
"B.EQ L_chacha20_arm32_64_done_%= \n\t"
"\n"
"L_chacha20_arm32_64_lt_8_%=: \n\t"
/* XOR 4 bytes */
"CMP %[bytes], #4 \n\t"
"BLT L_chacha20_arm32_64_lt_4_%= \n\t"
"B.LT L_chacha20_arm32_64_lt_4_%= \n\t"
"LDR r12, [%[m]], #4 \n\t"
"VMOV r14, d0[0] \n\t"
"EOR r12, r12, r14 \n\t"
"STR r12, [%[c]], #4 \n\t"
"SUBS %[bytes], %[bytes], #4 \n\t"
"VSHR.U64 d0, d0, #32 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t"
"B.EQ L_chacha20_arm32_64_done_%= \n\t"
"\n"
"L_chacha20_arm32_64_lt_4_%=: \n\t"
/* XOR remaining bytes */
@ -2889,7 +2889,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"STRB r12, [%[c]], #1 \n\t"
"SUBS %[bytes], %[bytes], #1 \n\t"
"LSR r14, r14, #8 \n\t"
"BGT L_chacha20_arm32_64_lt_4_loop_%= \n\t"
"B.GT L_chacha20_arm32_64_lt_4_loop_%= \n\t"
"\n"
"L_chacha20_arm32_64_done_%=: \n\t"
: [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes)

View File

@ -29,21 +29,6 @@
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_q, %object
.section .rodata
.size L_mlkem_aarch64_q, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_q:
.short 0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_consts, %object
@ -59,44 +44,6 @@ L_mlkem_aarch64_q:
#endif /* __APPLE__ */
L_mlkem_aarch64_consts:
.short 0x0d01,0xf301,0x4ebf,0x0549,0x5049,0x0000,0x0000,0x0000
#ifndef __APPLE__
.text
.type L_sha3_aarch64_r, %object
.section .rodata
.size L_sha3_aarch64_r, 192
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 3
#else
.p2align 3
#endif /* __APPLE__ */
L_sha3_aarch64_r:
.xword 0x0000000000000001
.xword 0x0000000000008082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x000000000000808b
.xword 0x0000000080000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x000000000000008a
.xword 0x0000000000000088
.xword 0x0000000080008009
.xword 0x000000008000000a
.xword 0x000000008000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x000000000000800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x0000000080000001
.xword 0x8000000080008008
#ifdef WOLFSSL_WC_MLKEM
#ifndef __APPLE__
.text
@ -7006,6 +6953,21 @@ _mlkem_basemul_mont_add:
#ifndef __APPLE__
.size mlkem_basemul_mont_add,.-mlkem_basemul_mont_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.type L_mlkem_aarch64_q, %object
.section .rodata
.size L_mlkem_aarch64_q, 16
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 2
#else
.p2align 2
#endif /* __APPLE__ */
L_mlkem_aarch64_q:
.short 0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01
#ifndef __APPLE__
.text
.globl mlkem_csubq_neon
@ -9724,6 +9686,44 @@ L_mlkem_rej_uniform_done:
#ifndef __APPLE__
.size mlkem_rej_uniform_neon,.-mlkem_rej_uniform_neon
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.type L_sha3_aarch64_r, %object
.section .rodata
.size L_sha3_aarch64_r, 192
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 3
#else
.p2align 3
#endif /* __APPLE__ */
L_sha3_aarch64_r:
.xword 0x0000000000000001
.xword 0x0000000000008082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x000000000000808b
.xword 0x0000000080000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x000000000000008a
.xword 0x0000000000000088
.xword 0x0000000080008009
.xword 0x000000008000000a
.xword 0x000000008000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x000000000000800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x0000000080000001
.xword 0x8000000080008008
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
#ifndef __APPLE__
.text

View File

@ -30,29 +30,10 @@
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef WOLFSSL_ARMASM_INLINE
static const word16 L_mlkem_aarch64_q[] = {
0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01,
};
static const word16 L_mlkem_aarch64_consts[] = {
0x0d01, 0xf301, 0x4ebf, 0x0549, 0x5049, 0x0000, 0x0000, 0x0000,
};
static const word64 L_sha3_aarch64_r[] = {
0x0000000000000001, 0x0000000000008082,
0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088,
0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b,
0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080,
0x0000000080000001, 0x8000000080008008,
};
#include <wolfssl/wolfcrypt/wc_mlkem.h>
#ifdef WOLFSSL_WC_MLKEM
@ -1405,11 +1386,9 @@ void mlkem_ntt(sword16* r)
"stp q17, q18, [x1, #192]\n\t"
"stp q19, q20, [x1, #224]\n\t"
: [r] "+r" (r)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts]),
[L_mlkem_aarch64_zetas] "i" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "i" (L_mlkem_aarch64_zetas_qinv)
: "memory", "cc", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
@ -2922,13 +2901,9 @@ void mlkem_invntt(sword16* r)
"str q23, [x1, #208]\n\t"
"str q24, [x1, #240]\n\t"
: [r] "+r" (r)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts]),
[L_mlkem_aarch64_zetas_inv] "i" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "i" (L_mlkem_aarch64_zetas_inv_qinv)
: "memory", "cc", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
@ -4096,13 +4071,7 @@ void mlkem_ntt_sqrdmlsh(sword16* r)
"stp q17, q18, [x1, #192]\n\t"
"stp q19, q20, [x1, #224]\n\t"
: [r] "+r" (r)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
@ -5393,13 +5362,7 @@ void mlkem_invntt_sqrdmlsh(sword16* r)
"str q23, [x1, #208]\n\t"
"str q24, [x1, #240]\n\t"
: [r] "+r" (r)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
@ -6102,14 +6065,8 @@ void mlkem_basemul_mont(sword16* r, const sword16* a, const sword16* b)
"zip2 v25.8h, v22.8h, v23.8h\n\t"
"stp q24, q25, [%x[r], #480]\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts]),
[L_mlkem_aarch64_zetas_mul] "i" (L_mlkem_aarch64_zetas_mul)
: "memory", "cc", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@ -6840,14 +6797,8 @@ void mlkem_basemul_mont_add(sword16* r, const sword16* a, const sword16* b)
"add v29.8h, v29.8h, v25.8h\n\t"
"stp q28, q29, [%x[r], #480]\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts]),
[L_mlkem_aarch64_zetas_mul] "i" (L_mlkem_aarch64_zetas_mul)
: "memory", "cc", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@ -6855,6 +6806,10 @@ void mlkem_basemul_mont_add(sword16* r, const sword16* a, const sword16* b)
);
}
static const word16 L_mlkem_aarch64_q[] = {
0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01,
};
void mlkem_csubq_neon(sword16* p)
{
__asm__ __volatile__ (
@ -7013,14 +6968,7 @@ void mlkem_csubq_neon(sword16* p)
"st4 {v8.8h, v9.8h, v10.8h, v11.8h}, [%x[p]], #0x40\n\t"
"st4 {v12.8h, v13.8h, v14.8h, v15.8h}, [%x[p]], #0x40\n\t"
: [p] "+r" (p)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [L_mlkem_aarch64_q] "i" (L_mlkem_aarch64_q)
: "memory", "cc", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20"
@ -7195,14 +7143,7 @@ void mlkem_add_reduce(sword16* r, const sword16* a)
"st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t"
"st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18"
@ -7417,14 +7358,7 @@ void mlkem_add3_reduce(sword16* r, const sword16* a, const sword16* b)
"st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t"
"st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t"
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
@ -7599,14 +7533,7 @@ void mlkem_rsub_reduce(sword16* r, const sword16* a)
"st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[r]], #0x40\n\t"
"st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[r]], #0x40\n\t"
: [r] "+r" (r), [a] "+r" (a)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18"
@ -7803,14 +7730,7 @@ void mlkem_to_mont(sword16* p)
"st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t"
"st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t"
: [p] "+r" (p)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18"
@ -7976,14 +7896,7 @@ void mlkem_to_mont_sqrdmlsh(sword16* p)
"st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t"
"st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t"
: [p] "+r" (p)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul)
: [%[L_mlkem_aarch64_consts]] "i" (%[L_mlkem_aarch64_consts])
: "memory", "cc", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18"
@ -8231,17 +8144,9 @@ void mlkem_to_msg_neon(byte* msg, sword16* p)
"ins v18.b[7], v25.b[0]\n\t"
"st1 {v18.8b}, [%x[msg]], #8\n\t"
: [msg] "+r" (msg), [p] "+r" (p)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits)
: [L_mlkem_to_msg_low] "i" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "i" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "i" (L_mlkem_to_msg_bits)
: "memory", "cc", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
@ -8415,19 +8320,8 @@ void mlkem_from_msg_neon(sword16* p, const byte* msg)
"and v7.16b, v7.16b, v1.16b\n\t"
"st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%x[p]], #0x40\n\t"
: [p] "+r" (p), [msg] "+r" (msg)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits)
: [L_mlkem_from_msg_q1half] "i" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "i" (L_mlkem_from_msg_bits)
: "memory", "cc", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11"
);
@ -8693,19 +8587,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz)
"subs x0, x0, xzr\n\t"
"csetm w0, ne\n\t"
: [a] "+r" (a), [b] "+r" (b), [sz] "+r" (sz)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11"
);
@ -9410,22 +9292,10 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
"L_mlkem_rej_uniform_done_%=: \n\t"
"mov x0, x12\n\t"
: [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [%[L_mlkem_aarch64_q]] "i" (%[L_mlkem_aarch64_q]),
[L_mlkem_rej_uniform_mask] "i" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "i" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "i" (L_mlkem_rej_uniform_indices)
: "memory", "cc", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
"x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11", "v12", "v13"
@ -9433,6 +9303,21 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r,
return (word32)(size_t)p;
}
static const word64 L_sha3_aarch64_r[] = {
0x0000000000000001, 0x0000000000008082,
0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088,
0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b,
0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080,
0x0000000080000001, 0x8000000080008008,
};
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
void mlkem_sha3_blocksx3_neon(word64* state)
{
@ -9728,22 +9613,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
"str x26, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19",
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0",
@ -10070,22 +9940,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
@ -10412,22 +10267,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
@ -10818,22 +10658,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
"str x26, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19",
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0",
@ -11245,22 +11070,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
@ -11672,22 +11482,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
: [L_mlkem_aarch64_q] "S" (L_mlkem_aarch64_q),
[L_mlkem_aarch64_consts] "S" (L_mlkem_aarch64_consts),
[L_sha3_aarch64_r] "S" (L_sha3_aarch64_r),
[L_mlkem_aarch64_zetas] "S" (L_mlkem_aarch64_zetas),
[L_mlkem_aarch64_zetas_qinv] "S" (L_mlkem_aarch64_zetas_qinv),
[L_mlkem_aarch64_zetas_inv] "S" (L_mlkem_aarch64_zetas_inv),
[L_mlkem_aarch64_zetas_inv_qinv] "S" (L_mlkem_aarch64_zetas_inv_qinv),
[L_mlkem_aarch64_zetas_mul] "S" (L_mlkem_aarch64_zetas_mul),
[L_mlkem_to_msg_low] "S" (L_mlkem_to_msg_low),
[L_mlkem_to_msg_high] "S" (L_mlkem_to_msg_high),
[L_mlkem_to_msg_bits] "S" (L_mlkem_to_msg_bits),
[L_mlkem_from_msg_q1half] "S" (L_mlkem_from_msg_q1half),
[L_mlkem_from_msg_bits] "S" (L_mlkem_from_msg_bits),
[L_mlkem_rej_uniform_mask] "S" (L_mlkem_rej_uniform_mask),
[L_mlkem_rej_uniform_bits] "S" (L_mlkem_rej_uniform_bits),
[L_mlkem_rej_uniform_indices] "S" (L_mlkem_rej_uniform_indices)
: [L_sha3_aarch64_r] "i" (L_sha3_aarch64_r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",

View File

@ -49,7 +49,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
__asm__ __volatile__ (
/* Check for zero bytes to do. */
"CMP %[bytes], #16 \n\t"
"BLO L_poly1305_aarch64_16_done_%= \n\t"
"B.LO L_poly1305_aarch64_16_done_%= \n\t"
"MOV x12, #1 \n\t"
/* Load h */
@ -129,7 +129,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
"SUBS %[bytes], %[bytes], #16\n\t"
"ADD %[m], %[m], #16\n\t"
"BGT L_poly1305_aarch64_16_loop_%=\n\t"
"B.GT L_poly1305_aarch64_16_loop_%=\n\t"
/* Base 64 -> Base 26 */
"MOV x10, #0x3ffffff\n\t"
@ -146,8 +146,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
".align 2 \n\t"
"L_poly1305_aarch64_16_done_%=: \n\t"
: [bytes] "+r" (bytes), [m] "+r" (m)
: [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
[ctx_r64] "m" (ctx->r64[0]), [ctx_h] "r" (ctx->h),
: [ctx_r64] "m" (ctx->r64[0]), [ctx_h] "r" (ctx->h),
[finished] "r" ((word64)ctx->finished)
: "memory", "cc",
"x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
@ -161,7 +160,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
__asm__ __volatile__ (
/* If less than 4 blocks to process then use regular method */
"CMP %[bytes], #64 \n\t"
"BLO L_poly1305_aarch64_64_done_%= \n\t"
"B.LO L_poly1305_aarch64_64_done_%= \n\t"
"MOV x9, #0x3ffffff \n\t"
/* Load h */
"LDP x20, x22, [%[h]] \n\t"
@ -189,7 +188,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v26.D[1], x9 \n\t"
"DUP v30.4S, v26.S[0] \n\t"
"CMP %[bytes], #96 \n\t"
"BLO L_poly1305_aarch64_64_start_block_size_64_%= \n\t"
"B.LO L_poly1305_aarch64_64_start_block_size_64_%= \n\t"
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
"LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t"
"LD1 { v4.S }[2], [%[r_2]] \n\t"
@ -363,7 +362,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"UMLAL2 v25.2D, v14.4S, v0.4S \n\t"
/* If less than six message blocks left then leave loop */
"CMP %[bytes], #96 \n\t"
"BLS L_poly1305_aarch64_64_loop_128_final_%= \n\t"
"B.LS L_poly1305_aarch64_64_loop_128_final_%= \n\t"
/* Load m */
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
@ -493,7 +492,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v19.S[1], v19.S[2] \n\t"
/* If less than 2 blocks left go straight to final multiplication. */
"CMP %[bytes], #32 \n\t"
"BLO L_poly1305_aarch64_64_last_mult_%= \n\t"
"B.LO L_poly1305_aarch64_64_last_mult_%= \n\t"
/* Else go to one loop of L_poly1305_aarch64_64_loop_64 */
"B L_poly1305_aarch64_64_loop_64_%= \n\t"
"\n"
@ -677,7 +676,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
"MOV v19.S[1], v19.S[2] \n\t"
/* If at least two message blocks left then loop_64 */
"CMP %[bytes], #32 \n\t"
"BHS L_poly1305_aarch64_64_loop_64_%= \n\t"
"B.HS L_poly1305_aarch64_64_loop_64_%= \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_aarch64_64_last_mult_%=: \n\t"
@ -821,8 +820,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
: [bytes] "+r" (bytes),
[m] "+r" (m),
[ctx] "+m" (ctx)
: [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
[h] "r" (ctx->h),
: [h] "r" (ctx->h),
[r] "r" (ctx->r),
[r_2] "r" (ctx->r_2),
[r_4] "r" (ctx->r_4),

View File

@ -162,7 +162,7 @@ void BlockSha3_crypto(word64* state)
"st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t"
"st1 {v24.1d}, [%x[state]]\n\t"
: [state] "+r" (state)
: [L_SHA3_transform_crypto_r] "S" (L_SHA3_transform_crypto_r)
: [L_SHA3_transform_crypto_r] "i" (L_SHA3_transform_crypto_r)
: "memory", "cc", "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@ -369,7 +369,7 @@ void BlockSha3_base(word64* state)
"str x26, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state)
: [L_SHA3_transform_base_r] "S" (L_SHA3_transform_base_r)
: [L_SHA3_transform_base_r] "i" (L_SHA3_transform_base_r)
: "memory", "cc", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19",
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"

View File

@ -1004,8 +1004,8 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len)
"stp x8, x9, [%x[sha512], #32]\n\t"
"stp x10, x11, [%x[sha512], #48]\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
: [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k),
[L_SHA512_transform_neon_len_r8] "S" (L_SHA512_transform_neon_len_r8)
: [L_SHA512_transform_neon_len_k] "i" (L_SHA512_transform_neon_len_k),
[L_SHA512_transform_neon_len_r8] "i" (L_SHA512_transform_neon_len_r8)
: "memory", "cc", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2",
@ -1580,7 +1580,7 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
/* Store digest back */
"st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
: [L_SHA512_trans_crypto_len_k] "S" (L_SHA512_trans_crypto_len_k)
: [L_SHA512_trans_crypto_len_k] "i" (L_SHA512_trans_crypto_len_k)
: "memory", "cc", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",

View File

@ -647,9 +647,7 @@ static int Sha512_Family_Final(wc_Sha512* sha512, byte* hash,
{
int ret;
int digestSz;
int (*initfp)(wc_Sha512*);
(void)initfp;
int (*initfp)(wc_Sha512*) = NULL;
if (sha512 == NULL || hash == NULL) {
return BAD_FUNC_ARG;