Kyber ARM32 ASM: add assembly using base instructions

Support ARMv4 up to ARMv8.
Base instructions only - faster implementation will use NEON.
pull/8040/head
Sean Parkinson 2024-10-04 11:06:18 +10:00
parent d0d802a2df
commit f7afc47d98
9 changed files with 19546 additions and 23 deletions

View File

@ -1199,7 +1199,13 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm_
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm.S
endif !BUILD_ARMASM_INLINE
endif BUILD_ARM_THUMB
else
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-kyber-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-kyber-asm.S
endif !BUILD_ARMASM_INLINE
endif !BUILD_ARM_THUMB
endif BUILD_ARMASM
if !BUILD_X86_ASM
if BUILD_INTELASM

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -218,7 +218,7 @@ L_poly1305_arm32_16_loop:
# Load length
ldr r2, [sp, #20]
# Reduce mod 2^130 - 5
bic r3, r8, #3
bic r3, r8, #0x3
and r8, r8, #3
adds r4, r4, r3
lsr r3, r3, #2

View File

@ -243,7 +243,7 @@ void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
/* Load length */
"ldr %[len], [sp, #20]\n\t"
/* Reduce mod 2^130 - 5 */
"bic %[notLast], r8, #3\n\t"
"bic %[notLast], r8, #0x3\n\t"
"and r8, r8, #3\n\t"
"adds r4, r4, %[notLast]\n\t"
"lsr %[notLast], %[notLast], #2\n\t"

View File

@ -182,7 +182,7 @@ const sword16 zetas_inv[KYBER_N / 2] = {
"SSUB16 " #a ", " #a ", r10\n\t"
#if !(defined(__thumb__) || (defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
#if !defined(WOLFSSL_ARMASM)
/* Number-Theoretic Transform.
*
* @param [in, out] r Polynomial to transform.
@ -2154,7 +2154,7 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen)
}
#endif
#if !(defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || defined(__thumb__)))
#if !defined(WOLFSSL_ARMASM)
/* Rejection sampling on uniform random bytes to generate uniform random
* integers mod q.
*
@ -3350,7 +3350,7 @@ int kyber_cmp(const byte* a, const byte* b, int sz)
/******************************************************************************/
#if !(defined(__thumb__) || (defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
#if !defined(WOLFSSL_ARMASM)
/* Conditional subtraction of q to each coefficient of a polynomial.
*
@ -3371,10 +3371,14 @@ static KYBER_NOINLINE void kyber_csubq_c(sword16* p)
#define kyber_csubq_c kyber_csubq_neon
#else
#elif defined(__thumb__)
#define kyber_csubq_c kyber_thumb2_csubq
#else
#define kyber_csubq_c kyber_arm32_csubq
#endif
/******************************************************************************/

View File

@ -320,12 +320,28 @@ WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p);
WOLFSSL_LOCAL void kyber_thumb2_ntt(sword16* r);
WOLFSSL_LOCAL void kyber_thumb2_invntt(sword16* r);
WOLFSSL_LOCAL void kyber_thumb2_basemul_mont(sword16* r, const sword16* a,
const sword16* b);
const sword16* b);
WOLFSSL_LOCAL void kyber_thumb2_basemul_mont_add(sword16* r, const sword16* a,
const sword16* b);
const sword16* b);
WOLFSSL_LOCAL void kyber_thumb2_csubq(sword16* p);
WOLFSSL_LOCAL unsigned int kyber_thumb2_rej_uniform(sword16* p,
unsigned int len, const byte* r, unsigned int rLen);
#elif defined(WOLFSSL_ARMASM)
/* Alias the generic Kyber routine names to the ARM32 assembly versions that
 * are declared below. */
#define kyber_ntt kyber_arm32_ntt
#define kyber_invntt kyber_arm32_invntt
#define kyber_basemul_mont kyber_arm32_basemul_mont
#define kyber_basemul_mont_add kyber_arm32_basemul_mont_add
#define kyber_rej_uniform_c kyber_arm32_rej_uniform
/* Prototypes for the ARM32 routines named by the aliases above.
 * NOTE(review): the implementations are presumably provided by
 * armv8-32-kyber-asm.S, or armv8-32-kyber-asm_c.c for inline-assembly
 * builds - confirm against the build rules. */
WOLFSSL_LOCAL void kyber_arm32_ntt(sword16* r);
WOLFSSL_LOCAL void kyber_arm32_invntt(sword16* r);
WOLFSSL_LOCAL void kyber_arm32_basemul_mont(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_arm32_basemul_mont_add(sword16* r, const sword16* a,
const sword16* b);
/* No alias for this routine is defined in this branch.
 * NOTE(review): kyber.c appears to map kyber_csubq_c onto it - confirm. */
WOLFSSL_LOCAL void kyber_arm32_csubq(sword16* p);
WOLFSSL_LOCAL unsigned int kyber_arm32_rej_uniform(sword16* p, unsigned int len,
const byte* r, unsigned int rLen);
#endif
#ifdef __cplusplus