Merge pull request #7998 from SparkiDev/kyber_aarch64_asm

Kyber Aarch64: assembly implementations of functions
pull/8022/head
David Garske 2024-09-26 11:59:06 -07:00 committed by GitHub
commit 2285c02f1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 25658 additions and 107 deletions

View File

@ -2977,7 +2977,7 @@ then
AM_CPPFLAGS="$AM_CPPFLAGS+sm4"
fi
else
AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto"
AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto -DWOLFSSL_AARCH64_NO_SQRMLSH"
fi
;;
esac

View File

@ -1057,6 +1057,13 @@ if BUILD_INTELASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_asm.S
endif
endif
if BUILD_ARMASM_NEON
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-kyber-asm.S
endif !BUILD_ARMASM_INLINE
endif BUILD_ARMASM_NEON
endif
if BUILD_DILITHIUM

View File

@ -337,8 +337,7 @@ _fe_cmov_table:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-128]!
add x29, sp, #0
str x17, [x29, #40]
str x19, [x29, #48]
stp x17, x19, [x29, #40]
stp x20, x21, [x29, #56]
stp x22, x23, [x29, #72]
stp x24, x25, [x29, #88]
@ -546,8 +545,7 @@ _fe_cmov_table:
stp x10, x11, [x0, #48]
stp x12, x13, [x0, #64]
stp x14, x15, [x0, #80]
ldr x17, [x29, #40]
ldr x19, [x29, #48]
ldp x17, x19, [x29, #40]
ldp x20, x21, [x29, #56]
ldp x22, x23, [x29, #72]
ldp x24, x25, [x29, #88]
@ -573,8 +571,7 @@ _fe_mul:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-64]!
add x29, sp, #0
str x17, [x29, #24]
str x19, [x29, #32]
stp x17, x19, [x29, #24]
stp x20, x21, [x29, #40]
str x22, [x29, #56]
# Multiply
@ -703,8 +700,7 @@ _fe_mul:
# Store
stp x6, x7, [x0]
stp x8, x9, [x0, #16]
ldr x17, [x29, #24]
ldr x19, [x29, #32]
ldp x17, x19, [x29, #24]
ldp x20, x21, [x29, #40]
ldr x22, [x29, #56]
ldp x29, x30, [sp], #0x40
@ -835,8 +831,7 @@ _fe_invert:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-176]!
add x29, sp, #0
str x17, [x29, #160]
str x20, [x29, #168]
stp x17, x20, [x29, #160]
# Invert
str x0, [x29, #144]
str x1, [x29, #152]
@ -1694,8 +1689,7 @@ L_fe_invert8:
#else
bl _fe_mul
#endif /* __APPLE__ */
ldr x17, [x29, #160]
ldr x20, [x29, #168]
ldp x17, x20, [x29, #160]
ldp x29, x30, [sp], #0xb0
ret
#ifndef __APPLE__
@ -1715,8 +1709,7 @@ _curve25519:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-288]!
add x29, sp, #0
str x17, [x29, #200]
str x19, [x29, #208]
stp x17, x19, [x29, #200]
stp x20, x21, [x29, #216]
stp x22, x23, [x29, #232]
stp x24, x25, [x29, #248]
@ -3801,8 +3794,7 @@ L_curve25519_inv_8:
stp x14, x15, [x0]
stp x16, x17, [x0, #16]
mov x0, xzr
ldr x17, [x29, #200]
ldr x19, [x29, #208]
ldp x17, x19, [x29, #200]
ldp x20, x21, [x29, #216]
ldp x22, x23, [x29, #232]
ldp x24, x25, [x29, #248]
@ -3828,8 +3820,7 @@ _fe_pow22523:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-144]!
add x29, sp, #0
str x17, [x29, #128]
str x23, [x29, #136]
stp x17, x23, [x29, #128]
# pow22523
str x0, [x29, #112]
str x1, [x29, #120]
@ -4619,8 +4610,7 @@ L_fe_pow22523_7:
#else
bl _fe_mul
#endif /* __APPLE__ */
ldr x17, [x29, #128]
ldr x23, [x29, #136]
ldp x17, x23, [x29, #128]
ldp x29, x30, [sp], #0x90
ret
#ifndef __APPLE__
@ -4640,8 +4630,7 @@ _ge_p1p1_to_p2:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
str x17, [x29, #40]
str x19, [x29, #48]
stp x17, x19, [x29, #40]
stp x20, x21, [x29, #56]
str x22, [x29, #72]
str x0, [x29, #16]
@ -5002,8 +4991,7 @@ _ge_p1p1_to_p2:
# Store
stp x14, x15, [x0]
stp x16, x17, [x0, #16]
ldr x17, [x29, #40]
ldr x19, [x29, #48]
ldp x17, x19, [x29, #40]
ldp x20, x21, [x29, #56]
ldr x22, [x29, #72]
ldp x29, x30, [sp], #0x50
@ -5025,8 +5013,7 @@ _ge_p1p1_to_p3:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-112]!
add x29, sp, #0
str x17, [x29, #40]
str x19, [x29, #48]
stp x17, x19, [x29, #40]
stp x20, x21, [x29, #56]
stp x22, x23, [x29, #72]
stp x24, x25, [x29, #88]
@ -5505,8 +5492,7 @@ _ge_p1p1_to_p3:
# Store
stp x14, x15, [x0]
stp x16, x17, [x0, #16]
ldr x17, [x29, #40]
ldr x19, [x29, #48]
ldp x17, x19, [x29, #40]
ldp x20, x21, [x29, #56]
ldp x22, x23, [x29, #72]
ldp x24, x25, [x29, #88]
@ -5530,8 +5516,7 @@ _ge_p2_dbl:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-128]!
add x29, sp, #0
str x17, [x29, #40]
str x19, [x29, #48]
stp x17, x19, [x29, #40]
stp x20, x21, [x29, #56]
stp x22, x23, [x29, #72]
stp x24, x25, [x29, #88]
@ -5986,8 +5971,7 @@ _ge_p2_dbl:
sbc x7, x7, xzr
stp x4, x5, [x0]
stp x6, x7, [x0, #16]
ldr x17, [x29, #40]
ldr x19, [x29, #48]
ldp x17, x19, [x29, #40]
ldp x20, x21, [x29, #56]
ldp x22, x23, [x29, #72]
ldp x24, x25, [x29, #88]
@ -6012,8 +5996,7 @@ _ge_madd:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-144]!
add x29, sp, #0
str x17, [x29, #56]
str x19, [x29, #64]
stp x17, x19, [x29, #56]
stp x20, x21, [x29, #72]
stp x22, x23, [x29, #88]
stp x24, x25, [x29, #104]
@ -6503,8 +6486,7 @@ _ge_madd:
stp x10, x11, [x0, #16]
stp x4, x5, [x1]
stp x6, x7, [x1, #16]
ldr x17, [x29, #56]
ldr x19, [x29, #64]
ldp x17, x19, [x29, #56]
ldp x20, x21, [x29, #72]
ldp x22, x23, [x29, #88]
ldp x24, x25, [x29, #104]
@ -6529,8 +6511,7 @@ _ge_msub:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-144]!
add x29, sp, #0
str x17, [x29, #56]
str x19, [x29, #64]
stp x17, x19, [x29, #56]
stp x20, x21, [x29, #72]
stp x22, x23, [x29, #88]
stp x24, x25, [x29, #104]
@ -7020,8 +7001,7 @@ _ge_msub:
stp x10, x11, [x0, #16]
stp x4, x5, [x1]
stp x6, x7, [x1, #16]
ldr x17, [x29, #56]
ldr x19, [x29, #64]
ldp x17, x19, [x29, #56]
ldp x20, x21, [x29, #72]
ldp x22, x23, [x29, #88]
ldp x24, x25, [x29, #104]
@ -7046,8 +7026,7 @@ _ge_add:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-144]!
add x29, sp, #0
str x17, [x29, #56]
str x19, [x29, #64]
stp x17, x19, [x29, #56]
stp x20, x21, [x29, #72]
stp x22, x23, [x29, #88]
stp x24, x25, [x29, #104]
@ -7663,8 +7642,7 @@ _ge_add:
stp x23, x24, [x0, #16]
stp x12, x13, [x1]
stp x14, x15, [x1, #16]
ldr x17, [x29, #56]
ldr x19, [x29, #64]
ldp x17, x19, [x29, #56]
ldp x20, x21, [x29, #72]
ldp x22, x23, [x29, #88]
ldp x24, x25, [x29, #104]
@ -7689,8 +7667,7 @@ _ge_sub:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-144]!
add x29, sp, #0
str x17, [x29, #56]
str x19, [x29, #64]
stp x17, x19, [x29, #56]
stp x20, x21, [x29, #72]
stp x22, x23, [x29, #88]
stp x24, x25, [x29, #104]
@ -8321,8 +8298,7 @@ _ge_sub:
stp x14, x15, [x0, #16]
stp x21, x22, [x1]
stp x23, x24, [x1, #16]
ldr x17, [x29, #56]
ldr x19, [x29, #64]
ldp x17, x19, [x29, #56]
ldp x20, x21, [x29, #72]
ldp x22, x23, [x29, #88]
ldp x24, x25, [x29, #104]
@ -8347,8 +8323,7 @@ _sc_reduce:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-64]!
add x29, sp, #0
str x17, [x29, #16]
str x19, [x29, #24]
stp x17, x19, [x29, #16]
stp x20, x21, [x29, #32]
stp x22, x23, [x29, #48]
ldp x2, x3, [x0]
@ -8525,8 +8500,7 @@ _sc_reduce:
# Store result
stp x2, x3, [x0]
stp x4, x5, [x0, #16]
ldr x17, [x29, #16]
ldr x19, [x29, #24]
ldp x17, x19, [x29, #16]
ldp x20, x21, [x29, #32]
ldp x22, x23, [x29, #48]
ldp x29, x30, [sp], #0x40
@ -8548,8 +8522,7 @@ _sc_muladd:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-96]!
add x29, sp, #0
str x17, [x29, #24]
str x19, [x29, #32]
stp x17, x19, [x29, #24]
stp x20, x21, [x29, #40]
stp x22, x23, [x29, #56]
stp x24, x25, [x29, #72]
@ -8824,8 +8797,7 @@ _sc_muladd:
# Store result
stp x4, x5, [x0]
stp x6, x7, [x0, #16]
ldr x17, [x29, #24]
ldr x19, [x29, #32]
ldp x17, x19, [x29, #24]
ldp x20, x21, [x29, #40]
ldp x22, x23, [x29, #56]
ldp x24, x25, [x29, #72]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -47,29 +47,29 @@
.p2align 3
#endif /* __APPLE__ */
L_SHA3_transform_crypto_r:
.xword 0x1
.xword 0x8082
.xword 0x0000000000000001
.xword 0x0000000000008082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x808b
.xword 0x80000001
.xword 0x000000000000808b
.xword 0x0000000080000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x8a
.xword 0x88
.xword 0x80008009
.xword 0x8000000a
.xword 0x8000808b
.xword 0x000000000000008a
.xword 0x0000000000000088
.xword 0x0000000080008009
.xword 0x000000008000000a
.xword 0x000000008000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x800a
.xword 0x000000000000800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x80000001
.xword 0x0000000080000001
.xword 0x8000000080008008
#ifndef __APPLE__
.text
@ -206,6 +206,251 @@ L_sha3_crypto_begin:
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
#endif /* __APPLE__ */
#else
/*
 * L_SHA3_transform_base_r: table of the 24 Keccak-f[1600] round constants,
 * one 64-bit word per round (24 * 8 = 192 bytes, matching the .size
 * directive), aligned to 8 bytes.
 *
 * The base (non-crypto-extension) BlockSha3 routine walks this table with a
 * post-incremented pointer and XORs one word into the first state lane at
 * the end of every round.
 *
 * ELF builds place the table in .rodata; the Apple branch places it in
 * __DATA,__data.
 */
#ifndef __APPLE__
.text
.type L_SHA3_transform_base_r, %object
.section .rodata
.size L_SHA3_transform_base_r, 192
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 3
#else
.p2align 3
#endif /* __APPLE__ */
L_SHA3_transform_base_r:
.xword 0x0000000000000001
.xword 0x0000000000008082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x000000000000808b
.xword 0x0000000080000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x000000000000008a
.xword 0x0000000000000088
.xword 0x0000000080008009
.xword 0x000000008000000a
.xword 0x000000008000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x000000000000800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x0000000080000001
.xword 0x8000000080008008
/*
 * BlockSha3 (exported as _BlockSha3 on Apple)
 *
 * Applies 24 rounds of the Keccak-f[1600] permutation, in place, to the
 * 25 64-bit lanes (200 bytes) addressed by x0, using only base integer
 * instructions.
 *
 * In:    x0 = pointer to the 25-lane state
 * Out:   state overwritten with the permuted lanes; no value is returned
 * Saved: x17, x19-x28 are stored in the frame and restored before return
 *        (x17 is not callee-saved under AAPCS64 but is preserved anyway).
 * Used:  x0-x17 and x19-x28 as lanes/scratch, x30 as scratch (reloaded
 *        from the frame on exit), NZCV flags. x18 is never touched.
 *
 * Frame (160 bytes, addressed through x29):
 *   [x29, #0]    caller x29, x30
 *   [x29, #24]   spilled parity of column 2
 *   [x29, #32]   spilled parity of column 4
 *   [x29, #40]   saved state pointer
 *   [x29, #48]   round constant table pointer (x27)
 *   [x29, #56]   rounds remaining (x28)
 *   [x29, #72]   saved x17, x19-x28 (through offset 152)
 *
 * Lane mapping while in registers: state[0..16] -> x1..x17,
 * state[17..24] -> x19..x26.
 */
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.align 2
BlockSha3:
#else
.section __TEXT,__text
.globl _BlockSha3
.p2align 2
_BlockSha3:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-160]!
add x29, sp, #0
stp x17, x19, [x29, #72]
stp x20, x21, [x29, #88]
stp x22, x23, [x29, #104]
stp x24, x25, [x29, #120]
stp x26, x27, [x29, #136]
str x28, [x29, #152]
# Point x27 at the round constant table
#ifndef __APPLE__
adrp x27, L_SHA3_transform_base_r
add x27, x27, :lo12:L_SHA3_transform_base_r
#else
/* NOTE(review): the inline-C variant of this routine uses @PAGEOFF without
 * the :lo12: prefix; confirm the Mach-O assembler accepts this mixed form. */
adrp x27, L_SHA3_transform_base_r@PAGE
add x27, x27, :lo12:L_SHA3_transform_base_r@PAGEOFF
#endif /* __APPLE__ */
# Load the 25 lanes into x1-x17 and x19-x26 (x18 is skipped)
ldp x1, x2, [x0]
ldp x3, x4, [x0, #16]
ldp x5, x6, [x0, #32]
ldp x7, x8, [x0, #48]
ldp x9, x10, [x0, #64]
ldp x11, x12, [x0, #80]
ldp x13, x14, [x0, #96]
ldp x15, x16, [x0, #112]
ldp x17, x19, [x0, #128]
ldp x20, x21, [x0, #144]
ldp x22, x23, [x0, #160]
ldp x24, x25, [x0, #176]
ldr x26, [x0, #192]
# Park the state pointer so that x0 can be reused as scratch
str x0, [x29, #40]
# Round counter: x28 counts down from 24
mov x28, #24
# Start of 24 rounds
L_SHA3_transform_base_begin:
# Spill table pointer and round counter; x27 and x28 are scratch below
stp x27, x28, [x29, #48]
# Theta, part 1: XOR-fold columns 4 (x0), 0 (x30) and 2 (x28)
eor x0, x5, x10
eor x30, x1, x6
eor x28, x3, x8
eor x0, x0, x15
eor x30, x30, x11
eor x28, x28, x13
eor x0, x0, x21
eor x30, x30, x16
eor x28, x28, x19
eor x0, x0, x26
eor x30, x30, x22
eor x28, x28, x24
# Keep the column 4 and column 2 parities; x0 and x28 get overwritten
str x0, [x29, #32]
str x28, [x29, #24]
# XOR-fold columns 1 (x27) and 3 (x28)
eor x27, x2, x7
eor x28, x4, x9
eor x27, x27, x12
eor x28, x28, x14
eor x27, x27, x17
eor x28, x28, x20
eor x27, x27, x23
eor x28, x28, x25
# Theta, part 2: each column is XORed with the parity of the previous
# column and the next column parity rotated left by one (ror 63).
# Columns 0 (via x0) and 2 (via x27) first.
eor x0, x0, x27, ror 63
eor x27, x27, x28, ror 63
eor x1, x1, x0
eor x6, x6, x0
eor x11, x11, x0
eor x16, x16, x0
eor x22, x22, x0
eor x3, x3, x27
eor x8, x8, x27
eor x13, x13, x27
eor x19, x19, x27
eor x24, x24, x27
# Reload the spilled parities: x0 = column 4, x27 = column 2
ldr x0, [x29, #32]
ldr x27, [x29, #24]
# Columns 4 (via x28), 1 (via x30) and 3 (via x27)
eor x28, x28, x30, ror 63
eor x30, x30, x27, ror 63
eor x27, x27, x0, ror 63
eor x5, x5, x28
eor x10, x10, x28
eor x15, x15, x28
eor x21, x21, x28
eor x26, x26, x28
eor x2, x2, x30
eor x7, x7, x30
eor x12, x12, x30
eor x17, x17, x30
eor x23, x23, x30
eor x4, x4, x27
eor x9, x9, x27
eor x14, x14, x27
eor x20, x20, x27
eor x25, x25, x27
# Swap Rotate
# Rho and pi fused: every ror rotates a lane and moves it to its new
# register in one step. The chain starts by moving x2 into x0, which
# frees x2 as the next destination; x1 is not moved or rotated.
ror x0, x2, #63
ror x2, x7, #20
ror x7, x10, #44
ror x10, x24, #3
ror x24, x15, #25
ror x15, x22, #46
ror x22, x3, #2
ror x3, x13, #21
ror x13, x14, #39
ror x14, x21, #56
ror x21, x25, #8
ror x25, x16, #23
ror x16, x5, #37
ror x5, x26, #50
ror x26, x23, #62
ror x23, x9, #9
ror x9, x17, #19
ror x17, x6, #28
ror x6, x4, #36
ror x4, x20, #43
ror x20, x19, #49
ror x19, x12, #54
ror x12, x8, #58
ror x8, x11, #61
# Row Mix
# Chi: within each row of five lanes, lane[i] ^= ~lane[i+1] & lane[i+2].
# x11 is free after the rotation chain and serves as a temporary for the
# first two rows; x27, x28 and x30 hold the other intermediate terms.
bic x11, x3, x2
bic x27, x4, x3
bic x28, x1, x5
bic x30, x2, x1
eor x1, x1, x11
eor x2, x2, x27
bic x11, x5, x4
eor x4, x4, x28
eor x3, x3, x11
eor x5, x5, x30
bic x11, x8, x7
bic x27, x9, x8
bic x28, x6, x10
bic x30, x7, x6
eor x6, x6, x11
eor x7, x7, x27
bic x11, x10, x9
eor x9, x9, x28
eor x8, x8, x11
eor x10, x10, x30
# Third row: its first lane is still in x0; the result is written to x11,
# after which x0 becomes the temporary for the remaining rows.
bic x11, x13, x12
bic x27, x14, x13
bic x28, x0, x15
bic x30, x12, x0
eor x11, x0, x11
eor x12, x12, x27
bic x0, x15, x14
eor x14, x14, x28
eor x13, x13, x0
eor x15, x15, x30
bic x0, x19, x17
bic x27, x20, x19
bic x28, x16, x21
bic x30, x17, x16
eor x16, x16, x0
eor x17, x17, x27
bic x0, x21, x20
eor x20, x20, x28
eor x19, x19, x0
eor x21, x21, x30
bic x0, x24, x23
bic x27, x25, x24
bic x28, x22, x26
bic x30, x23, x22
eor x22, x22, x0
eor x23, x23, x27
bic x0, x26, x25
eor x25, x25, x28
eor x24, x24, x0
eor x26, x26, x30
# Done transforming
# Iota: recover table pointer and counter, fetch the next round constant
# with post-increment and XOR it into lane 0. The eor leaves the flags
# from subs intact, so bne tests the round counter.
ldp x27, x28, [x29, #48]
ldr x0, [x27], #8
subs x28, x28, #1
eor x1, x1, x0
bne L_SHA3_transform_base_begin
# Write the permuted lanes back through the saved state pointer
ldr x0, [x29, #40]
stp x1, x2, [x0]
stp x3, x4, [x0, #16]
stp x5, x6, [x0, #32]
stp x7, x8, [x0, #48]
stp x9, x10, [x0, #64]
stp x11, x12, [x0, #80]
stp x13, x14, [x0, #96]
stp x15, x16, [x0, #112]
stp x17, x19, [x0, #128]
stp x20, x21, [x0, #144]
stp x22, x23, [x0, #160]
stp x24, x25, [x0, #176]
str x26, [x0, #192]
# Restore the saved registers, release the frame and return
ldp x17, x19, [x29, #72]
ldp x20, x21, [x29, #88]
ldp x22, x23, [x29, #104]
ldp x24, x25, [x29, #120]
ldp x26, x27, [x29, #136]
ldr x28, [x29, #152]
ldp x29, x30, [sp], #0xa0
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
#endif /* __APPLE__ */
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */

View File

@ -181,6 +181,222 @@ void BlockSha3(unsigned long* state)
);
}
#else
/* Keccak-f[1600] round constants, one 64-bit word per round (24 entries).
 * The base (non-crypto-extension) BlockSha3 inline assembly reads them in
 * order with a post-incremented pointer and XORs one into the first state
 * lane at the end of each round. */
static const uint64_t L_SHA3_transform_base_r[] = {
0x1UL,
0x8082UL,
0x800000000000808aUL,
0x8000000080008000UL,
0x808bUL,
0x80000001UL,
0x8000000080008081UL,
0x8000000000008009UL,
0x8aUL,
0x88UL,
0x80008009UL,
0x8000000aUL,
0x8000808bUL,
0x800000000000008bUL,
0x8000000000008089UL,
0x8000000000008003UL,
0x8000000000008002UL,
0x8000000000000080UL,
0x800aUL,
0x800000008000000aUL,
0x8000000080008081UL,
0x8000000000008080UL,
0x80000001UL,
0x8000000080008008UL,
};
/* Apply 24 rounds of the Keccak-f[1600] permutation, in place, to the
 * 25 64-bit lanes (200 bytes) addressed by state, using only base AArch64
 * integer instructions.
 *
 * The whole routine is a single inline-assembly statement:
 *  - state is an in/out register operand ("+r"); its register doubles as
 *    scratch inside each round and is reloaded from the frame before the
 *    final stores.
 *  - The round constant table is passed as a symbol ("S" constraint) and
 *    addressed with adrp/add.
 *  - x1-x17 and x19-x26 hold the lanes, x27/x28 hold the table pointer and
 *    round counter or scratch; all of these are in the clobber list.
 *  - x29/x30 are pushed and popped by the assembly itself; x30 is used as
 *    scratch in between and is not in the clobber list.
 *
 * Frame (64 bytes, addressed through x29): #0 caller x29/x30, #24 and #32
 * spilled column parities, #40 saved state pointer, #48 table pointer,
 * #56 rounds remaining.
 *
 * state  Pointer to the 25-lane state; overwritten with the result.
 */
void BlockSha3(unsigned long* state)
{
__asm__ __volatile__ (
"stp x29, x30, [sp, #-64]!\n\t"
"add x29, sp, #0\n\t"
/* Point x27 at the round constant table */
#ifndef __APPLE__
"adrp x27, %[L_SHA3_transform_base_r]\n\t"
"add x27, x27, :lo12:%[L_SHA3_transform_base_r]\n\t"
#else
"adrp x27, %[L_SHA3_transform_base_r]@PAGE\n\t"
"add x27, x27, %[L_SHA3_transform_base_r]@PAGEOFF\n\t"
#endif /* __APPLE__ */
/* Load the 25 lanes into x1-x17 and x19-x26 (x18 is skipped) */
"ldp x1, x2, [%x[state]]\n\t"
"ldp x3, x4, [%x[state], #16]\n\t"
"ldp x5, x6, [%x[state], #32]\n\t"
"ldp x7, x8, [%x[state], #48]\n\t"
"ldp x9, x10, [%x[state], #64]\n\t"
"ldp x11, x12, [%x[state], #80]\n\t"
"ldp x13, x14, [%x[state], #96]\n\t"
"ldp x15, x16, [%x[state], #112]\n\t"
"ldp x17, x19, [%x[state], #128]\n\t"
"ldp x20, x21, [%x[state], #144]\n\t"
"ldp x22, x23, [%x[state], #160]\n\t"
"ldp x24, x25, [%x[state], #176]\n\t"
"ldr x26, [%x[state], #192]\n\t"
/* Park the state pointer so its register can be reused as scratch */
"str %x[state], [x29, #40]\n\t"
/* Round counter: x28 counts down from 24 */
"mov x28, #24\n\t"
/* Start of 24 rounds */
"\n"
"L_SHA3_transform_base_begin_%=: \n\t"
/* Spill table pointer and round counter; x27 and x28 are scratch below */
"stp x27, x28, [x29, #48]\n\t"
/* Theta, part 1: XOR-fold columns 4 (state reg), 0 (x30) and 2 (x28) */
"eor %x[state], x5, x10\n\t"
"eor x30, x1, x6\n\t"
"eor x28, x3, x8\n\t"
"eor %x[state], %x[state], x15\n\t"
"eor x30, x30, x11\n\t"
"eor x28, x28, x13\n\t"
"eor %x[state], %x[state], x21\n\t"
"eor x30, x30, x16\n\t"
"eor x28, x28, x19\n\t"
"eor %x[state], %x[state], x26\n\t"
"eor x30, x30, x22\n\t"
"eor x28, x28, x24\n\t"
/* Keep the column 4 and column 2 parities; both registers are reused */
"str %x[state], [x29, #32]\n\t"
"str x28, [x29, #24]\n\t"
/* XOR-fold columns 1 (x27) and 3 (x28) */
"eor x27, x2, x7\n\t"
"eor x28, x4, x9\n\t"
"eor x27, x27, x12\n\t"
"eor x28, x28, x14\n\t"
"eor x27, x27, x17\n\t"
"eor x28, x28, x20\n\t"
"eor x27, x27, x23\n\t"
"eor x28, x28, x25\n\t"
/* Theta, part 2: each column is XORed with the parity of the previous
 * column and the next column's parity rotated left by one (ror 63).
 * Columns 0 (via state reg) and 2 (via x27) first. */
"eor %x[state], %x[state], x27, ror 63\n\t"
"eor x27, x27, x28, ror 63\n\t"
"eor x1, x1, %x[state]\n\t"
"eor x6, x6, %x[state]\n\t"
"eor x11, x11, %x[state]\n\t"
"eor x16, x16, %x[state]\n\t"
"eor x22, x22, %x[state]\n\t"
"eor x3, x3, x27\n\t"
"eor x8, x8, x27\n\t"
"eor x13, x13, x27\n\t"
"eor x19, x19, x27\n\t"
"eor x24, x24, x27\n\t"
/* Reload the spilled parities: state reg = column 4, x27 = column 2 */
"ldr %x[state], [x29, #32]\n\t"
"ldr x27, [x29, #24]\n\t"
/* Columns 4 (via x28), 1 (via x30) and 3 (via x27) */
"eor x28, x28, x30, ror 63\n\t"
"eor x30, x30, x27, ror 63\n\t"
"eor x27, x27, %x[state], ror 63\n\t"
"eor x5, x5, x28\n\t"
"eor x10, x10, x28\n\t"
"eor x15, x15, x28\n\t"
"eor x21, x21, x28\n\t"
"eor x26, x26, x28\n\t"
"eor x2, x2, x30\n\t"
"eor x7, x7, x30\n\t"
"eor x12, x12, x30\n\t"
"eor x17, x17, x30\n\t"
"eor x23, x23, x30\n\t"
"eor x4, x4, x27\n\t"
"eor x9, x9, x27\n\t"
"eor x14, x14, x27\n\t"
"eor x20, x20, x27\n\t"
"eor x25, x25, x27\n\t"
/* Swap Rotate */
/* Rho and pi fused: every ror rotates a lane and moves it to its new
 * register in one step. The chain starts by moving x2 into the state
 * register, which frees x2 as the next destination; x1 is not moved. */
"ror %x[state], x2, #63\n\t"
"ror x2, x7, #20\n\t"
"ror x7, x10, #44\n\t"
"ror x10, x24, #3\n\t"
"ror x24, x15, #25\n\t"
"ror x15, x22, #46\n\t"
"ror x22, x3, #2\n\t"
"ror x3, x13, #21\n\t"
"ror x13, x14, #39\n\t"
"ror x14, x21, #56\n\t"
"ror x21, x25, #8\n\t"
"ror x25, x16, #23\n\t"
"ror x16, x5, #37\n\t"
"ror x5, x26, #50\n\t"
"ror x26, x23, #62\n\t"
"ror x23, x9, #9\n\t"
"ror x9, x17, #19\n\t"
"ror x17, x6, #28\n\t"
"ror x6, x4, #36\n\t"
"ror x4, x20, #43\n\t"
"ror x20, x19, #49\n\t"
"ror x19, x12, #54\n\t"
"ror x12, x8, #58\n\t"
"ror x8, x11, #61\n\t"
/* Row Mix */
/* Chi: within each row of five lanes, lane[i] ^= ~lane[i+1] & lane[i+2].
 * x11 is free after the rotation chain and serves as a temporary for the
 * first two rows; x27, x28 and x30 hold the other intermediate terms. */
"bic x11, x3, x2\n\t"
"bic x27, x4, x3\n\t"
"bic x28, x1, x5\n\t"
"bic x30, x2, x1\n\t"
"eor x1, x1, x11\n\t"
"eor x2, x2, x27\n\t"
"bic x11, x5, x4\n\t"
"eor x4, x4, x28\n\t"
"eor x3, x3, x11\n\t"
"eor x5, x5, x30\n\t"
"bic x11, x8, x7\n\t"
"bic x27, x9, x8\n\t"
"bic x28, x6, x10\n\t"
"bic x30, x7, x6\n\t"
"eor x6, x6, x11\n\t"
"eor x7, x7, x27\n\t"
"bic x11, x10, x9\n\t"
"eor x9, x9, x28\n\t"
"eor x8, x8, x11\n\t"
"eor x10, x10, x30\n\t"
/* Third row: its first lane is still in the state register; the result is
 * written to x11, after which the state register becomes the temporary
 * for the remaining rows. */
"bic x11, x13, x12\n\t"
"bic x27, x14, x13\n\t"
"bic x28, %x[state], x15\n\t"
"bic x30, x12, %x[state]\n\t"
"eor x11, %x[state], x11\n\t"
"eor x12, x12, x27\n\t"
"bic %x[state], x15, x14\n\t"
"eor x14, x14, x28\n\t"
"eor x13, x13, %x[state]\n\t"
"eor x15, x15, x30\n\t"
"bic %x[state], x19, x17\n\t"
"bic x27, x20, x19\n\t"
"bic x28, x16, x21\n\t"
"bic x30, x17, x16\n\t"
"eor x16, x16, %x[state]\n\t"
"eor x17, x17, x27\n\t"
"bic %x[state], x21, x20\n\t"
"eor x20, x20, x28\n\t"
"eor x19, x19, %x[state]\n\t"
"eor x21, x21, x30\n\t"
"bic %x[state], x24, x23\n\t"
"bic x27, x25, x24\n\t"
"bic x28, x22, x26\n\t"
"bic x30, x23, x22\n\t"
"eor x22, x22, %x[state]\n\t"
"eor x23, x23, x27\n\t"
"bic %x[state], x26, x25\n\t"
"eor x25, x25, x28\n\t"
"eor x24, x24, %x[state]\n\t"
"eor x26, x26, x30\n\t"
/* Done transforming */
/* Iota: recover table pointer and counter, fetch the next round constant
 * with post-increment and XOR it into lane 0. The eor leaves the flags
 * from subs intact, so bne tests the round counter. */
"ldp x27, x28, [x29, #48]\n\t"
"ldr %x[state], [x27], #8\n\t"
"subs x28, x28, #1\n\t"
"eor x1, x1, %x[state]\n\t"
"bne L_SHA3_transform_base_begin_%=\n\t"
/* Write the permuted lanes back through the saved state pointer */
"ldr %x[state], [x29, #40]\n\t"
"stp x1, x2, [%x[state]]\n\t"
"stp x3, x4, [%x[state], #16]\n\t"
"stp x5, x6, [%x[state], #32]\n\t"
"stp x7, x8, [%x[state], #48]\n\t"
"stp x9, x10, [%x[state], #64]\n\t"
"stp x11, x12, [%x[state], #80]\n\t"
"stp x13, x14, [%x[state], #96]\n\t"
"stp x15, x16, [%x[state], #112]\n\t"
"stp x17, x19, [%x[state], #128]\n\t"
"stp x20, x21, [%x[state], #144]\n\t"
"stp x22, x23, [%x[state], #160]\n\t"
"stp x24, x25, [%x[state], #176]\n\t"
"str x26, [%x[state], #192]\n\t"
/* Release the frame and restore x29/x30 */
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state)
: [L_SHA3_transform_base_r] "S" (L_SHA3_transform_base_r)
: "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc"
);
}
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */

View File

@ -65,7 +65,7 @@ L_SHA512_transform_neon_len_k:
.xword 0xc19bf174cf692694
.xword 0xe49b69c19ef14ad2
.xword 0xefbe4786384f25e3
.xword 0xfc19dc68b8cd5b5
.xword 0x0fc19dc68b8cd5b5
.xword 0x240ca1cc77ac9c65
.xword 0x2de92c6f592b0275
.xword 0x4a7484aa6ea6e483
@ -77,7 +77,7 @@ L_SHA512_transform_neon_len_k:
.xword 0xbf597fc7beef0ee4
.xword 0xc6e00bf33da88fc2
.xword 0xd5a79147930aa725
.xword 0x6ca6351e003826f
.xword 0x06ca6351e003826f
.xword 0x142929670a0e6e70
.xword 0x27b70a8546d22ffc
.xword 0x2e1b21385c26c926
@ -115,8 +115,8 @@ L_SHA512_transform_neon_len_k:
.xword 0xd186b8c721c0c207
.xword 0xeada7dd6cde0eb1e
.xword 0xf57d4f7fee6ed178
.xword 0x6f067aa72176fba
.xword 0xa637dc5a2c898a6
.xword 0x06f067aa72176fba
.xword 0x0a637dc5a2c898a6
.xword 0x113f9804bef90dae
.xword 0x1b710b35131c471b
.xword 0x28db77f523047d84
@ -156,8 +156,7 @@ _Transform_Sha512_Len_neon:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-128]!
add x29, sp, #0
str x17, [x29, #16]
str x19, [x29, #24]
stp x17, x19, [x29, #16]
stp x20, x21, [x29, #32]
stp x22, x23, [x29, #48]
stp x24, x25, [x29, #64]
@ -1082,8 +1081,7 @@ L_sha512_len_neon_start:
stp x6, x7, [x0, #16]
stp x8, x9, [x0, #32]
stp x10, x11, [x0, #48]
ldr x17, [x29, #16]
ldr x19, [x29, #24]
ldp x17, x19, [x29, #16]
ldp x20, x21, [x29, #32]
ldp x22, x23, [x29, #48]
ldp x24, x25, [x29, #64]
@ -1128,7 +1126,7 @@ L_SHA512_transform_crypto_len_k:
.xword 0xc19bf174cf692694
.xword 0xe49b69c19ef14ad2
.xword 0xefbe4786384f25e3
.xword 0xfc19dc68b8cd5b5
.xword 0x0fc19dc68b8cd5b5
.xword 0x240ca1cc77ac9c65
.xword 0x2de92c6f592b0275
.xword 0x4a7484aa6ea6e483
@ -1140,7 +1138,7 @@ L_SHA512_transform_crypto_len_k:
.xword 0xbf597fc7beef0ee4
.xword 0xc6e00bf33da88fc2
.xword 0xd5a79147930aa725
.xword 0x6ca6351e003826f
.xword 0x06ca6351e003826f
.xword 0x142929670a0e6e70
.xword 0x27b70a8546d22ffc
.xword 0x2e1b21385c26c926
@ -1178,8 +1176,8 @@ L_SHA512_transform_crypto_len_k:
.xword 0xd186b8c721c0c207
.xword 0xeada7dd6cde0eb1e
.xword 0xf57d4f7fee6ed178
.xword 0x6f067aa72176fba
.xword 0xa637dc5a2c898a6
.xword 0x06f067aa72176fba
.xword 0x0a637dc5a2c898a6
.xword 0x113f9804bef90dae
.xword 0x1b710b35131c471b
.xword 0x28db77f523047d84

View File

@ -62,8 +62,7 @@
}
#endif
#if (!defined(WOLFSSL_ARMASM) || (!defined(__arm__) && \
!defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) && !defined(WOLFSSL_RISCV_ASM)
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
#ifdef USE_INTEL_SPEEDUP
#include <wolfssl/wolfcrypt/cpuid.h>

View File

@ -51,10 +51,11 @@
/* Use SHA3-512 to generate 64-bytes of hash. */
#define KYBER_HASH_G kyber_hash512
/* Use SHAKE-256 as a key derivation function (KDF). */
#ifdef USE_INTEL_SPEEDUP
#define KYBER_KDF kyber_kdf
#if defined(USE_INTEL_SPEEDUP) || \
(defined(WOLFSSL_ARMASM) && defined(__aarch64__))
#define KYBER_KDF kyber_kdf
#else
#define KYBER_KDF wc_Shake256Hash
#define KYBER_KDF wc_Shake256Hash
#endif
/******************************************************************************/

File diff suppressed because it is too large Load Diff

View File

@ -220,8 +220,7 @@ WOLFSSL_LOCAL void sha3_block_bmi2(word64* s);
WOLFSSL_LOCAL void sha3_block_avx2(word64* s);
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#endif
#if (defined(WOLFSSL_ARMASM) && (defined(__arm__) || \
defined(WOLFSSL_ARMASM_CRYPTO_SHA3))) || defined(WOLFSSL_RISCV_ASM)
#if defined(WOLFSSL_ARMASM) || defined(WOLFSSL_RISCV_ASM)
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#endif

View File

@ -163,7 +163,8 @@ WOLFSSL_LOCAL
int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1, sword16* vec2,
sword16* poly, byte* seed);
#ifdef USE_INTEL_SPEEDUP
#if defined(USE_INTEL_SPEEDUP) || \
(defined(WOLFSSL_ARMASM) && defined(__aarch64__))
WOLFSSL_LOCAL
int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen);
#endif
@ -288,6 +289,27 @@ void kyber_decompress_5_avx2(sword16* p, const byte* r);
WOLFSSL_LOCAL
int kyber_cmp_avx2(const byte* a, const byte* b, int sz);
#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM)
WOLFSSL_LOCAL void kyber_ntt(sword16* r);
WOLFSSL_LOCAL void kyber_invntt(sword16* r);
WOLFSSL_LOCAL void kyber_basemul_mont(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_basemul_mont_add(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_add_reduce(sword16* r, const sword16* a);
WOLFSSL_LOCAL void kyber_add3_reduce(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_rsub_reduce(sword16* r, const sword16* a);
WOLFSSL_LOCAL void kyber_to_mont(sword16* p);
WOLFSSL_LOCAL void kyber_sha3_blocksx3_neon(word64* state);
WOLFSSL_LOCAL void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed);
WOLFSSL_LOCAL void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed);
WOLFSSL_LOCAL unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len,
const byte* r, unsigned int rLen);
WOLFSSL_LOCAL int kyber_cmp_neon(const byte* a, const byte* b, int sz);
WOLFSSL_LOCAL void kyber_csubq_neon(sword16* p);
WOLFSSL_LOCAL void kyber_from_msg_neon(sword16* p, const byte* msg);
WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p);
#endif
#ifdef __cplusplus