ARM32 ChaCha20, Poly1305: assembly code

Add assembly code for ChaCha20 and Poly1305 on ARM32 for use when NEON
is not available.
pull/8020/head
Sean Parkinson 2024-09-26 18:43:34 +10:00
parent e26ac5e122
commit 2323a5cf59
14 changed files with 2177 additions and 109 deletions
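For orientation, these are the entry points the new ARM32 assembly provides, with signatures as declared in the inline-assembly variants below (ChaCha in armv8-32-chacha-asm, Poly1305 in armv8-32-poly1305-asm):

void wc_chacha_setiv(word32* x, const byte* iv, word32 counter);
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz);
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len);
void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len);
void poly1305_blocks_arm32_16(Poly1305* ctx, const byte* m, word32 len, int notLast);
void poly1305_set_key(Poly1305* ctx, const byte* key);
void poly1305_final(Poly1305* ctx, byte* mac);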


@ -924,8 +924,10 @@ if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-poly1305.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305.c
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
endif !BUILD_ARMASM_INLINE
endif
@ -999,17 +1001,17 @@ endif
if BUILD_CHACHA
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c
if BUILD_ARMASM_NEON
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c
else
if BUILD_ARMASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha.c
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-chacha-asm.S
endif !BUILD_ARMASM_INLINE
endif BUILD_ARMASM
else
if BUILD_RISCV_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c
endif BUILD_RISCV_ASM
@ -1018,7 +1020,7 @@ if BUILD_INTELASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha_asm.S
endif BUILD_INTELASM
endif !BUILD_X86_ASM
endif !BUILD_ARMASM_NEON
endif !BUILD_ARMASM
if BUILD_POLY1305
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c
endif BUILD_POLY1305


@ -72,8 +72,7 @@ Public domain.
#endif /* HAVE_CHACHA */
#if defined(WOLFSSL_ARMASM) && (!defined(WOLFSSL_ARMASM_NO_NEON) || \
defined(__thumb__))
#if defined(WOLFSSL_ARMASM)
/* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
#elif defined(WOLFSSL_RISCV_ASM)


@ -232,7 +232,7 @@ extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
}
#endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
/* if not 64 bit then use 32 bit */
#elif !defined(WOLFSSL_ARMASM) || !defined(__thumb__)
#elif !defined(WOLFSSL_ARMASM)
static word32 U8TO32(const byte *p)
{
@ -269,8 +269,7 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
}
#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
!defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM)
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
/*
This local function operates on a message with a given number of bytes
with a given ctx pointer to a Poly1305 structure.
@ -789,8 +788,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
return 0;
}
#endif /* (!WOLFSSL_ARMASM || (!__aarch64__ && !__thumb__)) &&
* !WOLFSSL_RISCV_ASM */
#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
@ -885,8 +883,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
/* process full blocks */
if (bytes >= POLY1305_BLOCK_SIZE) {
size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1));
#if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
!defined(__thumb__))) && !defined(WOLFSSL_RISCV_ASM)
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
int ret;
ret = poly1305_blocks(ctx, m, want);
if (ret != 0)


@ -411,7 +411,7 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
static const uint32_t L_AES_ARM32_rcon[] = {
0x01000000, 0x02000000, 0x04000000, 0x08000000,
0x10000000, 0x20000000, 0x40000000, 0x80000000,
0x1b000000, 0x36000000,
};
void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks);


@ -0,0 +1,522 @@
/* armv8-32-chacha-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.S
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
.text
.align 4
.globl wc_chacha_setiv
.type wc_chacha_setiv, %function
wc_chacha_setiv:
push {r4, lr}
add r3, r0, #52
ldr r4, [r1]
ldr r12, [r1, #4]
ldr lr, [r1, #8]
str r2, [r0, #48]
#ifdef BIG_ENDIAN_ORDER
rev r4, r4
rev r12, r12
rev lr, lr
#endif /* BIG_ENDIAN_ORDER */
stm r3, {r4, r12, lr}
pop {r4, pc}
.size wc_chacha_setiv,.-wc_chacha_setiv
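A rough C equivalent of wc_chacha_setiv, as a sketch (the function name is hypothetical, not the shipped code): X[12] at byte offset 48 is the block counter, and X[13..15] at offset 52 take the 96-bit nonce as little-endian words — the rev instructions do the byte swap on big-endian builds, matching wolfSSL's LITTLE32 macro.

void wc_chacha_setiv_sketch(word32* x, const byte* iv, word32 counter)
{
    word32 n[3];
    XMEMCPY(n, iv, 12);        /* three 32-bit nonce words */
    x[12] = counter;           /* block counter, offset 48 */
    x[13] = LITTLE32(n[0]);    /* nonce words, offset 52.. */
    x[14] = LITTLE32(n[1]);
    x[15] = LITTLE32(n[2]);
}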
.text
.type L_chacha_arm32_constants, %object
.size L_chacha_arm32_constants, 32
.align 4
L_chacha_arm32_constants:
.word 0x61707865
.word 0x3120646e
.word 0x79622d36
.word 0x6b206574
.word 0x61707865
.word 0x3320646e
.word 0x79622d32
.word 0x6b206574
.text
.align 4
.globl wc_chacha_setkey
.type wc_chacha_setkey, %function
wc_chacha_setkey:
push {r4, r5, lr}
adr r3, L_chacha_arm32_constants
subs r2, r2, #16
add r3, r3, r2
# Start state with constants
ldm r3, {r4, r5, r12, lr}
stm r0!, {r4, r5, r12, lr}
# Next is first 16 bytes of key.
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r12, [r1, #8]
ldr lr, [r1, #12]
#ifdef BIG_ENDIAN_ORDER
rev r4, r4
rev r5, r5
rev r12, r12
rev lr, lr
#endif /* BIG_ENDIAN_ORDER */
stm r0!, {r4, r5, r12, lr}
# Next 16 bytes of key.
beq L_chacha_arm32_setkey_same_keyb_ytes
# Update key pointer for next 16 bytes.
add r1, r1, r2
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r12, [r1, #8]
ldr lr, [r1, #12]
L_chacha_arm32_setkey_same_keyb_ytes:
stm r0, {r4, r5, r12, lr}
pop {r4, r5, pc}
.size wc_chacha_setkey,.-wc_chacha_setkey
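The subs r2, r2, #16 doubles as a table index and a flag: a 16-byte key leaves keySz - 16 = 0, selecting the "expand 16-byte k" row of L_chacha_arm32_constants and setting Z so the same 16 key bytes are stored twice, while a 32-byte key leaves 16, selecting the "expand 32-byte k" row and loading the key's second half. A C sketch of the same selection (hypothetical helper, little-endian build assumed):

void wc_chacha_setkey_sketch(word32* x, const byte* key, word32 keySz)
{
    /* Row 0 of the table is tau (16-byte key), row 1 is sigma (32-byte). */
    XMEMCPY(x, (const byte*)L_chacha_arm32_constants + (keySz - 16), 16);
    XMEMCPY(x + 4, key, 16);                /* first 16 key bytes */
    XMEMCPY(x + 8, key + (keySz - 16), 16); /* second 16, or the same bytes
                                             * again when keySz == 16 */
}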
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl wc_chacha_crypt_bytes
.type wc_chacha_crypt_bytes, %function
wc_chacha_crypt_bytes:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #52
mov lr, r0
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r0, [sp, #32]
str r1, [sp, #36]
#else
strd r0, r1, [sp, #32]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r2, [sp, #40]
str r3, [sp, #44]
#else
strd r2, r3, [sp, #40]
#endif
L_chacha_arm32_crypt_block:
# Put x[12]..x[15] onto stack.
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r4, [lr, #48]
ldr r5, [lr, #52]
#else
ldrd r4, r5, [lr, #48]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r6, [lr, #56]
ldr r7, [lr, #60]
#else
ldrd r6, r7, [lr, #56]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r4, [sp, #16]
str r5, [sp, #20]
#else
strd r4, r5, [sp, #16]
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
str r6, [sp, #24]
str r7, [sp, #28]
#else
strd r6, r7, [sp, #24]
#endif
# Load x[0]..x[12] into registers.
ldm lr, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12}
# 10x 2 full rounds to perform.
mov lr, #10
str lr, [sp, #48]
L_chacha_arm32_crypt_loop:
# 0, 4, 8, 12
# 1, 5, 9, 13
ldr lr, [sp, #20]
add r0, r0, r4
add r1, r1, r5
eor r12, r12, r0
eor lr, lr, r1
ror r12, r12, #16
ror lr, lr, #16
add r8, r8, r12
add r9, r9, lr
eor r4, r4, r8
eor r5, r5, r9
ror r4, r4, #20
ror r5, r5, #20
add r0, r0, r4
add r1, r1, r5
eor r12, r12, r0
eor lr, lr, r1
ror r12, r12, #24
ror lr, lr, #24
add r8, r8, r12
add r9, r9, lr
eor r4, r4, r8
eor r5, r5, r9
ror r4, r4, #25
ror r5, r5, #25
str r12, [sp, #16]
str lr, [sp, #20]
# 2, 6, 10, 14
# 3, 7, 11, 15
ldr r12, [sp, #24]
ldr lr, [sp, #28]
add r2, r2, r6
add r3, r3, r7
eor r12, r12, r2
eor lr, lr, r3
ror r12, r12, #16
ror lr, lr, #16
add r10, r10, r12
add r11, r11, lr
eor r6, r6, r10
eor r7, r7, r11
ror r6, r6, #20
ror r7, r7, #20
add r2, r2, r6
add r3, r3, r7
eor r12, r12, r2
eor lr, lr, r3
ror r12, r12, #24
ror lr, lr, #24
add r10, r10, r12
add r11, r11, lr
eor r6, r6, r10
eor r7, r7, r11
ror r6, r6, #25
ror r7, r7, #25
# 3, 4, 9, 14
# 0, 5, 10, 15
add r3, r3, r4
add r0, r0, r5
eor r12, r12, r3
eor lr, lr, r0
ror r12, r12, #16
ror lr, lr, #16
add r9, r9, r12
add r10, r10, lr
eor r4, r4, r9
eor r5, r5, r10
ror r4, r4, #20
ror r5, r5, #20
add r3, r3, r4
add r0, r0, r5
eor r12, r12, r3
eor lr, lr, r0
ror r12, r12, #24
ror lr, lr, #24
add r9, r9, r12
add r10, r10, lr
eor r4, r4, r9
eor r5, r5, r10
ror r4, r4, #25
ror r5, r5, #25
str r12, [sp, #24]
str lr, [sp, #28]
ldr r12, [sp, #16]
ldr lr, [sp, #20]
# 1, 6, 11, 12
# 2, 7, 8, 13
add r1, r1, r6
add r2, r2, r7
eor r12, r12, r1
eor lr, lr, r2
ror r12, r12, #16
ror lr, lr, #16
add r11, r11, r12
add r8, r8, lr
eor r6, r6, r11
eor r7, r7, r8
ror r6, r6, #20
ror r7, r7, #20
add r1, r1, r6
add r2, r2, r7
eor r12, r12, r1
eor lr, lr, r2
ror r12, r12, #24
ror lr, lr, #24
add r11, r11, r12
add r8, r8, lr
eor r6, r6, r11
eor r7, r7, r8
ror r6, r6, #25
ror r7, r7, #25
str lr, [sp, #20]
# Check if we have done enough rounds.
ldr lr, [sp, #48]
subs lr, lr, #1
str lr, [sp, #48]
bgt L_chacha_arm32_crypt_loop
stm sp, {r8, r9, r10, r11, r12}
ldr lr, [sp, #32]
mov r12, sp
# Add in original state
ldm lr!, {r8, r9, r10, r11}
add r0, r0, r8
add r1, r1, r9
add r2, r2, r10
add r3, r3, r11
ldm lr!, {r8, r9, r10, r11}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12!, {r8, r9}
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12!, {r8, r9}
ldm r12, {r8, r9}
ldm lr!, {r10, r11}
add r8, r8, r10
add r9, r9, r11
add r10, r10, #1
stm r12!, {r8, r9}
str r10, [lr, #-8]
ldm r12, {r8, r9}
ldm lr, {r10, r11}
add r8, r8, r10
add r9, r9, r11
stm r12, {r8, r9}
ldr r12, [sp, #44]
cmp r12, #0x40
blt L_chacha_arm32_crypt_lt_block
ldr r12, [sp, #40]
ldr lr, [sp, #36]
# XOR state into 64 bytes.
ldr r8, [r12]
ldr r9, [r12, #4]
ldr r10, [r12, #8]
ldr r11, [r12, #12]
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
eor r3, r3, r11
str r0, [lr]
str r1, [lr, #4]
str r2, [lr, #8]
str r3, [lr, #12]
ldr r8, [r12, #16]
ldr r9, [r12, #20]
ldr r10, [r12, #24]
ldr r11, [r12, #28]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #16]
str r5, [lr, #20]
str r6, [lr, #24]
str r7, [lr, #28]
ldr r4, [sp]
ldr r5, [sp, #4]
ldr r6, [sp, #8]
ldr r7, [sp, #12]
ldr r8, [r12, #32]
ldr r9, [r12, #36]
ldr r10, [r12, #40]
ldr r11, [r12, #44]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #32]
str r5, [lr, #36]
str r6, [lr, #40]
str r7, [lr, #44]
ldr r4, [sp, #16]
ldr r5, [sp, #20]
ldr r6, [sp, #24]
ldr r7, [sp, #28]
ldr r8, [r12, #48]
ldr r9, [r12, #52]
ldr r10, [r12, #56]
ldr r11, [r12, #60]
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
str r4, [lr, #48]
str r5, [lr, #52]
str r6, [lr, #56]
str r7, [lr, #60]
ldr r3, [sp, #44]
add r12, r12, #0x40
add lr, lr, #0x40
str r12, [sp, #40]
str lr, [sp, #36]
subs r3, r3, #0x40
ldr lr, [sp, #32]
str r3, [sp, #44]
bne L_chacha_arm32_crypt_block
b L_chacha_arm32_crypt_done
L_chacha_arm32_crypt_lt_block:
# Store in over field of ChaCha.
ldr lr, [sp, #32]
add r12, lr, #0x44
stm r12!, {r0, r1, r2, r3, r4, r5, r6, r7}
ldm sp, {r0, r1, r2, r3, r4, r5, r6, r7}
stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
ldr r2, [sp, #40]
ldr r3, [sp, #44]
#else
ldrd r2, r3, [sp, #40]
#endif
ldr r1, [sp, #36]
rsb r12, r3, #0x40
str r12, [lr, #64]
add lr, lr, #0x44
L_chacha_arm32_crypt_16byte_loop:
cmp r3, #16
blt L_chacha_arm32_crypt_word_loop
# 16 bytes of state XORed into message.
ldm lr!, {r4, r5, r6, r7}
ldr r8, [r2]
ldr r9, [r2, #4]
ldr r10, [r2, #8]
ldr r11, [r2, #12]
eor r8, r8, r4
eor r9, r9, r5
eor r10, r10, r6
eor r11, r11, r7
subs r3, r3, #16
str r8, [r1]
str r9, [r1, #4]
str r10, [r1, #8]
str r11, [r1, #12]
beq L_chacha_arm32_crypt_done
add r2, r2, #16
add r1, r1, #16
b L_chacha_arm32_crypt_16byte_loop
L_chacha_arm32_crypt_word_loop:
cmp r3, #4
blt L_chacha_arm32_crypt_byte_start
# 4 bytes of state XORed into message.
ldr r4, [lr]
ldr r8, [r2]
eor r8, r8, r4
subs r3, r3, #4
str r8, [r1]
beq L_chacha_arm32_crypt_done
add lr, lr, #4
add r2, r2, #4
add r1, r1, #4
b L_chacha_arm32_crypt_word_loop
L_chacha_arm32_crypt_byte_start:
ldr r4, [lr]
L_chacha_arm32_crypt_byte_loop:
ldrb r8, [r2]
eor r8, r8, r4
subs r3, r3, #1
strb r8, [r1]
beq L_chacha_arm32_crypt_done
lsr r4, r4, #8
add r2, r2, #1
add r1, r1, #1
b L_chacha_arm32_crypt_byte_loop
L_chacha_arm32_crypt_done:
add sp, sp, #52
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes
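The round body above is the standard ChaCha quarter round, run on two columns (or diagonals) at a time; ARM only has rotate-right, so ror #16, #20, #24 and #25 implement ChaCha's left-rotations by 16, 12, 8 and 7. For reference, one quarter round in C (a sketch of the operation, not the shipped code):

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

static void chacha_quarter_round(word32* a, word32* b, word32* c, word32* d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}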
.text
.align 4
.globl wc_chacha_use_over
.type wc_chacha_use_over, %function
wc_chacha_use_over:
push {r4, r5, r6, r7, r8, r9, lr}
L_chacha_arm32_over_16byte_loop:
cmp r3, #16
blt L_chacha_arm32_over_word_loop
# 16 bytes of state XORed into message.
ldr r12, [r0]
ldr lr, [r0, #4]
ldr r4, [r0, #8]
ldr r5, [r0, #12]
ldr r6, [r2]
ldr r7, [r2, #4]
ldr r8, [r2, #8]
ldr r9, [r2, #12]
eor r12, r12, r6
eor lr, lr, r7
eor r4, r4, r8
eor r5, r5, r9
subs r3, r3, #16
str r12, [r1]
str lr, [r1, #4]
str r4, [r1, #8]
str r5, [r1, #12]
beq L_chacha_arm32_over_done
add r0, r0, #16
add r2, r2, #16
add r1, r1, #16
b L_chacha_arm32_over_16byte_loop
L_chacha_arm32_over_word_loop:
cmp r3, #4
blt L_chacha_arm32_over_byte_loop
# 4 bytes of state XORed into message.
ldr r12, [r0]
ldr r6, [r2]
eor r12, r12, r6
subs r3, r3, #4
str r12, [r1]
beq L_chacha_arm32_over_done
add r0, r0, #4
add r2, r2, #4
add r1, r1, #4
b L_chacha_arm32_over_word_loop
L_chacha_arm32_over_byte_loop:
# 1 byte of state XORed into message.
ldrb r12, [r0]
ldrb r6, [r2]
eor r12, r12, r6
subs r3, r3, #1
strb r12, [r1]
beq L_chacha_arm32_over_done
add r0, r0, #1
add r2, r2, #1
add r1, r1, #1
b L_chacha_arm32_over_byte_loop
L_chacha_arm32_over_done:
pop {r4, r5, r6, r7, r8, r9, pc}
.size wc_chacha_use_over,.-wc_chacha_use_over
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */


@ -0,0 +1,569 @@
/* armv8-32-chacha-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./chacha/chacha.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-chacha-asm.c
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>
void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p)
{
register word32* x asm ("r0") = (word32*)x_p;
register const byte* iv asm ("r1") = (const byte*)iv_p;
register word32 counter asm ("r2") = (word32)counter_p;
__asm__ __volatile__ (
"add r3, %[x], #52\n\t"
"ldr r4, [%[iv]]\n\t"
"ldr r12, [%[iv], #4]\n\t"
"ldr lr, [%[iv], #8]\n\t"
"str %[counter], [%[x], #48]\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev r4, r4\n\t"
"rev r12, r12\n\t"
"rev lr, lr\n\t"
#endif /* BIG_ENDIAN_ORDER */
"stm r3, {r4, r12, lr}\n\t"
: [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter)
:
: "memory", "r3", "r12", "lr", "r4", "cc"
);
}
static const uint32_t L_chacha_arm32_constants[] = {
0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
};
void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p)
{
register word32* x asm ("r0") = (word32*)x_p;
register const byte* key asm ("r1") = (const byte*)key_p;
register word32 keySz asm ("r2") = (word32)keySz_p;
register uint32_t* L_chacha_arm32_constants_c asm ("r3") = (uint32_t*)&L_chacha_arm32_constants;
__asm__ __volatile__ (
"subs %[keySz], %[keySz], #16\n\t"
"add r3, r3, %[keySz]\n\t"
/* Start state with constants */
"ldm r3, {r4, r5, r12, lr}\n\t"
"stm %[x]!, {r4, r5, r12, lr}\n\t"
/* Next is first 16 bytes of key. */
"ldr r4, [%[key]]\n\t"
"ldr r5, [%[key], #4]\n\t"
"ldr r12, [%[key], #8]\n\t"
"ldr lr, [%[key], #12]\n\t"
#ifdef BIG_ENDIAN_ORDER
"rev r4, r4\n\t"
"rev r5, r5\n\t"
"rev r12, r12\n\t"
"rev lr, lr\n\t"
#endif /* BIG_ENDIAN_ORDER */
"stm %[x]!, {r4, r5, r12, lr}\n\t"
/* Next 16 bytes of key. */
"beq L_chacha_arm32_setkey_same_keyb_ytes_%=\n\t"
/* Update key pointer for next 16 bytes. */
"add %[key], %[key], %[keySz]\n\t"
"ldr r4, [%[key]]\n\t"
"ldr r5, [%[key], #4]\n\t"
"ldr r12, [%[key], #8]\n\t"
"ldr lr, [%[key], #12]\n\t"
"\n"
"L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t"
"stm %[x], {r4, r5, r12, lr}\n\t"
: [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz), [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c)
:
: "memory", "r12", "lr", "r4", "r5", "cc"
);
}
#ifdef WOLFSSL_ARMASM_NO_NEON
void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p, word32 len_p)
{
register ChaCha* ctx asm ("r0") = (ChaCha*)ctx_p;
register byte* c asm ("r1") = (byte*)c_p;
register const byte* m asm ("r2") = (const byte*)m_p;
register word32 len asm ("r3") = (word32)len_p;
__asm__ __volatile__ (
"sub sp, sp, #52\n\t"
"mov lr, %[ctx]\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str %[ctx], [sp, #32]\n\t"
"str %[c], [sp, #36]\n\t"
#else
"strd %[ctx], %[c], [sp, #32]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str %[m], [sp, #40]\n\t"
"str %[len], [sp, #44]\n\t"
#else
"strd %[m], %[len], [sp, #40]\n\t"
#endif
"\n"
"L_chacha_arm32_crypt_block_%=: \n\t"
/* Put x[12]..x[15] onto stack. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr r4, [lr, #48]\n\t"
"ldr r5, [lr, #52]\n\t"
#else
"ldrd r4, r5, [lr, #48]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr r6, [lr, #56]\n\t"
"ldr r7, [lr, #60]\n\t"
#else
"ldrd r6, r7, [lr, #56]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str r4, [sp, #16]\n\t"
"str r5, [sp, #20]\n\t"
#else
"strd r4, r5, [sp, #16]\n\t"
#endif
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"str r6, [sp, #24]\n\t"
"str r7, [sp, #28]\n\t"
#else
"strd r6, r7, [sp, #24]\n\t"
#endif
/* Load x[0]..x[12] into registers. */
"ldm lr, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
/* 10x 2 full rounds to perform. */
"mov lr, #10\n\t"
"str lr, [sp, #48]\n\t"
"\n"
"L_chacha_arm32_crypt_loop_%=: \n\t"
/* 0, 4, 8, 12 */
/* 1, 5, 9, 13 */
"ldr lr, [sp, #20]\n\t"
"add %[ctx], %[ctx], r4\n\t"
"add %[c], %[c], r5\n\t"
"eor r12, r12, %[ctx]\n\t"
"eor lr, lr, %[c]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r8, r8, r12\n\t"
"add r9, r9, lr\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"ror r4, r4, #20\n\t"
"ror r5, r5, #20\n\t"
"add %[ctx], %[ctx], r4\n\t"
"add %[c], %[c], r5\n\t"
"eor r12, r12, %[ctx]\n\t"
"eor lr, lr, %[c]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r8, r8, r12\n\t"
"add r9, r9, lr\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"ror r4, r4, #25\n\t"
"ror r5, r5, #25\n\t"
"str r12, [sp, #16]\n\t"
"str lr, [sp, #20]\n\t"
/* 2, 6, 10, 14 */
/* 3, 7, 11, 15 */
"ldr r12, [sp, #24]\n\t"
"ldr lr, [sp, #28]\n\t"
"add %[m], %[m], r6\n\t"
"add %[len], %[len], r7\n\t"
"eor r12, r12, %[m]\n\t"
"eor lr, lr, %[len]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r10, r10, r12\n\t"
"add r11, r11, lr\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"ror r6, r6, #20\n\t"
"ror r7, r7, #20\n\t"
"add %[m], %[m], r6\n\t"
"add %[len], %[len], r7\n\t"
"eor r12, r12, %[m]\n\t"
"eor lr, lr, %[len]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r10, r10, r12\n\t"
"add r11, r11, lr\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"ror r6, r6, #25\n\t"
"ror r7, r7, #25\n\t"
/* 3, 4, 9, 14 */
/* 0, 5, 10, 15 */
"add %[len], %[len], r4\n\t"
"add %[ctx], %[ctx], r5\n\t"
"eor r12, r12, %[len]\n\t"
"eor lr, lr, %[ctx]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r9, r9, r12\n\t"
"add r10, r10, lr\n\t"
"eor r4, r4, r9\n\t"
"eor r5, r5, r10\n\t"
"ror r4, r4, #20\n\t"
"ror r5, r5, #20\n\t"
"add %[len], %[len], r4\n\t"
"add %[ctx], %[ctx], r5\n\t"
"eor r12, r12, %[len]\n\t"
"eor lr, lr, %[ctx]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r9, r9, r12\n\t"
"add r10, r10, lr\n\t"
"eor r4, r4, r9\n\t"
"eor r5, r5, r10\n\t"
"ror r4, r4, #25\n\t"
"ror r5, r5, #25\n\t"
"str r12, [sp, #24]\n\t"
"str lr, [sp, #28]\n\t"
"ldr r12, [sp, #16]\n\t"
"ldr lr, [sp, #20]\n\t"
/* 1, 6, 11, 12 */
/* 2, 7, 8, 13 */
"add %[c], %[c], r6\n\t"
"add %[m], %[m], r7\n\t"
"eor r12, r12, %[c]\n\t"
"eor lr, lr, %[m]\n\t"
"ror r12, r12, #16\n\t"
"ror lr, lr, #16\n\t"
"add r11, r11, r12\n\t"
"add r8, r8, lr\n\t"
"eor r6, r6, r11\n\t"
"eor r7, r7, r8\n\t"
"ror r6, r6, #20\n\t"
"ror r7, r7, #20\n\t"
"add %[c], %[c], r6\n\t"
"add %[m], %[m], r7\n\t"
"eor r12, r12, %[c]\n\t"
"eor lr, lr, %[m]\n\t"
"ror r12, r12, #24\n\t"
"ror lr, lr, #24\n\t"
"add r11, r11, r12\n\t"
"add r8, r8, lr\n\t"
"eor r6, r6, r11\n\t"
"eor r7, r7, r8\n\t"
"ror r6, r6, #25\n\t"
"ror r7, r7, #25\n\t"
"str lr, [sp, #20]\n\t"
/* Check if we have done enough rounds. */
"ldr lr, [sp, #48]\n\t"
"subs lr, lr, #1\n\t"
"str lr, [sp, #48]\n\t"
"bgt L_chacha_arm32_crypt_loop_%=\n\t"
"stm sp, {r8, r9, r10, r11, r12}\n\t"
"ldr lr, [sp, #32]\n\t"
"mov r12, sp\n\t"
/* Add in original state */
"ldm lr!, {r8, r9, r10, r11}\n\t"
"add %[ctx], %[ctx], r8\n\t"
"add %[c], %[c], r9\n\t"
"add %[m], %[m], r10\n\t"
"add %[len], %[len], r11\n\t"
"ldm lr!, {r8, r9, r10, r11}\n\t"
"add r4, r4, r8\n\t"
"add r5, r5, r9\n\t"
"add r6, r6, r10\n\t"
"add r7, r7, r11\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12!, {r8, r9}\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12!, {r8, r9}\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr!, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"add r10, r10, #1\n\t"
"stm r12!, {r8, r9}\n\t"
"str r10, [lr, #-8]\n\t"
"ldm r12, {r8, r9}\n\t"
"ldm lr, {r10, r11}\n\t"
"add r8, r8, r10\n\t"
"add r9, r9, r11\n\t"
"stm r12, {r8, r9}\n\t"
"ldr r12, [sp, #44]\n\t"
"cmp r12, #0x40\n\t"
"blt L_chacha_arm32_crypt_lt_block_%=\n\t"
"ldr r12, [sp, #40]\n\t"
"ldr lr, [sp, #36]\n\t"
/* XOR state into 64 bytes. */
"ldr r8, [r12]\n\t"
"ldr r9, [r12, #4]\n\t"
"ldr r10, [r12, #8]\n\t"
"ldr r11, [r12, #12]\n\t"
"eor %[ctx], %[ctx], r8\n\t"
"eor %[c], %[c], r9\n\t"
"eor %[m], %[m], r10\n\t"
"eor %[len], %[len], r11\n\t"
"str %[ctx], [lr]\n\t"
"str %[c], [lr, #4]\n\t"
"str %[m], [lr, #8]\n\t"
"str %[len], [lr, #12]\n\t"
"ldr r8, [r12, #16]\n\t"
"ldr r9, [r12, #20]\n\t"
"ldr r10, [r12, #24]\n\t"
"ldr r11, [r12, #28]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #16]\n\t"
"str r5, [lr, #20]\n\t"
"str r6, [lr, #24]\n\t"
"str r7, [lr, #28]\n\t"
"ldr r4, [sp]\n\t"
"ldr r5, [sp, #4]\n\t"
"ldr r6, [sp, #8]\n\t"
"ldr r7, [sp, #12]\n\t"
"ldr r8, [r12, #32]\n\t"
"ldr r9, [r12, #36]\n\t"
"ldr r10, [r12, #40]\n\t"
"ldr r11, [r12, #44]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #32]\n\t"
"str r5, [lr, #36]\n\t"
"str r6, [lr, #40]\n\t"
"str r7, [lr, #44]\n\t"
"ldr r4, [sp, #16]\n\t"
"ldr r5, [sp, #20]\n\t"
"ldr r6, [sp, #24]\n\t"
"ldr r7, [sp, #28]\n\t"
"ldr r8, [r12, #48]\n\t"
"ldr r9, [r12, #52]\n\t"
"ldr r10, [r12, #56]\n\t"
"ldr r11, [r12, #60]\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"eor r6, r6, r10\n\t"
"eor r7, r7, r11\n\t"
"str r4, [lr, #48]\n\t"
"str r5, [lr, #52]\n\t"
"str r6, [lr, #56]\n\t"
"str r7, [lr, #60]\n\t"
"ldr %[len], [sp, #44]\n\t"
"add r12, r12, #0x40\n\t"
"add lr, lr, #0x40\n\t"
"str r12, [sp, #40]\n\t"
"str lr, [sp, #36]\n\t"
"subs %[len], %[len], #0x40\n\t"
"ldr lr, [sp, #32]\n\t"
"str %[len], [sp, #44]\n\t"
"bne L_chacha_arm32_crypt_block_%=\n\t"
"b L_chacha_arm32_crypt_done_%=\n\t"
"\n"
"L_chacha_arm32_crypt_lt_block_%=: \n\t"
/* Store in over field of ChaCha. */
"ldr lr, [sp, #32]\n\t"
"add r12, lr, #0x44\n\t"
"stm r12!, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t"
"ldm sp, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t"
"stm r12, {%[ctx], %[c], %[m], %[len], r4, r5, r6, r7}\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
"ldr %[m], [sp, #40]\n\t"
"ldr %[len], [sp, #44]\n\t"
#else
"ldrd %[m], %[len], [sp, #40]\n\t"
#endif
"ldr %[c], [sp, #36]\n\t"
"rsb r12, %[len], #0x40\n\t"
"str r12, [lr, #64]\n\t"
"add lr, lr, #0x44\n\t"
"\n"
"L_chacha_arm32_crypt_16byte_loop_%=: \n\t"
"cmp %[len], #16\n\t"
"blt L_chacha_arm32_crypt_word_loop_%=\n\t"
/* 16 bytes of state XORed into message. */
"ldm lr!, {r4, r5, r6, r7}\n\t"
"ldr r8, [%[m]]\n\t"
"ldr r9, [%[m], #4]\n\t"
"ldr r10, [%[m], #8]\n\t"
"ldr r11, [%[m], #12]\n\t"
"eor r8, r8, r4\n\t"
"eor r9, r9, r5\n\t"
"eor r10, r10, r6\n\t"
"eor r11, r11, r7\n\t"
"subs %[len], %[len], #16\n\t"
"str r8, [%[c]]\n\t"
"str r9, [%[c], #4]\n\t"
"str r10, [%[c], #8]\n\t"
"str r11, [%[c], #12]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"add %[m], %[m], #16\n\t"
"add %[c], %[c], #16\n\t"
"b L_chacha_arm32_crypt_16byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_word_loop_%=: \n\t"
"cmp %[len], #4\n\t"
"blt L_chacha_arm32_crypt_byte_start_%=\n\t"
/* 4 bytes of state XORed into message. */
"ldr r4, [lr]\n\t"
"ldr r8, [%[m]]\n\t"
"eor r8, r8, r4\n\t"
"subs %[len], %[len], #4\n\t"
"str r8, [%[c]]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"add lr, lr, #4\n\t"
"add %[m], %[m], #4\n\t"
"add %[c], %[c], #4\n\t"
"b L_chacha_arm32_crypt_word_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_byte_start_%=: \n\t"
"ldr r4, [lr]\n\t"
"\n"
"L_chacha_arm32_crypt_byte_loop_%=: \n\t"
"ldrb r8, [%[m]]\n\t"
"eor r8, r8, r4\n\t"
"subs %[len], %[len], #1\n\t"
"strb r8, [%[c]]\n\t"
"beq L_chacha_arm32_crypt_done_%=\n\t"
"lsr r4, r4, #8\n\t"
"add %[m], %[m], #1\n\t"
"add %[c], %[c], #1\n\t"
"b L_chacha_arm32_crypt_byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_crypt_done_%=: \n\t"
"add sp, sp, #52\n\t"
: [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
);
}
void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p, word32 len_p)
{
register byte* over asm ("r0") = (byte*)over_p;
register byte* output asm ("r1") = (byte*)output_p;
register const byte* input asm ("r2") = (const byte*)input_p;
register word32 len asm ("r3") = (word32)len_p;
__asm__ __volatile__ (
"\n"
"L_chacha_arm32_over_16byte_loop_%=: \n\t"
"cmp %[len], #16\n\t"
"blt L_chacha_arm32_over_word_loop_%=\n\t"
/* 16 bytes of state XORed into message. */
"ldr r12, [%[over]]\n\t"
"ldr lr, [%[over], #4]\n\t"
"ldr r4, [%[over], #8]\n\t"
"ldr r5, [%[over], #12]\n\t"
"ldr r6, [%[input]]\n\t"
"ldr r7, [%[input], #4]\n\t"
"ldr r8, [%[input], #8]\n\t"
"ldr r9, [%[input], #12]\n\t"
"eor r12, r12, r6\n\t"
"eor lr, lr, r7\n\t"
"eor r4, r4, r8\n\t"
"eor r5, r5, r9\n\t"
"subs %[len], %[len], #16\n\t"
"str r12, [%[output]]\n\t"
"str lr, [%[output], #4]\n\t"
"str r4, [%[output], #8]\n\t"
"str r5, [%[output], #12]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #16\n\t"
"add %[input], %[input], #16\n\t"
"add %[output], %[output], #16\n\t"
"b L_chacha_arm32_over_16byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_word_loop_%=: \n\t"
"cmp %[len], #4\n\t"
"blt L_chacha_arm32_over_byte_loop_%=\n\t"
/* 4 bytes of state XORed into message. */
"ldr r12, [%[over]]\n\t"
"ldr r6, [%[input]]\n\t"
"eor r12, r12, r6\n\t"
"subs %[len], %[len], #4\n\t"
"str r12, [%[output]]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #4\n\t"
"add %[input], %[input], #4\n\t"
"add %[output], %[output], #4\n\t"
"b L_chacha_arm32_over_word_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_byte_loop_%=: \n\t"
/* 1 byte of state XORed into message. */
"ldrb r12, [%[over]]\n\t"
"ldrb r6, [%[input]]\n\t"
"eor r12, r12, r6\n\t"
"subs %[len], %[len], #1\n\t"
"strb r12, [%[output]]\n\t"
"beq L_chacha_arm32_over_done_%=\n\t"
"add %[over], %[over], #1\n\t"
"add %[input], %[input], #1\n\t"
"add %[output], %[output], #1\n\t"
"b L_chacha_arm32_over_byte_loop_%=\n\t"
"\n"
"L_chacha_arm32_over_done_%=: \n\t"
: [over] "+r" (over), [output] "+r" (output), [input] "+r" (input), [len] "+r" (len)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */


@ -0,0 +1,356 @@
/* armv8-32-poly1305-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_POLY1305
.text
.align 4
.globl poly1305_blocks_arm32_16
.type poly1305_blocks_arm32_16, %function
poly1305_blocks_arm32_16:
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
sub sp, sp, #28
cmp r2, #0
beq L_poly1305_arm32_16_done
add lr, sp, #12
stm lr, {r0, r1, r2, r3}
# Get h pointer
add lr, r0, #16
ldm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
# Add m to h
ldr r1, [sp, #16]
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r9, [r1, #8]
ldr r10, [r1, #12]
ldr r11, [sp, #24]
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r9
adcs r7, r7, r10
add r1, r1, #16
adc r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
stm lr, {r4, r5, r6, r7, r8}
#else
# h[0]-h[2] in r4-r6 for multiplication.
str r7, [lr, #12]
str r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
str r1, [sp, #16]
ldr r1, [sp, #12]
# Multiply h by r
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
ldr r3, [r1]
eor r0, r0, r0
# r[0] * h[0]
# h[0] in r4
umull r4, r5, r3, r4
# r[0] * h[2]
# h[2] in r6
umull r6, r7, r3, r6
# r[0] * h[4]
# h[4] in r8
mul r8, r3, r8
# r[0] * h[1]
ldr r2, [lr, #4]
mov r12, r0
umlal r5, r12, r3, r2
# r[0] * h[3]
ldr r2, [lr, #12]
adds r6, r6, r12
adc r7, r7, r0
umlal r7, r8, r3, r2
# r[1] * h[0]
ldr r3, [r1, #4]
ldr r2, [lr]
mov r12, r0
umlal r5, r12, r3, r2
# r[1] * h[1]
ldr r2, [lr, #4]
adds r6, r6, r12
adc r12, r0, r0
umlal r6, r12, r3, r2
# r[1] * h[2]
ldr r2, [lr, #8]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[1] * h[3]
ldr r2, [lr, #12]
adds r8, r8, r12
adc r9, r0, r0
umlal r8, r9, r3, r2
# r[1] * h[4]
ldr r2, [lr, #16]
mla r9, r3, r2, r9
# r[2] * h[0]
ldr r3, [r1, #8]
ldr r2, [lr]
mov r12, r0
umlal r6, r12, r3, r2
# r[2] * h[1]
ldr r2, [lr, #4]
adds r7, r7, r12
adc r12, r0, r0
umlal r7, r12, r3, r2
# r[2] * h[2]
ldr r2, [lr, #8]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[2] * h[3]
ldr r2, [lr, #12]
adds r9, r9, r12
adc r10, r0, r0
umlal r9, r10, r3, r2
# r[2] * h[4]
ldr r2, [lr, #16]
mla r10, r3, r2, r10
# r[3] * h[0]
ldr r3, [r1, #12]
ldr r2, [lr]
mov r12, r0
umlal r7, r12, r3, r2
# r[3] * h[1]
ldr r2, [lr, #4]
adds r8, r8, r12
adc r12, r0, r0
umlal r8, r12, r3, r2
# r[3] * h[2]
ldr r2, [lr, #8]
adds r9, r9, r12
adc r10, r10, r0
umlal r9, r10, r3, r2
# r[3] * h[3]
ldr r2, [lr, #12]
mov r11, r0
umlal r10, r11, r3, r2
# r[3] * h[4]
ldr r2, [lr, #16]
mov r12, r0
mla r11, r3, r2, r11
#else
ldm r1, {r0, r1, r2, r3}
# r[0] * h[0]
umull r10, r11, r0, r4
# r[1] * h[0]
umull r12, r7, r1, r4
# r[0] * h[1]
umaal r11, r12, r0, r5
# r[2] * h[0]
umull r8, r9, r2, r4
# r[1] * h[1]
umaal r12, r8, r1, r5
# r[0] * h[2]
umaal r12, r7, r0, r6
# r[3] * h[0]
umaal r8, r9, r3, r4
stm sp, {r10, r11, r12}
# r[2] * h[1]
umaal r7, r8, r2, r5
# Replace h[0] with h[3]
ldr r4, [lr, #12]
# r[1] * h[2]
umull r10, r11, r1, r6
# r[2] * h[2]
umaal r8, r9, r2, r6
# r[0] * h[3]
umaal r7, r10, r0, r4
# r[3] * h[1]
umaal r8, r11, r3, r5
# r[1] * h[3]
umaal r8, r10, r1, r4
# r[3] * h[2]
umaal r9, r11, r3, r6
# r[2] * h[3]
umaal r9, r10, r2, r4
# Replace h[1] with h[4]
ldr r5, [lr, #16]
# r[3] * h[3]
umaal r10, r11, r3, r4
mov r12, #0
# r[0] * h[4]
umaal r8, r12, r0, r5
# r[1] * h[4]
umaal r9, r12, r1, r5
# r[2] * h[4]
umaal r10, r12, r2, r5
# r[3] * h[4]
umaal r11, r12, r3, r5
# DONE
ldm sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
# r12 will be zero because r is masked.
# Load length
ldr r2, [sp, #20]
# Reduce mod 2^130 - 5
bic r3, r8, #3
and r8, r8, #3
adds r4, r4, r3
lsr r3, r3, #2
adcs r5, r5, r9
orr r3, r3, r9, LSL #30
adcs r6, r6, r10
lsr r9, r9, #2
adcs r7, r7, r11
orr r9, r9, r10, LSL #30
adc r8, r8, r12
lsr r10, r10, #2
adds r4, r4, r3
orr r10, r10, r11, LSL #30
adcs r5, r5, r9
lsr r11, r11, #2
adcs r6, r6, r10
adcs r7, r7, r11
adc r8, r8, r12
# Sub 16 from length.
subs r2, r2, #16
# Store length.
str r2, [sp, #20]
# Loop again if more message to do.
bgt L_poly1305_arm32_16_loop
stm lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
add sp, sp, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16
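The "Reduce mod 2^130 - 5" block uses the identity 2^130 ≡ 5 (mod 2^130 - 5). Writing c for the part of the product at and above bit 130 (bic extracts its low bits from h[4]; r9-r11 hold the higher limbs), the two carry chains compute

    h = h_lo + c*2^130 ≡ h_lo + 5*c = h_lo + 4*c + c  (mod 2^130 - 5)

adding the limbs of 4*c unshifted first, then c via the lsr/orr pairs that shift the limb chain right by two. The result is only partially reduced (h[4] can keep a couple of bits), which the next loop iteration, or the final addition of 5 in poly1305_final, absorbs.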
.text
.type L_poly1305_arm32_clamp, %object
.size L_poly1305_arm32_clamp, 16
.align 4
L_poly1305_arm32_clamp:
.word 0xfffffff
.word 0xffffffc
.word 0xffffffc
.word 0xffffffc
.text
.align 4
.globl poly1305_set_key
.type poly1305_set_key, %function
poly1305_set_key:
push {r4, r5, r6, r7, r8, lr}
# Load mask.
adr lr, L_poly1305_arm32_clamp
ldm lr, {r6, r7, r8, r12}
# Load and cache padding.
ldr r2, [r1, #16]
ldr r3, [r1, #20]
ldr r4, [r1, #24]
ldr r5, [r1, #28]
add lr, r0, #36
stm lr, {r2, r3, r4, r5}
# Load, mask and store r.
ldr r2, [r1]
ldr r3, [r1, #4]
ldr r4, [r1, #8]
ldr r5, [r1, #12]
and r2, r2, r6
and r3, r3, r7
and r4, r4, r8
and r5, r5, r12
add lr, r0, #0
stm lr, {r2, r3, r4, r5}
# h (accumulator) = 0
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
eor r12, r12, r12
add lr, r0, #16
eor r5, r5, r5
stm lr, {r5, r6, r7, r8, r12}
# Zero leftover
str r5, [r0, #52]
pop {r4, r5, r6, r7, r8, pc}
.size poly1305_set_key,.-poly1305_set_key
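L_poly1305_arm32_clamp is the standard RFC 7539 clamp on r: the top four bits of every 32-bit word are cleared, as are the bottom two bits of the upper three words. The equivalent operation in C, as a sketch:

static void poly1305_clamp(word32 r[4])
{
    r[0] &= 0x0fffffff;
    r[1] &= 0x0ffffffc;
    r[2] &= 0x0ffffffc;
    r[3] &= 0x0ffffffc;
}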
.text
.align 4
.globl poly1305_final
.type poly1305_final, %function
poly1305_final:
push {r4, r5, r6, r7, r8, r9, lr}
add r9, r0, #16
ldm r9, {r4, r5, r6, r7, r8}
# Add 5 and check for h larger than p.
adds r2, r4, #5
adcs r2, r5, #0
adcs r2, r6, #0
adcs r2, r7, #0
adc r2, r8, #0
sub r2, r2, #4
lsr r2, r2, #31
sub r2, r2, #1
and r2, r2, #5
# Add 0/5 to h.
adds r4, r4, r2
adcs r5, r5, #0
adcs r6, r6, #0
adc r7, r7, #0
# Add padding
add r9, r0, #36
ldm r9, {r2, r3, r12, lr}
adds r4, r4, r2
adcs r5, r5, r3
adcs r6, r6, r12
adc r7, r7, lr
# Store MAC
str r4, [r1]
str r5, [r1, #4]
str r6, [r1, #8]
str r7, [r1, #12]
# Zero out h.
eor r4, r4, r4
eor r5, r5, r5
eor r6, r6, r6
eor r7, r7, r7
eor r8, r8, r8
add r9, r0, #16
stm r9, {r4, r5, r6, r7, r8}
# Zero out r.
add r9, r0, #0
stm r9, {r4, r5, r6, r7}
# Zero out padding.
add r9, r0, #36
stm r9, {r4, r5, r6, r7}
pop {r4, r5, r6, r7, r8, r9, pc}
.size poly1305_final,.-poly1305_final
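The sub/lsr/sub/and sequence is a branchless "h >= p" test: after the adc, r2 holds the top limb of h + 5, which is at least 4 exactly when h >= 2^130 - 5. That is turned into 5 or 0 and added back, so the stored MAC is (h mod 2^130 - 5) plus the cached key padding, taken mod 2^128 (h[4] is deliberately dropped). In C, as a sketch with t the top limb of h + 5:

    word32 mask5 = (((t - 4) >> 31) - 1) & 5;  /* 5 when h >= 2^130 - 5, else 0 */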
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */


@ -0,0 +1,388 @@
/* armv8-32-poly1305-asm
*
* Copyright (C) 2006-2024 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./poly1305/poly1305.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.c
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__ asm
#define __volatile__ volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>
void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* m asm ("r1") = (const byte*)m_p;
register word32 len asm ("r2") = (word32)len_p;
register int notLast asm ("r3") = (int)notLast_p;
__asm__ __volatile__ (
"sub sp, sp, #28\n\t"
"cmp %[len], #0\n\t"
"beq L_poly1305_arm32_16_done_%=\n\t"
"add lr, sp, #12\n\t"
"stm lr, {%[ctx], %[m], %[len], %[notLast]}\n\t"
/* Get h pointer */
"add lr, %[ctx], #16\n\t"
"ldm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_loop_%=: \n\t"
/* Add m to h */
"ldr %[m], [sp, #16]\n\t"
"ldr %[len], [%[m]]\n\t"
"ldr %[notLast], [%[m], #4]\n\t"
"ldr r9, [%[m], #8]\n\t"
"ldr r10, [%[m], #12]\n\t"
"ldr r11, [sp, #24]\n\t"
"adds r4, r4, %[len]\n\t"
"adcs r5, r5, %[notLast]\n\t"
"adcs r6, r6, r9\n\t"
"adcs r7, r7, r10\n\t"
"add %[m], %[m], #16\n\t"
"adc r8, r8, r11\n\t"
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
"stm lr, {r4, r5, r6, r7, r8}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"str r7, [lr, #12]\n\t"
"str r8, [lr, #16]\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
"str %[m], [sp, #16]\n\t"
"ldr %[m], [sp, #12]\n\t"
/* Multiply h by r */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"ldr %[notLast], [%[m]]\n\t"
"eor %[ctx], %[ctx], %[ctx]\n\t"
/* r[0] * h[0] */
/* h[0] in r4 */
"umull r4, r5, %[notLast], r4\n\t"
/* r[0] * h[2] */
/* h[2] in r6 */
"umull r6, r7, %[notLast], r6\n\t"
/* r[0] * h[4] */
/* h[4] in r8 */
"mul r8, %[notLast], r8\n\t"
/* r[0] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[0] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r6, r6, r12\n\t"
"adc r7, r7, %[ctx]\n\t"
"umlal r7, r8, %[notLast], %[len]\n\t"
/* r[1] * h[0] */
"ldr %[notLast], [%[m], #4]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r5, r12, %[notLast], %[len]\n\t"
/* r[1] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r6, r6, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[1] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[1] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r8, r8, r12\n\t"
"adc r9, %[ctx], %[ctx]\n\t"
"umlal r8, r9, %[notLast], %[len]\n\t"
/* r[1] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r9, %[notLast], %[len], r9\n\t"
/* r[2] * h[0] */
"ldr %[notLast], [%[m], #8]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r6, r12, %[notLast], %[len]\n\t"
/* r[2] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r7, r7, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[2] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[2] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, %[ctx], %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[2] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mla r10, %[notLast], %[len], r10\n\t"
/* r[3] * h[0] */
"ldr %[notLast], [%[m], #12]\n\t"
"ldr %[len], [lr]\n\t"
"mov r12, %[ctx]\n\t"
"umlal r7, r12, %[notLast], %[len]\n\t"
/* r[3] * h[1] */
"ldr %[len], [lr, #4]\n\t"
"adds r8, r8, r12\n\t"
"adc r12, %[ctx], %[ctx]\n\t"
"umlal r8, r12, %[notLast], %[len]\n\t"
/* r[3] * h[2] */
"ldr %[len], [lr, #8]\n\t"
"adds r9, r9, r12\n\t"
"adc r10, r10, %[ctx]\n\t"
"umlal r9, r10, %[notLast], %[len]\n\t"
/* r[3] * h[3] */
"ldr %[len], [lr, #12]\n\t"
"mov r11, %[ctx]\n\t"
"umlal r10, r11, %[notLast], %[len]\n\t"
/* r[3] * h[4] */
"ldr %[len], [lr, #16]\n\t"
"mov r12, %[ctx]\n\t"
"mla r11, %[notLast], %[len], r11\n\t"
#else
"ldm %[m], {%[ctx], %[m], %[len], %[notLast]}\n\t"
/* r[0] * h[0] */
"umull r10, r11, %[ctx], r4\n\t"
/* r[1] * h[0] */
"umull r12, r7, %[m], r4\n\t"
/* r[0] * h[1] */
"umaal r11, r12, %[ctx], r5\n\t"
/* r[2] * h[0] */
"umull r8, r9, %[len], r4\n\t"
/* r[1] * h[1] */
"umaal r12, r8, %[m], r5\n\t"
/* r[0] * h[2] */
"umaal r12, r7, %[ctx], r6\n\t"
/* r[3] * h[0] */
"umaal r8, r9, %[notLast], r4\n\t"
"stm sp, {r10, r11, r12}\n\t"
/* r[2] * h[1] */
"umaal r7, r8, %[len], r5\n\t"
/* Replace h[0] with h[3] */
"ldr r4, [lr, #12]\n\t"
/* r[1] * h[2] */
"umull r10, r11, %[m], r6\n\t"
/* r[2] * h[2] */
"umaal r8, r9, %[len], r6\n\t"
/* r[0] * h[3] */
"umaal r7, r10, %[ctx], r4\n\t"
/* r[3] * h[1] */
"umaal r8, r11, %[notLast], r5\n\t"
/* r[1] * h[3] */
"umaal r8, r10, %[m], r4\n\t"
/* r[3] * h[2] */
"umaal r9, r11, %[notLast], r6\n\t"
/* r[2] * h[3] */
"umaal r9, r10, %[len], r4\n\t"
/* Replace h[1] with h[4] */
"ldr r5, [lr, #16]\n\t"
/* r[3] * h[3] */
"umaal r10, r11, %[notLast], r4\n\t"
"mov r12, #0\n\t"
/* r[0] * h[4] */
"umaal r8, r12, %[ctx], r5\n\t"
/* r[1] * h[4] */
"umaal r9, r12, %[m], r5\n\t"
/* r[2] * h[4] */
"umaal r10, r12, %[len], r5\n\t"
/* r[3] * h[4] */
"umaal r11, r12, %[notLast], r5\n\t"
/* DONE */
"ldm sp, {r4, r5, r6}\n\t"
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
/* r12 will be zero because r is masked. */
/* Load length */
"ldr %[len], [sp, #20]\n\t"
/* Reduce mod 2^130 - 5 */
"bic %[notLast], r8, #3\n\t"
"and r8, r8, #3\n\t"
"adds r4, r4, %[notLast]\n\t"
"lsr %[notLast], %[notLast], #2\n\t"
"adcs r5, r5, r9\n\t"
"orr %[notLast], %[notLast], r9, LSL #30\n\t"
"adcs r6, r6, r10\n\t"
"lsr r9, r9, #2\n\t"
"adcs r7, r7, r11\n\t"
"orr r9, r9, r10, LSL #30\n\t"
"adc r8, r8, r12\n\t"
"lsr r10, r10, #2\n\t"
"adds r4, r4, %[notLast]\n\t"
"orr r10, r10, r11, LSL #30\n\t"
"adcs r5, r5, r9\n\t"
"lsr r11, r11, #2\n\t"
"adcs r6, r6, r10\n\t"
"adcs r7, r7, r11\n\t"
"adc r8, r8, r12\n\t"
/* Sub 16 from length. */
"subs %[len], %[len], #16\n\t"
/* Store length. */
"str %[len], [sp, #20]\n\t"
/* Loop again if more message to do. */
"bgt L_poly1305_arm32_16_loop_%=\n\t"
"stm lr, {r4, r5, r6, r7, r8}\n\t"
"\n"
"L_poly1305_arm32_16_done_%=: \n\t"
"add sp, sp, #28\n\t"
: [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), [notLast] "+r" (notLast)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
);
}
static const uint32_t L_poly1305_arm32_clamp[] = {
0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc,
};
void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register const byte* key asm ("r1") = (const byte*)key_p;
register uint32_t* L_poly1305_arm32_clamp_c asm ("r2") = (uint32_t*)&L_poly1305_arm32_clamp;
__asm__ __volatile__ (
/* Load mask. */
"mov lr, %[L_poly1305_arm32_clamp]\n\t"
"ldm lr, {r6, r7, r8, r12}\n\t"
/* Load and cache padding. */
"ldr r2, [%[key], #16]\n\t"
"ldr r3, [%[key], #20]\n\t"
"ldr r4, [%[key], #24]\n\t"
"ldr r5, [%[key], #28]\n\t"
"add lr, %[ctx], #36\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
/* Load, mask and store r. */
"ldr r2, [%[key]]\n\t"
"ldr r3, [%[key], #4]\n\t"
"ldr r4, [%[key], #8]\n\t"
"ldr r5, [%[key], #12]\n\t"
"and r2, r2, r6\n\t"
"and r3, r3, r7\n\t"
"and r4, r4, r8\n\t"
"and r5, r5, r12\n\t"
"add lr, %[ctx], #0\n\t"
"stm lr, {r2, r3, r4, r5}\n\t"
/* h (accumulator) = 0 */
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"eor r12, r12, r12\n\t"
"add lr, %[ctx], #16\n\t"
"eor r5, r5, r5\n\t"
"stm lr, {r5, r6, r7, r8, r12}\n\t"
/* Zero leftover */
"str r5, [%[ctx], #52]\n\t"
: [ctx] "+r" (ctx), [key] "+r" (key), [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
:
: "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
);
}
void poly1305_final(Poly1305* ctx_p, byte* mac_p)
{
register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p;
register byte* mac asm ("r1") = (byte*)mac_p;
__asm__ __volatile__ (
"add r9, %[ctx], #16\n\t"
"ldm r9, {r4, r5, r6, r7, r8}\n\t"
/* Add 5 and check for h larger than p. */
"adds r2, r4, #5\n\t"
"adcs r2, r5, #0\n\t"
"adcs r2, r6, #0\n\t"
"adcs r2, r7, #0\n\t"
"adc r2, r8, #0\n\t"
"sub r2, r2, #4\n\t"
"lsr r2, r2, #31\n\t"
"sub r2, r2, #1\n\t"
"and r2, r2, #5\n\t"
/* Add 0/5 to h. */
"adds r4, r4, r2\n\t"
"adcs r5, r5, #0\n\t"
"adcs r6, r6, #0\n\t"
"adc r7, r7, #0\n\t"
/* Add padding */
"add r9, %[ctx], #36\n\t"
"ldm r9, {r2, r3, r12, lr}\n\t"
"adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t"
"adcs r6, r6, r12\n\t"
"adc r7, r7, lr\n\t"
/* Store MAC */
"str r4, [%[mac]]\n\t"
"str r5, [%[mac], #4]\n\t"
"str r6, [%[mac], #8]\n\t"
"str r7, [%[mac], #12]\n\t"
/* Zero out h. */
"eor r4, r4, r4\n\t"
"eor r5, r5, r5\n\t"
"eor r6, r6, r6\n\t"
"eor r7, r7, r7\n\t"
"eor r8, r8, r8\n\t"
"add r9, %[ctx], #16\n\t"
"stm r9, {r4, r5, r6, r7, r8}\n\t"
/* Zero out r. */
"add r9, %[ctx], #0\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
/* Zero out padding. */
"add r9, %[ctx], #36\n\t"
"stm r9, {r4, r5, r6, r7}\n\t"
: [ctx] "+r" (ctx), [mac] "+r" (mac)
:
: "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
);
}
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */
#endif /* WOLFSSL_ARMASM */
#endif /* WOLFSSL_ARMASM_INLINE */


@ -32,6 +32,8 @@
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.type L_sha3_arm2_neon_rt, %object
.size L_sha3_arm2_neon_rt, 192
@ -85,60 +87,6 @@ L_sha3_arm2_neon_rt:
.word 0x0
.word 0x80008008
.word 0x80000000
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
#ifndef WOLFSSL_ARMASM_NO_NEON
.text
.align 4
.globl BlockSha3
@ -407,6 +355,59 @@ L_sha3_arm32_neon_begin:
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
.text
.type L_sha3_arm2_rt, %object
.size L_sha3_arm2_rt, 192
.align 4
L_sha3_arm2_rt:
.word 0x1
.word 0x0
.word 0x8082
.word 0x0
.word 0x808a
.word 0x80000000
.word 0x80008000
.word 0x80000000
.word 0x808b
.word 0x0
.word 0x80000001
.word 0x0
.word 0x80008081
.word 0x80000000
.word 0x8009
.word 0x80000000
.word 0x8a
.word 0x0
.word 0x88
.word 0x0
.word 0x80008009
.word 0x0
.word 0x8000000a
.word 0x0
.word 0x8000808b
.word 0x0
.word 0x8b
.word 0x80000000
.word 0x8089
.word 0x80000000
.word 0x8003
.word 0x80000000
.word 0x8002
.word 0x80000000
.word 0x80
.word 0x80000000
.word 0x800a
.word 0x0
.word 0x8000000a
.word 0x80000000
.word 0x80008081
.word 0x80000000
.word 0x8080
.word 0x80000000
.word 0x80000001
.word 0x0
.word 0x80008008
.word 0x80000000
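For reference, this is the same set of 24 Keccak round constants as L_sha3_arm2_neon_rt above, stored as 32-bit halves (low word first) so the no-NEON code can fetch them with plain word loads. For example the final constant maps as:

    0x8000000080008008  ->  .word 0x80008008  (low half)
                            .word 0x80000000  (high half)

The inline-assembly version of this file keeps the same table as uint64_t values.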
.text
.align 4
.globl BlockSha3
@ -2391,6 +2392,7 @@ L_sha3_arm32_begin:
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.size BlockSha3,.-BlockSha3
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */


@ -51,6 +51,8 @@
#define __asm__ __asm
#define __volatile__ volatile
#endif /* __KEIL__ */
#ifdef WOLFSSL_SHA3
#ifndef WOLFSSL_ARMASM_NO_NEON
static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
@ -66,29 +68,12 @@ static const uint64_t L_sha3_arm2_neon_rt[] = {
0x0000000080000001UL, 0x8000000080008008UL,
};
static const uint64_t L_sha3_arm2_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
0x000000000000808bUL, 0x0000000080000001UL,
0x8000000080008081UL, 0x8000000000008009UL,
0x000000000000008aUL, 0x0000000000000088UL,
0x0000000080008009UL, 0x000000008000000aUL,
0x000000008000808bUL, 0x800000000000008bUL,
0x8000000000008089UL, 0x8000000000008003UL,
0x8000000000008002UL, 0x8000000000000080UL,
0x000000000000800aUL, 0x800000008000000aUL,
0x8000000080008081UL, 0x8000000000008080UL,
0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifndef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
register uint64_t* L_sha3_arm2_neon_rt_c asm ("r1") = (uint64_t*)&L_sha3_arm2_neon_rt;
register uint64_t* L_sha3_arm2_rt_c asm ("r2") = (uint64_t*)&L_sha3_arm2_rt;
__asm__ __volatile__ (
"sub sp, sp, #16\n\t"
@ -348,16 +333,31 @@ void BlockSha3(word64* state_p)
"vst1.8 {d20-d23}, [%[state]]!\n\t"
"vst1.8 {d24}, [%[state]]\n\t"
"add sp, sp, #16\n\t"
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c)
: [state] "+r" (state), [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
:
: "memory", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
: "memory", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc"
);
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#ifdef WOLFSSL_ARMASM_NO_NEON
static const uint64_t L_sha3_arm2_rt[] = {
0x0000000000000001UL, 0x0000000000008082UL,
0x800000000000808aUL, 0x8000000080008000UL,
0x000000000000808bUL, 0x0000000080000001UL,
0x8000000080008081UL, 0x8000000000008009UL,
0x000000000000008aUL, 0x0000000000000088UL,
0x0000000080008009UL, 0x000000008000000aUL,
0x000000008000808bUL, 0x800000000000008bUL,
0x8000000000008089UL, 0x8000000000008003UL,
0x8000000000008002UL, 0x8000000000000080UL,
0x000000000000800aUL, 0x800000008000000aUL,
0x8000000080008081UL, 0x8000000000008080UL,
0x0000000080000001UL, 0x8000000080008008UL,
};
#include <wolfssl/wolfcrypt/sha3.h>
#ifdef WOLFSSL_ARMASM_NO_NEON
void BlockSha3(word64* state_p)
{
register word64* state asm ("r0") = (word64*)state_p;
@ -2348,6 +2348,7 @@ void BlockSha3(word64* state_p)
}
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_SHA3 */
#endif /* !__aarch64__ && __arm__ && !__thumb__ */
#endif /* WOLFSSL_ARMASM */
#endif /* !defined(__aarch64__) && defined(__arm__) && !defined(__thumb__) */


@ -29,7 +29,7 @@
#include <wolfssl/wolfcrypt/settings.h>
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON)
#if defined(WOLFSSL_ARMASM)
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>
@ -73,15 +73,43 @@
* Set up iv (nonce). Earlier versions used 64 bits instead of 96; this version
* uses the typical AEAD 96-bit nonce and can do record sizes of 256 GB.
*/
int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
int wc_Chacha_SetIV(ChaCha* ctx, const byte* iv, word32 counter)
{
#ifndef __aarch64__
int ret = 0;
#ifdef CHACHA_AEAD_TEST
word32 i;
printf("NONCE : ");
if (iv != NULL) {
for (i = 0; i < CHACHA_IV_BYTES; i++) {
printf("%02x", iv[i]);
}
}
printf("\n\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (iv == NULL)) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
/* No unused bytes to XOR into input. */
ctx->left = 0;
/* Set counter and IV into state. */
wc_chacha_setiv(ctx->X, iv, counter);
}
return ret;
#else
word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */
#ifdef CHACHA_AEAD_TEST
word32 i;
printf("NONCE : ");
for (i = 0; i < CHACHA_IV_BYTES; i++) {
printf("%02x", inIv[i]);
printf("%02x", iv[i]);
}
printf("\n\n");
#endif
@ -89,7 +117,7 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
if (ctx == NULL)
return BAD_FUNC_ARG;
XMEMCPY(temp, inIv, CHACHA_IV_BYTES);
XMEMCPY(temp, iv, CHACHA_IV_BYTES);
ctx->left = 0;
ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */
@ -98,18 +126,54 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */
return 0;
#endif
}
#ifdef __aarch64__
/* "expand 32-byte k" as unsigned 32 byte */
static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
/* "expand 16-byte k" as unsigned 16 byte */
static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
#endif
/**
* Key setup. 8 word iv (nonce)
*/
int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
{
#ifndef __aarch64__
int ret = 0;
#ifdef CHACHA_AEAD_TEST
printf("ChaCha key used :\n");
if (key != NULL) {
word32 i;
for (i = 0; i < keySz; i++) {
printf("%02x", key[i]);
if ((i % 8) == 7)
printf("\n");
}
}
printf("\n\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (key == NULL)) {
ret = BAD_FUNC_ARG;
}
else if ((keySz != (CHACHA_MAX_KEY_SZ / 2)) &&
(keySz != CHACHA_MAX_KEY_SZ )) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
ctx->left = 0;
wc_chacha_setkey(ctx->X, key, keySz);
}
return ret;
#else
const word32* constants;
const byte* k;
@ -169,8 +233,10 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
ctx->left = 0;
return 0;
#endif
}
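A short usage sketch tying the three public calls together (error handling elided; buffer names are illustrative only):

static void chacha_example(const byte key[CHACHA_MAX_KEY_SZ],
                           const byte nonce[CHACHA_IV_BYTES],
                           const byte* msg, byte* enc, word32 msgSz)
{
    ChaCha ctx;
    wc_Chacha_SetKey(&ctx, key, CHACHA_MAX_KEY_SZ);
    wc_Chacha_SetIV(&ctx, nonce, 0);  /* block counter starts at 0 */
    wc_Chacha_Process(&ctx, enc, msg, msgSz);
    /* decryption is the same call with ciphertext as the input */
}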
#ifndef WOLFSSL_ARMASM_NO_NEON
static const word32 L_chacha20_neon_inc_first_word[] = {
0x1,
0x0,
@ -2815,7 +2881,6 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
}
/**
* Encrypt a stream of bytes
*/
@ -2862,40 +2927,68 @@ static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
}
}
#endif
/**
* API to encrypt/decrypt a message of any size.
*/
int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
word32 msglen)
word32 len)
{
#ifdef WOLFSSL_ARMASM_NO_NEON
int ret = 0;
if ((ctx == NULL) || (output == NULL) || (input == NULL)) {
ret = BAD_FUNC_ARG;
}
/* Handle left over bytes from last block. */
if ((ret == 0) && (len > 0) && (ctx->left > 0)) {
byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left;
word32 l = min(len, ctx->left);
wc_chacha_use_over(over, output, input, l);
ctx->left -= l;
input += l;
output += l;
len -= l;
}
if ((ret == 0) && (len != 0)) {
wc_chacha_crypt_bytes(ctx, output, input, len);
}
return ret;
#else
if (ctx == NULL || output == NULL || input == NULL)
return BAD_FUNC_ARG;
/* handle left overs */
if (msglen > 0 && ctx->left > 0) {
if (len > 0 && ctx->left > 0) {
byte* out;
word32 i;
out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left;
for (i = 0; i < msglen && i < ctx->left; i++) {
for (i = 0; i < len && i < ctx->left; i++) {
output[i] = (byte)(input[i] ^ out[i]);
}
ctx->left -= i;
msglen -= i;
len -= i;
output += i;
input += i;
}
if (msglen == 0) {
if (len == 0) {
return 0;
}
wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
wc_Chacha_encrypt_bytes(ctx, input, output, len);
return 0;
#endif
}
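Because unused keystream from a partial block is kept in ctx->over and consumed first, chunked calls should produce the same bytes as one whole call. A hedged sketch of that property (assumes len >= 10):

static void chacha_chunked_example(ChaCha* ctx, const byte* in, byte* out,
                                   word32 len)
{
    wc_Chacha_Process(ctx, out, in, 10);               /* partial block */
    wc_Chacha_Process(ctx, out + 10, in + 10, len - 10);
    /* Expected to match a single wc_Chacha_Process(ctx2, out2, in, len)
     * on an identically keyed context. */
}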
#endif /* HAVE_CHACHA */
#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */
#endif /* WOLFSSL_ARMASM */

View File

@ -32,7 +32,6 @@
#include <wolfssl/wolfcrypt/types.h>
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef HAVE_POLY1305
#include <wolfssl/wolfcrypt/poly1305.h>
@ -49,6 +48,8 @@
#include <stdio.h>
#endif
#ifdef __aarch64__
static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
const unsigned char *m, size_t bytes)
{
@ -1118,6 +1119,127 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
return 0;
}
#endif /* HAVE_POLY1305 */
#else
#ifdef __thumb__
/* Process 16 bytes of message at a time.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
* @param [in] bytes Length of message in bytes.
*/
void poly1305_blocks_thumb2(Poly1305* ctx, const unsigned char* m,
size_t bytes)
{
poly1305_blocks_thumb2_16(ctx, m, bytes, 1);
}
/* Process 16 bytes of message.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
*/
void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m)
{
poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#else
/* Process 16 bytes of message at a time.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
* @param [in] bytes Length of message in bytes.
*/
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes)
{
poly1305_blocks_arm32_16(ctx, m, bytes, 1);
}
/* Process 16 bytes of message.
*
* @param [in] ctx Poly1305 context.
* @param [in] m Message to process.
*/
void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m)
{
poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1);
}
#endif
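The notLast argument these wrappers pass as 1 appears to select the standard Poly1305 padding for complete blocks; reading the call sites here, each 16-byte block m is absorbed as

    h = (h + m + notLast * 2^128) * r  (mod 2^130 - 5)

so full blocks get the implicit high bit (2^128), while the final partial block (see wc_Poly1305Final below) writes its 0x01 terminator into the buffer manually and is processed with notLast = 0. This reading is inferred from the callers, not from the assembly itself.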
/* Set the key for the Poly1305 operation.
*
* @param [in] ctx Poly1305 context.
* @param [in] key Key data to use.
* @param [in] keySz Size of key in bytes. Must be 32.
* @return 0 on success.
* @return BAD_FUNC_ARG when ctx or key is NULL or keySz is not 32.
*/
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
int ret = 0;
#ifdef CHACHA_AEAD_TEST
word32 k;
printf("Poly key used:\n");
if (key != NULL) {
for (k = 0; k < keySz; k++) {
printf("%02x", key[k]);
if ((k+1) % 8 == 0)
printf("\n");
}
}
printf("\n");
#endif
/* Validate parameters. */
if ((ctx == NULL) || (key == NULL) || (keySz != 32)) {
ret = BAD_FUNC_ARG;
}
if (ret == 0) {
poly1305_set_key(ctx, key);
}
return ret;
}
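poly1305_set_key() itself is in the new assembly; a hedged sketch of the clamp it is expected to apply to r (RFC 7539, section 2.5), using the r[4]/pad[4] words of the ARM32 Poly1305 struct and the same hypothetical le32_ref() little-endian load as earlier:

static void poly1305_set_key_ref(Poly1305* ctx, const byte key[32])
{
    ctx->r[0]   = le32_ref(key +  0) & 0x0fffffffU;
    ctx->r[1]   = le32_ref(key +  4) & 0x0ffffffcU;
    ctx->r[2]   = le32_ref(key +  8) & 0x0ffffffcU;
    ctx->r[3]   = le32_ref(key + 12) & 0x0ffffffcU;
    ctx->pad[0] = le32_ref(key + 16);    /* s: added to h at the end */
    ctx->pad[1] = le32_ref(key + 20);
    ctx->pad[2] = le32_ref(key + 24);
    ctx->pad[3] = le32_ref(key + 28);
    XMEMSET(ctx->h, 0, sizeof(ctx->h));  /* accumulator starts at 0 */
    ctx->leftover = 0;
}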
/* Finalize the Poly1305 operation calculating the MAC.
*
* @param [in] ctx Poly1305 context.
 * @param [out] mac     Buffer to hold the MAC. Must be at least 16 bytes long.
* @return 0 on success.
* @return BAD_FUNC_ARG when ctx or mac is NULL.
*/
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
int ret = 0;
/* Validate parameters. */
if ((ctx == NULL) || (mac == NULL)) {
ret = BAD_FUNC_ARG;
}
/* Process the remaining partial block - last block. */
if (ret == 0) {
if (ctx->leftover) {
size_t i = ctx->leftover;
ctx->buffer[i++] = 1;
for (; i < POLY1305_BLOCK_SIZE; i++) {
ctx->buffer[i] = 0;
}
#ifdef __thumb__
poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE,
0);
#else
poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0);
#endif
}
poly1305_final(ctx, mac);
}
return ret;
}
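A one-shot usage sketch of the MAC flow these functions implement (16 is POLY1305_BLOCK_SIZE and also the MAC length; error handling elided):

static void poly1305_example(const byte key[32], const byte* msg,
                             word32 msgSz, byte mac[16])
{
    Poly1305 ctx;
    wc_Poly1305SetKey(&ctx, key, 32);   /* one-time key */
    wc_Poly1305Update(&ctx, msg, msgSz);
    wc_Poly1305Final(&ctx, mac);
}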
#endif /* __aarch64__ */
#endif /* HAVE_POLY1305 */
#endif /* WOLFSSL_ARMASM */

View File

@ -107,12 +107,18 @@ WOLFSSL_API int wc_XChacha_SetKey(ChaCha *ctx, const byte *key, word32 keySz,
word32 counter);
#endif
#if defined(WOLFSSL_ARMASM) && defined(__thumb__)
#if defined(WOLFSSL_ARMASM)
#ifndef __aarch64__
void wc_chacha_setiv(word32* x, const byte* iv, word32 counter);
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz);
#endif
#if defined(WOLFSSL_ARMASM_NO_NEON) || defined(__thumb__)
void wc_chacha_use_over(byte* over, byte* output, const byte* input,
word32 len);
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len);
#endif
#endif

View File

@ -98,7 +98,7 @@ typedef struct Poly1305 {
word64 leftover;
unsigned char buffer[POLY1305_BLOCK_SIZE];
unsigned char finished;
#elif defined(WOLFSSL_ARMASM) && defined(__thumb__)
#elif defined(WOLFSSL_ARMASM)
word32 r[4];
word32 h[5];
word32 pad[4];
@ -147,16 +147,16 @@ WOLFSSL_API int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz,
WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional,
word32 addSz, const byte* input, word32 sz, byte* tag, word32 tagSz);
#if defined(__aarch64__ ) && defined(WOLFSSL_ARMASM)
#if defined(WOLFSSL_ARMASM)
#if defined(__aarch64__ )
#define poly1305_blocks poly1305_blocks_aarch64
#define poly1305_block poly1305_block_aarch64
void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
size_t bytes);
void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m);
#endif
#if defined(__thumb__ ) && defined(WOLFSSL_ARMASM)
#else
#if defined(__thumb__)
#define poly1305_blocks poly1305_blocks_thumb2
#define poly1305_block poly1305_block_thumb2
@ -166,9 +166,20 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char *m);
void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m,
word32 len, int notLast);
#else
#define poly1305_blocks poly1305_blocks_arm32
#define poly1305_block poly1305_block_arm32
void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes);
void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m);
void poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len,
int notLast);
#endif
void poly1305_set_key(Poly1305* ctx, const byte* key);
void poly1305_final(Poly1305* ctx, byte* mac);
#endif
#endif /* WOLFSSL_ARMASM */
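The effect of the aliases above is that portable code can keep calling poly1305_blocks()/poly1305_block() and let the preprocessor pick the aarch64, Thumb-2, or ARM32 symbol. A sketch of such a target-agnostic caller (loop shape assumed, not taken from poly1305.c):

static void poly1305_update_blocks_ref(Poly1305* ctx, const byte* m,
                                       word32 bytes)
{
    while (bytes >= POLY1305_BLOCK_SIZE) {
        /* resolves to poly1305_blocks_aarch64/_thumb2/_arm32 */
        poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
        m     += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }
}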
#if defined(WOLFSSL_RISCV_ASM)
#define poly1305_blocks poly1305_blocks_riscv64