SHA-3, ARM64: add assembly support for crypto instructions

Add the ability to build the ARM assembly implementations as inline assembly in C code, selected at configure time with --enable-armasm=inline.
pull/4833/head
Sean Parkinson 2022-02-07 09:20:49 +10:00
parent 59ea65bad3
commit 0042a2594c
12 changed files with 579 additions and 130 deletions
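For context: each generated .S file in this change gains a _c.c twin that wraps the identical instructions in GCC extended inline assembly, so the routines can be built by the C compiler alone. A minimal sketch of the pattern (illustrative only, not taken from this commit):

    /* AArch64 inline-asm sketch: rotate a 64-bit value left by 1. */
    static inline unsigned long long rotl64_1(unsigned long long x)
    {
        __asm__ __volatile__ (
            "ror %[x], %[x], #63\n\t" /* rotate right by 63 == rotate left by 1 */
            : [x] "+r" (x)            /* read-write register operand */
        );
        return x;
    }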

View File: configure.ac

@@ -1687,31 +1687,38 @@ then
fi
ENABLED_ARMASM_INLINE="no"
ENABLED_ARMASM_SHA3="no"
# ARM Assembly
# Both SHA3 and SHA512 instructions available with ARMV8.2-a
AC_ARG_ENABLE([armasm],
[AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled). Set to sha512-crypto to use SHA512 instructions with Aarch64 CPU.])],
[AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled). Set to sha512-crypto or sha3-crypto to use SHA512 and SHA3 instructions with Aarch64 CPU.])],
[ ENABLED_ARMASM=$enableval ],
[ ENABLED_ARMASM=no ]
)
if test "$ENABLED_ARMASM" != "no" && test "$ENABLED_ASM" = "yes"
then
for v in `echo $ENABLED_ARMASM | tr "," " "`
do
case $v in
yes)
;;
sha512-crypto)
inline)
ENABLED_ARMASM_INLINE=yes
;;
sha512-crypto | sha3-crypto)
case $host_cpu in
*aarch64*)
;;
*)
AC_MSG_ERROR([SHA512 instructions only available on Aarch64 CPU.])
AC_MSG_ERROR([SHA512/SHA3 instructions only available on Aarch64 CPU.])
break;;
esac
ENABLED_ARMASM_SHA512=yes
ENABLED_ARMASM_SHA3=yes
;;
*)
AC_MSG_ERROR([Invalid choice of ARM asm inclusions (yes, sha512-crypto): $ENABLED_ARMASM.])
AC_MSG_ERROR([Invalid choice of ARM asm inclusions (yes, sha512-crypto, sha3-crypto): $ENABLED_ARMASM.])
break;;
esac
done
@@ -1729,11 +1736,15 @@ then
case $host_os in
*darwin*)
# All known Aarch64 Mac computers support SHA-512 instructions
ENABLED_ARMASM_SHA512=yes
ENABLED_ARMASM_SHA3=yes
;;
*)
# +crypto needed for hardware acceleration
AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto"
if test "$ENABLED_ARMASM_SHA3" = "yes"; then
AM_CPPFLAGS="$AM_CPPFLAGS -march=armv8.2-a+crypto+sha3"
else
AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto"
fi
;;
esac
# Include options.h
@@ -1764,9 +1775,9 @@ then
esac
fi
if test "$ENABLED_ARMASM_SHA512" = "yes"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512"
if test "$ENABLED_ARMASM_SHA3" = "yes"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512 -DWOLFSSL_ARMASM_CRYPTO_SHA3"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512 -DWOLFSSL_ARMASM_CRYPTO_SHA3"
fi
# Xilinx hardened crypto
@@ -7389,6 +7400,7 @@ AM_CONDITIONAL([BUILD_SNIFFTEST],[ test "x$ENABLED_SNIFFTEST" = "xyes"])
AM_CONDITIONAL([BUILD_AESGCM],[test "x$ENABLED_AESGCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"])
AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"])
AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"])
AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"])
AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])
@@ -7897,8 +7909,12 @@ echo " * PKCS#12: $ENABLED_PKCS12"
echo " * Cavium Nitrox: $ENABLED_CAVIUM"
echo " * Cavium Octeon (Sync): $ENABLED_OCTEON_SYNC"
echo " * Intel Quick Assist: $ENABLED_INTEL_QA"
if test "$ENABLED_ARMASM_INLINE" = "yes"
then
ENABLED_ARMASM="inline C"
fi
echo " * ARM ASM: $ENABLED_ARMASM"
echo " * ARM ASM SHA512 Crypto $ENABLED_ARMASM_SHA512"
echo " * ARM ASM SHA512/SHA3 Crypto $ENABLED_ARMASM_SHA3"
echo " * AES Key Wrap: $ENABLED_AESKEYWRAP"
echo " * Write duplicate: $ENABLED_WRITEDUP"
echo " * Xilinx Hardware Acc.: $ENABLED_XILINX"
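With this change --enable-armasm accepts a comma-separated list, so builds might look like (illustrative invocations):

    ./configure --enable-armasm=sha3-crypto          # emit Armv8.2-A SHA-512/SHA-3 instructions
    ./configure --enable-armasm=inline,sha3-crypto   # the same, built from the inline-C sources

As the comment in the first hunk notes, the SHA-512 and SHA-3 instructions both arrive with Armv8.2-A, which is why either crypto option sets both ENABLED_ARMASM_SHA512 and ENABLED_ARMASM_SHA3.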

View File: src/include.am

@@ -293,9 +293,14 @@ if BUILD_SHA512
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
endif
endif
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512_asm.S
endif
@@ -303,6 +308,13 @@ endif
if BUILD_SHA3
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha3.c
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S
endif
endif
endif
if BUILD_DH
@@ -479,8 +491,13 @@ if !BUILD_FIPS_CURRENT
if BUILD_SHA512
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
endif
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
if BUILD_INTELASM
@@ -493,6 +510,13 @@ endif !BUILD_FIPS_CURRENT
if !BUILD_FIPS_CURRENT
if BUILD_SHA3
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha3.c
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S
endif
endif
endif
endif !BUILD_FIPS_CURRENT
@@ -653,8 +677,13 @@ if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
else
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519.S
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
endif
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
endif
@@ -672,7 +701,11 @@ if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
else
if BUILD_ARMASM
if BUILD_ARMASM_INLINE
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
endif
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
endif
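Throughout these blocks the new BUILD_ARMASM_INLINE conditional simply swaps each generated .S source for its _c.c twin; both encode the same instructions, with the _c.c form consumable by any GCC-compatible C compiler without a separate assembly step (presumably the motivation for the inline option).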

View File: wolfcrypt/src/include.am

@@ -62,10 +62,6 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
wolfcrypt/src/port/arm/armv8-aes.c \
wolfcrypt/src/port/arm/armv8-sha256.c \
wolfcrypt/src/port/arm/armv8-chacha.c \
wolfcrypt/src/port/arm/armv8-curve25519.c \
wolfcrypt/src/port/arm/armv8-32-curve25519.c \
wolfcrypt/src/port/arm/armv8-sha512-asm.c \
wolfcrypt/src/port/arm/armv8-32-sha512-asm.c \
wolfcrypt/src/port/nxp/ksdk_port.c \
wolfcrypt/src/port/nxp/dcp_port.c \
wolfcrypt/src/port/nxp/se050_port.c \

View File: wolfcrypt/src/port/arm/armv8-32-curve25519_c.c

@@ -68,10 +68,10 @@ void fe_frombytes(fe out, const unsigned char* in)
void fe_tobytes(unsigned char* out, const fe n)
{
__asm__ __volatile__ (
"ldrd r2, r3, [%[in]]\n\t"
"ldrd r12, lr, [%[in], #8]\n\t"
"ldrd r4, r5, [%[in], #16]\n\t"
"ldrd r6, r7, [%[in], #24]\n\t"
"ldrd r2, r3, [%[n]]\n\t"
"ldrd r12, lr, [%[n], #8]\n\t"
"ldrd r4, r5, [%[n], #16]\n\t"
"ldrd r6, r7, [%[n], #24]\n\t"
"adds r8, r2, #19\n\t"
"adcs r8, r3, #0\n\t"
"adcs r8, r12, #0\n\t"
@@ -3861,23 +3861,26 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con
"str %[ry], [sp, #4]\n\t"
"str %[rz], [sp, #8]\n\t"
"str %[px], [sp, #12]\n\t"
"ldr r2, [sp, #32]\n\t"
"ldr r2, [sp, #28]\n\t"
"ldr r1, [sp, #12]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #28]\n\t"
"ldr r1, [sp, #24]\n\t"
"ldr r2, [sp, #24]\n\t"
"ldr r1, [sp, #20]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #32]\n\t"
"ldr r1, [sp, #28]\n\t"
"ldr r2, [sp, #28]\n\t"
"ldr r1, [sp, #24]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"add sp, sp, #16\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px)
:
: "memory", "lr"
);
(void)py;
(void)pz;
(void)pt;
}
void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
@@ -3888,27 +3891,31 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe
"str %[ry], [sp, #4]\n\t"
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r2, [sp, #36]\n\t"
"ldr r1, [sp, #24]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #32]\n\t"
"ldr r1, [sp, #28]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #36]\n\t"
"ldr r1, [sp, #32]\n\t"
"ldr r0, [sp, #8]\n\t"
"ldr r1, [sp, #20]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #28]\n\t"
"ldr r1, [sp, #24]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #32]\n\t"
"ldr r1, [sp, #28]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #24]\n\t"
"ldr r1, [sp, #20]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_mul\n\t"
"add sp, sp, #16\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "lr"
);
(void)px;
(void)py;
(void)pz;
(void)pt;
}
void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
@@ -3919,15 +3926,15 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"str %[ry], [sp, #4]\n\t"
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r1, [sp, #88]\n\t"
"ldr r1, [sp, #52]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_sq\n\t"
"ldr r1, [sp, #92]\n\t"
"ldr r1, [sp, #56]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_sq\n\t"
"ldr r0, [sp, #4]\n\t"
"ldr r1, [sp, #88]\n\t"
"ldr r2, [sp, #92]\n\t"
"ldr r1, [sp, #52]\n\t"
"ldr r2, [sp, #56]\n\t"
/* Add */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4114,7 +4121,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r5, r6, [r0, #8]\n\t"
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r1, [sp, #96]\n\t"
"ldr r1, [sp, #60]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_sq2\n\t"
"ldr r0, [sp, #12]\n\t"
@@ -4159,10 +4166,13 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"add sp, sp, #16\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
(void)px;
(void)py;
(void)pz;
}
void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
@@ -4174,8 +4184,8 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r0, [sp]\n\t"
"ldr r1, [sp, #108]\n\t"
"ldr r2, [sp, #104]\n\t"
"ldr r1, [sp, #72]\n\t"
"ldr r2, [sp, #68]\n\t"
/* Add */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4216,8 +4226,8 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r0, [sp, #4]\n\t"
"ldr r1, [sp, #108]\n\t"
"ldr r2, [sp, #104]\n\t"
"ldr r1, [sp, #72]\n\t"
"ldr r2, [sp, #68]\n\t"
/* Sub */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4257,16 +4267,16 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"strd r5, r6, [r0, #8]\n\t"
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r2, [sp, #124]\n\t"
"ldr r2, [sp, #88]\n\t"
"ldr r1, [sp]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #128]\n\t"
"ldr r2, [sp, #92]\n\t"
"ldr r1, [sp, #4]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #116]\n\t"
"ldr r1, [sp, #120]\n\t"
"ldr r2, [sp, #80]\n\t"
"ldr r1, [sp, #84]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_mul\n\t"
"ldr r0, [sp, #4]\n\t"
@@ -4372,7 +4382,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"ldr r0, [sp, #8]\n\t"
"ldr r1, [sp, #112]\n\t"
"ldr r1, [sp, #76]\n\t"
/* Double */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4506,10 +4516,14 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"add sp, sp, #32\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
(void)px;
(void)py;
(void)pz;
(void)pt;
(void)qxy2d;
(void)qyplusx;
(void)qyminusx;
@@ -4524,8 +4538,8 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r0, [sp]\n\t"
"ldr r1, [sp, #108]\n\t"
"ldr r2, [sp, #104]\n\t"
"ldr r1, [sp, #72]\n\t"
"ldr r2, [sp, #68]\n\t"
/* Add */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4566,8 +4580,8 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r0, [sp, #4]\n\t"
"ldr r1, [sp, #108]\n\t"
"ldr r2, [sp, #104]\n\t"
"ldr r1, [sp, #72]\n\t"
"ldr r2, [sp, #68]\n\t"
/* Sub */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4607,16 +4621,16 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"strd r5, r6, [r0, #8]\n\t"
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r2, [sp, #128]\n\t"
"ldr r2, [sp, #92]\n\t"
"ldr r1, [sp]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #124]\n\t"
"ldr r2, [sp, #88]\n\t"
"ldr r1, [sp, #4]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #116]\n\t"
"ldr r1, [sp, #120]\n\t"
"ldr r2, [sp, #80]\n\t"
"ldr r1, [sp, #84]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_mul\n\t"
"ldr r0, [sp, #4]\n\t"
@@ -4722,7 +4736,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"ldr r0, [sp, #8]\n\t"
"ldr r1, [sp, #112]\n\t"
"ldr r1, [sp, #76]\n\t"
/* Double */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4856,10 +4870,14 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"add sp, sp, #32\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
(void)px;
(void)py;
(void)pz;
(void)pt;
(void)qxy2d;
(void)qyplusx;
(void)qyminusx;
@@ -4874,8 +4892,8 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r0, [sp]\n\t"
"ldr r1, [sp, #172]\n\t"
"ldr r2, [sp, #168]\n\t"
"ldr r1, [sp, #136]\n\t"
"ldr r2, [sp, #132]\n\t"
/* Add */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4916,8 +4934,8 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r0, [sp, #4]\n\t"
"ldr r1, [sp, #172]\n\t"
"ldr r2, [sp, #168]\n\t"
"ldr r1, [sp, #136]\n\t"
"ldr r2, [sp, #132]\n\t"
/* Sub */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -4957,20 +4975,20 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r5, r6, [r0, #8]\n\t"
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r2, [sp, #192]\n\t"
"ldr r2, [sp, #156]\n\t"
"ldr r1, [sp]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #196]\n\t"
"ldr r2, [sp, #160]\n\t"
"ldr r1, [sp, #4]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #180]\n\t"
"ldr r1, [sp, #188]\n\t"
"ldr r2, [sp, #144]\n\t"
"ldr r1, [sp, #152]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #184]\n\t"
"ldr r1, [sp, #176]\n\t"
"ldr r2, [sp, #148]\n\t"
"ldr r1, [sp, #140]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_mul\n\t"
"add r0, sp, #16\n\t"
@@ -5211,10 +5229,14 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"add sp, sp, #0x60\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
(void)px;
(void)py;
(void)pz;
(void)pt;
(void)qz;
(void)qt2d;
(void)qyplusx;
@@ -5230,8 +5252,8 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"str %[rz], [sp, #8]\n\t"
"str %[rt], [sp, #12]\n\t"
"ldr r0, [sp]\n\t"
"ldr r1, [sp, #172]\n\t"
"ldr r2, [sp, #168]\n\t"
"ldr r1, [sp, #136]\n\t"
"ldr r2, [sp, #132]\n\t"
/* Add */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -5272,8 +5294,8 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r0, [sp, #4]\n\t"
"ldr r1, [sp, #172]\n\t"
"ldr r2, [sp, #168]\n\t"
"ldr r1, [sp, #136]\n\t"
"ldr r2, [sp, #132]\n\t"
/* Sub */
"ldrd %[rt], r4, [r1]\n\t"
"ldrd r5, r6, [r1, #8]\n\t"
@@ -5313,20 +5335,20 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"strd r5, r6, [r0, #8]\n\t"
"strd r7, r8, [r0, #16]\n\t"
"strd r9, r10, [r0, #24]\n\t"
"ldr r2, [sp, #196]\n\t"
"ldr r2, [sp, #160]\n\t"
"ldr r1, [sp]\n\t"
"ldr r0, [sp, #8]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #192]\n\t"
"ldr r2, [sp, #156]\n\t"
"ldr r1, [sp, #4]\n\t"
"ldr r0, [sp, #4]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #180]\n\t"
"ldr r1, [sp, #188]\n\t"
"ldr r2, [sp, #144]\n\t"
"ldr r1, [sp, #152]\n\t"
"ldr r0, [sp, #12]\n\t"
"bl fe_mul\n\t"
"ldr r2, [sp, #184]\n\t"
"ldr r1, [sp, #176]\n\t"
"ldr r2, [sp, #148]\n\t"
"ldr r1, [sp, #140]\n\t"
"ldr r0, [sp]\n\t"
"bl fe_mul\n\t"
"add r0, sp, #16\n\t"
@@ -5567,10 +5589,14 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
"adc r10, r10, lr\n\t"
"strd r9, r10, [r1, #24]\n\t"
"add sp, sp, #0x60\n\t"
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
: [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
:
: "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
);
(void)px;
(void)py;
(void)pz;
(void)pt;
(void)qz;
(void)qt2d;
(void)qyplusx;
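The same change repeats through all of the curve25519 hunks above: pointer parameters that the assembly only ever reads from the caller's stack are dropped from the asm operand lists, with (void) casts added to quiet unused-parameter warnings, and fe_tobytes renames its operand from the stale %[in] to %[n] to match the actual parameter name. With fewer operands pinned in registers, the compiler's prologue saves fewer registers, which appears to be why the sp-relative offsets of the stack-passed arguments shrink (e.g. #108 becoming #72).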

View File: wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c

@@ -120,6 +120,7 @@ static const uint64_t L_SHA512_transform_len_k[] = {
0x6c44198c4a475817UL,
};
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
{
__asm__ __volatile__ (
@@ -3650,6 +3651,7 @@ static const uint64_t L_SHA512_transform_neon_len_k[] = {
0x6c44198c4a475817UL,
};
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
{
__asm__ __volatile__ (
@@ -4773,7 +4775,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
"subs %[len], %[len], #0x80\n\t"
"bne L_sha512_len_neon_begin_%=\n\t"
: [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
: [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k), [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
: [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
: "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
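Two small fixes in this file: each Transform_Sha512_Len variant gains a preceding prototype (presumably to quiet missing-prototype warnings), and the NEON variant's input list drops L_SHA512_transform_len_k, the non-NEON table that this asm body never references.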

View File: wolfcrypt/src/port/arm/armv8-sha3-asm.S (new file)

@@ -0,0 +1,215 @@
/* armv8-sha3-asm
*
* Copyright (C) 2006-2021 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha3/sha3.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha3-asm.S
*/
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef WOLFSSL_SHA3
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
#ifndef __APPLE__
.text
.type L_SHA3_transform_crypto_r, %object
.section .rodata
.size L_SHA3_transform_crypto_r, 192
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 3
#else
.p2align 3
#endif /* __APPLE__ */
L_SHA3_transform_crypto_r:
.xword 0x1
.xword 0x8082
.xword 0x800000000000808a
.xword 0x8000000080008000
.xword 0x808b
.xword 0x80000001
.xword 0x8000000080008081
.xword 0x8000000000008009
.xword 0x8a
.xword 0x88
.xword 0x80008009
.xword 0x8000000a
.xword 0x8000808b
.xword 0x800000000000008b
.xword 0x8000000000008089
.xword 0x8000000000008003
.xword 0x8000000000008002
.xword 0x8000000000000080
.xword 0x800a
.xword 0x800000008000000a
.xword 0x8000000080008081
.xword 0x8000000000008080
.xword 0x80000001
.xword 0x8000000080008008
#ifndef __APPLE__
.text
.globl BlockSha3
.type BlockSha3,@function
.align 2
BlockSha3:
#else
.section __TEXT,__text
.globl _BlockSha3
.p2align 2
_BlockSha3:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
#ifdef __APPLE__
.arch_extension sha3
#endif /* __APPLE__ */
#ifndef __APPLE__
adrp x1, L_SHA3_transform_crypto_r
add x1, x1, :lo12:L_SHA3_transform_crypto_r
#else
adrp x1, L_SHA3_transform_crypto_r@PAGE
add x1, x1, :lo12:L_SHA3_transform_crypto_r@PAGEOFF
#endif /* __APPLE__ */
ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
ld1 {v24.1d}, [x0]
sub x0, x0, #0xc0
mov x2, #24
# Start of 24 rounds
L_sha3_crypto_begin:
# Col Mix
eor3 v31.16b, v0.16b, v5.16b, v10.16b
eor3 v27.16b, v1.16b, v6.16b, v11.16b
eor3 v28.16b, v2.16b, v7.16b, v12.16b
eor3 v29.16b, v3.16b, v8.16b, v13.16b
eor3 v30.16b, v4.16b, v9.16b, v14.16b
eor3 v31.16b, v31.16b, v15.16b, v20.16b
eor3 v27.16b, v27.16b, v16.16b, v21.16b
eor3 v28.16b, v28.16b, v17.16b, v22.16b
eor3 v29.16b, v29.16b, v18.16b, v23.16b
eor3 v30.16b, v30.16b, v19.16b, v24.16b
rax1 v25.2d, v30.2d, v27.2d
rax1 v26.2d, v31.2d, v28.2d
rax1 v27.2d, v27.2d, v29.2d
rax1 v28.2d, v28.2d, v30.2d
rax1 v29.2d, v29.2d, v31.2d
eor v0.16b, v0.16b, v25.16b
xar v30.2d, v1.2d, v26.2d, #63
xar v1.2d, v6.2d, v26.2d, #20
xar v6.2d, v9.2d, v29.2d, #44
xar v9.2d, v22.2d, v27.2d, #3
xar v22.2d, v14.2d, v29.2d, #25
xar v14.2d, v20.2d, v25.2d, #46
xar v20.2d, v2.2d, v27.2d, #2
xar v2.2d, v12.2d, v27.2d, #21
xar v12.2d, v13.2d, v28.2d, #39
xar v13.2d, v19.2d, v29.2d, #56
xar v19.2d, v23.2d, v28.2d, #8
xar v23.2d, v15.2d, v25.2d, #23
xar v15.2d, v4.2d, v29.2d, #37
xar v4.2d, v24.2d, v29.2d, #50
xar v24.2d, v21.2d, v26.2d, #62
xar v21.2d, v8.2d, v28.2d, #9
xar v8.2d, v16.2d, v26.2d, #19
xar v16.2d, v5.2d, v25.2d, #28
xar v5.2d, v3.2d, v28.2d, #36
xar v3.2d, v18.2d, v28.2d, #43
xar v18.2d, v17.2d, v27.2d, #49
xar v17.2d, v11.2d, v26.2d, #54
xar v11.2d, v7.2d, v27.2d, #58
xar v7.2d, v10.2d, v25.2d, #61
# Row Mix
mov v25.16b, v0.16b
mov v26.16b, v1.16b
bcax v0.16b, v25.16b, v2.16b, v26.16b
bcax v1.16b, v26.16b, v3.16b, v2.16b
bcax v2.16b, v2.16b, v4.16b, v3.16b
bcax v3.16b, v3.16b, v25.16b, v4.16b
bcax v4.16b, v4.16b, v26.16b, v25.16b
mov v25.16b, v5.16b
mov v26.16b, v6.16b
bcax v5.16b, v25.16b, v7.16b, v26.16b
bcax v6.16b, v26.16b, v8.16b, v7.16b
bcax v7.16b, v7.16b, v9.16b, v8.16b
bcax v8.16b, v8.16b, v25.16b, v9.16b
bcax v9.16b, v9.16b, v26.16b, v25.16b
mov v26.16b, v11.16b
bcax v10.16b, v30.16b, v12.16b, v26.16b
bcax v11.16b, v26.16b, v13.16b, v12.16b
bcax v12.16b, v12.16b, v14.16b, v13.16b
bcax v13.16b, v13.16b, v30.16b, v14.16b
bcax v14.16b, v14.16b, v26.16b, v30.16b
mov v25.16b, v15.16b
mov v26.16b, v16.16b
bcax v15.16b, v25.16b, v17.16b, v26.16b
bcax v16.16b, v26.16b, v18.16b, v17.16b
bcax v17.16b, v17.16b, v19.16b, v18.16b
bcax v18.16b, v18.16b, v25.16b, v19.16b
bcax v19.16b, v19.16b, v26.16b, v25.16b
mov v25.16b, v20.16b
mov v26.16b, v21.16b
bcax v20.16b, v25.16b, v22.16b, v26.16b
bcax v21.16b, v26.16b, v23.16b, v22.16b
bcax v22.16b, v22.16b, v24.16b, v23.16b
bcax v23.16b, v23.16b, v25.16b, v24.16b
bcax v24.16b, v24.16b, v26.16b, v25.16b
ld1r {v30.2d}, [x1], #8
subs x2, x2, #1
eor v0.16b, v0.16b, v30.16b
bne L_sha3_crypto_begin
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
st1 {v24.1d}, [x0]
ldp d8, d9, [x29, #16]
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size BlockSha3,.-BlockSha3
#endif /* __APPLE__ */
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
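BlockSha3, defined both here and in the inline-C file that follows, permutes the 25-lane Keccak state in place through all 24 rounds (lanes 0-23 travel in lane 0 of v0-v23, lane 24 in v24). A hypothetical caller, assuming a build with WOLFSSL_ARMASM_CRYPTO_SHA3 defined:

    #include <wolfssl/wolfcrypt/sha3.h>

    /* Illustrative only: absorb one SHA3-256 rate block (136 bytes = 17
     * lanes), then permute. wolfSSL's Sha3Update does this internally. */
    static void absorb_block_256(word64 state[25], const word64 lanes[17])
    {
        int i;
        for (i = 0; i < 17; i++)
            state[i] ^= lanes[i];
        BlockSha3(state); /* 24 rounds of Keccak-f[1600] */
    }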

View File: wolfcrypt/src/port/arm/armv8-sha3-asm_c.c (new file)

@@ -0,0 +1,185 @@
/* armv8-sha3-asm
*
* Copyright (C) 2006-2021 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>
/* Generated using (from wolfssl):
* cd ../scripts
* ruby ./sha3/sha3.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha3-asm.c
*/
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#include <wolfssl/wolfcrypt/sha3.h>
#ifdef WOLFSSL_SHA3
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
static const uint64_t L_SHA3_transform_crypto_r[] = {
0x1UL,
0x8082UL,
0x800000000000808aUL,
0x8000000080008000UL,
0x808bUL,
0x80000001UL,
0x8000000080008081UL,
0x8000000000008009UL,
0x8aUL,
0x88UL,
0x80008009UL,
0x8000000aUL,
0x8000808bUL,
0x800000000000008bUL,
0x8000000000008089UL,
0x8000000000008003UL,
0x8000000000008002UL,
0x8000000000000080UL,
0x800aUL,
0x800000008000000aUL,
0x8000000080008081UL,
0x8000000000008080UL,
0x80000001UL,
0x8000000080008008UL,
};
void BlockSha3(unsigned long* state)
{
__asm__ __volatile__ (
#ifdef __APPLE__
".arch_extension sha3\n\t"
#endif /* __APPLE__ */
#ifndef __APPLE__
"adrp x1, %[L_SHA3_transform_crypto_r]\n\t"
"add x1, x1, :lo12:%[L_SHA3_transform_crypto_r]\n\t"
#else
"adrp x1, %[L_SHA3_transform_crypto_r]@PAGE\n\t"
"add x1, x1, %[L_SHA3_transform_crypto_r]@PAGEOFF\n\t"
#endif /* __APPLE__ */
"ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
"ld4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t"
"ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t"
"ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t"
"ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t"
"ld1 {v24.1d}, [%x[state]]\n\t"
"sub %x[state], %x[state], #0xc0\n\t"
"mov x2, #24\n\t"
/* Start of 24 rounds */
"\n"
"L_sha3_crypto_begin_%=: \n\t"
/* Col Mix */
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t"
"eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t"
"eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t"
"eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t"
"eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t"
"eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t"
"eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t"
"eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t"
"eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t"
"eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t"
"rax1 v25.2d, v30.2d, v27.2d\n\t"
"rax1 v26.2d, v31.2d, v28.2d\n\t"
"rax1 v27.2d, v27.2d, v29.2d\n\t"
"rax1 v28.2d, v28.2d, v30.2d\n\t"
"rax1 v29.2d, v29.2d, v31.2d\n\t"
"eor v0.16b, v0.16b, v25.16b\n\t"
"xar v30.2d, v1.2d, v26.2d, #63\n\t"
"xar v1.2d, v6.2d, v26.2d, #20\n\t"
"xar v6.2d, v9.2d, v29.2d, #44\n\t"
"xar v9.2d, v22.2d, v27.2d, #3\n\t"
"xar v22.2d, v14.2d, v29.2d, #25\n\t"
"xar v14.2d, v20.2d, v25.2d, #46\n\t"
"xar v20.2d, v2.2d, v27.2d, #2\n\t"
"xar v2.2d, v12.2d, v27.2d, #21\n\t"
"xar v12.2d, v13.2d, v28.2d, #39\n\t"
"xar v13.2d, v19.2d, v29.2d, #56\n\t"
"xar v19.2d, v23.2d, v28.2d, #8\n\t"
"xar v23.2d, v15.2d, v25.2d, #23\n\t"
"xar v15.2d, v4.2d, v29.2d, #37\n\t"
"xar v4.2d, v24.2d, v29.2d, #50\n\t"
"xar v24.2d, v21.2d, v26.2d, #62\n\t"
"xar v21.2d, v8.2d, v28.2d, #9\n\t"
"xar v8.2d, v16.2d, v26.2d, #19\n\t"
"xar v16.2d, v5.2d, v25.2d, #28\n\t"
"xar v5.2d, v3.2d, v28.2d, #36\n\t"
"xar v3.2d, v18.2d, v28.2d, #43\n\t"
"xar v18.2d, v17.2d, v27.2d, #49\n\t"
"xar v17.2d, v11.2d, v26.2d, #54\n\t"
"xar v11.2d, v7.2d, v27.2d, #58\n\t"
"xar v7.2d, v10.2d, v25.2d, #61\n\t"
/* Row Mix */
"mov v25.16b, v0.16b\n\t"
"mov v26.16b, v1.16b\n\t"
"bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t"
"bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t"
"bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t"
"bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t"
"bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t"
"mov v25.16b, v5.16b\n\t"
"mov v26.16b, v6.16b\n\t"
"bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t"
"bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t"
"bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t"
"bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t"
"bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t"
"mov v26.16b, v11.16b\n\t"
"bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t"
"bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t"
"bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t"
"bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t"
"bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t"
"mov v25.16b, v15.16b\n\t"
"mov v26.16b, v16.16b\n\t"
"bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t"
"bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t"
"bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t"
"bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t"
"bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t"
"mov v25.16b, v20.16b\n\t"
"mov v26.16b, v21.16b\n\t"
"bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t"
"bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t"
"bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t"
"bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t"
"bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t"
"ld1r {v30.2d}, [x1], #8\n\t"
"subs x2, x2, #1\n\t"
"eor v0.16b, v0.16b, v30.16b\n\t"
"bne L_sha3_crypto_begin_%=\n\t"
"st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
"st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
"st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t"
"st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t"
"st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t"
"st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t"
"st1 {v24.1d}, [%x[state]]\n\t"
: [state] "+r" (state)
: [L_SHA3_transform_crypto_r] "S" (L_SHA3_transform_crypto_r)
: "memory", "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
}
#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
#endif /* WOLFSSL_SHA3 */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */
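Note the "S" constraint on L_SHA3_transform_crypto_r: on AArch64 this passes the symbol itself rather than a loaded value, which is what lets the adrp/add pair (or the @PAGE/@PAGEOFF forms on Apple toolchains) materialize the round-constant table's address inside the asm body.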

View File: wolfcrypt/src/port/arm/armv8-sha512-asm.S

@@ -1201,20 +1201,12 @@ Transform_Sha512_Len_crypto:
.p2align 2
_Transform_Sha512_Len_crypto:
#endif /* __APPLE__ */
stp x29, x30, [sp, #-208]!
stp x29, x30, [sp, #-80]!
add x29, sp, #0
stp d8, d9, [x29, #16]
stp d10, d11, [x29, #32]
stp d12, d13, [x29, #48]
stp d14, d15, [x29, #64]
stp d16, d17, [x29, #80]
stp d18, d19, [x29, #96]
stp d20, d21, [x29, #112]
stp d22, d23, [x29, #128]
stp d24, d25, [x29, #144]
stp d26, d27, [x29, #160]
stp d28, d29, [x29, #176]
stp d30, d31, [x29, #192]
#ifdef __APPLE__
.arch_extension sha3
#endif /* __APPLE__ */
@@ -1734,15 +1726,7 @@ L_sha512_len_crypto_begin:
ldp d10, d11, [x29, #32]
ldp d12, d13, [x29, #48]
ldp d14, d15, [x29, #64]
ldp d16, d17, [x29, #80]
ldp d18, d19, [x29, #96]
ldp d20, d21, [x29, #112]
ldp d22, d23, [x29, #128]
ldp d24, d25, [x29, #144]
ldp d26, d27, [x29, #160]
ldp d28, d29, [x29, #176]
ldp d30, d31, [x29, #192]
ldp x29, x30, [sp], #0xd0
ldp x29, x30, [sp], #0x50
ret
#ifndef __APPLE__
.size Transform_Sha512_Len_crypto,.-Transform_Sha512_Len_crypto
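The trimmed prologue/epilogue follows from the AAPCS64 calling convention: only d8-d15 are callee-saved, so spilling d16-d31 was unnecessary and the stack frame shrinks from 208 to 80 bytes.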

View File: wolfcrypt/src/sha3.c

@@ -51,6 +51,7 @@
#endif
#ifndef WOLFSSL_ARMASM
#ifdef WOLFSSL_SHA3_SMALL
/* Rotate a 64-bit value left.
*
@@ -137,8 +138,7 @@ static const word64 hash_keccak_r[24] =
* i The index of the loop.
*/
#define SWAP_ROTL(s, t1, t2, i) \
do \
{ \
do { \
t2 = s[K_I_##i]; s[K_I_##i] = ROTL64(t1, K_R_##i); \
} \
while (0)
@@ -151,12 +151,10 @@ while (0)
* t Temporary variable.
*/
#define COL_MIX(s, b, x, t) \
do \
{ \
do { \
for (x = 0; x < 5; x++) \
b[x] = s[x + 0] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; \
for (x = 0; x < 5; x++) \
{ \
for (x = 0; x < 5; x++) { \
t = b[(x + 4) % 5] ^ ROTL64(b[(x + 1) % 5], 1); \
s[x + 0] ^= t; \
s[x + 5] ^= t; \
@@ -179,14 +177,12 @@ while (0)
* t1 Temporary variable.
*/
#define ROW_MIX(s, b, y, x, t0, t1) \
do \
{ \
for (y = 0; y < 5; y++) \
{ \
do { \
for (y = 0; y < 5; y++) { \
for (x = 0; x < 5; x++) \
b[x] = s[y * 5 + x]; \
for (x = 0; x < 5; x++) \
s[y * 5 + x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]); \
s[y * 5 + x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]); \
} \
} \
while (0)
@@ -202,10 +198,8 @@ while (0)
* t1 Temporary variable.
*/
#define ROW_MIX(s, b, y, x, t12, t34) \
do \
{ \
for (y = 0; y < 5; y++) \
{ \
do { \
for (y = 0; y < 5; y++) { \
for (x = 0; x < 5; x++) \
b[x] = s[y * 5 + x]; \
t12 = (b[1] ^ b[2]); t34 = (b[3] ^ b[4]); \
@@ -351,8 +345,7 @@ static const word64 hash_keccak_r[24] =
* t Temporary variable.
*/
#define COL_MIX(s, b, x, t) \
do \
{ \
do { \
(b)[0] = (s)[0] ^ (s)[5] ^ (s)[10] ^ (s)[15] ^ (s)[20]; \
(b)[1] = (s)[1] ^ (s)[6] ^ (s)[11] ^ (s)[16] ^ (s)[21]; \
(b)[2] = (s)[2] ^ (s)[7] ^ (s)[12] ^ (s)[17] ^ (s)[22]; \
@@ -384,8 +377,7 @@ while (0)
* t1 Temporary variable. (Unused)
*/
#define ROW_MIX(s2, s1, b, t0, t1) \
do \
{ \
do { \
(b)[0] = (s1)[0]; \
(b)[1] = S((s1), 0); \
(b)[2] = S((s1), 1); \
@@ -449,8 +441,7 @@ while (0)
* t34 Temporary variable.
*/
#define ROW_MIX(s2, s1, b, t12, t34) \
do \
{ \
do { \
(b)[0] = (s1)[0]; \
(b)[1] = S((s1), 0); \
(b)[2] = S((s1), 1); \
@@ -536,6 +527,7 @@ static void BlockSha3(word64 *s)
}
}
#endif /* WOLFSSL_SHA3_SMALL */
#endif /* !WOLFSSL_ARMASM */
static WC_INLINE word64 Load64Unaligned(const unsigned char *a)
{
@@ -618,8 +610,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
byte l;
byte *t;
if (sha3->i > 0)
{
if (sha3->i > 0) {
l = p * 8 - sha3->i;
if (l > len) {
l = (byte)len;
@@ -632,16 +623,14 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
len -= i;
sha3->i += i;
if (sha3->i == p * 8)
{
if (sha3->i == p * 8) {
for (i = 0; i < p; i++)
sha3->s[i] ^= Load64BitBigEndian(sha3->t + 8 * i);
BlockSha3(sha3->s);
sha3->i = 0;
}
}
while (len >= ((word32)(p * 8)))
{
while (len >= ((word32)(p * 8))) {
for (i = 0; i < p; i++)
sha3->s[i] ^= Load64Unaligned(data + 8 * i);
BlockSha3(sha3->s);
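With the new #ifndef WOLFSSL_ARMASM guard, the portable C BlockSha3 and its Keccak macros drop out of ARM assembly builds entirely; Sha3Update still calls BlockSha3, but the symbol now resolves to the assembly implementation declared in sha3.h below. The remaining hunks in this file are brace-style cleanup with no functional change.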

View File: wolfssl/wolfcrypt/sha3.h

@@ -128,6 +128,9 @@ struct wc_Sha3 {
typedef wc_Sha3 wc_Shake;
#endif
#ifdef WOLFSSL_ARMASM
WOLFSSL_LOCAL void BlockSha3(word64 *s);
#endif
WOLFSSL_API int wc_InitSha3_224(wc_Sha3* sha3, void* heap, int devId);
WOLFSSL_API int wc_Sha3_224_Update(wc_Sha3* sha3, const byte* data, word32 len);