diff --git a/configure.ac b/configure.ac
index eca24b329..f80e93319 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1763,31 +1763,38 @@ then
 fi
 
+ENABLED_ARMASM_INLINE="no"
+ENABLED_ARMASM_SHA3="no"
 # ARM Assembly
+# Both SHA3 and SHA512 instructions available with ARMv8.2-A
 AC_ARG_ENABLE([armasm],
-    [AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled). Set to sha512-crypto to use SHA512 instructions with Aarch64 CPU.])],
+    [AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled). Set to sha512-crypto or sha3-crypto to use SHA512 and SHA3 instructions with Aarch64 CPU.])],
    [ ENABLED_ARMASM=$enableval ],
    [ ENABLED_ARMASM=no ]
  )
 if test "$ENABLED_ARMASM" != "no" && test "$ENABLED_ASM" = "yes"
 then
+
   for v in `echo $ENABLED_ARMASM | tr "," " "`
   do
     case $v in
       yes) ;;
-      sha512-crypto)
+      inline)
+        ENABLED_ARMASM_INLINE=yes
+        ;;
+      sha512-crypto | sha3-crypto)
         case $host_cpu in
           *aarch64*) ;;
           *)
-            AC_MSG_ERROR([SHA512 instructions only available on Aarch64 CPU.])
+            AC_MSG_ERROR([SHA512/SHA3 instructions only available on Aarch64 CPU.])
             break;;
         esac
-        ENABLED_ARMASM_SHA512=yes
+        ENABLED_ARMASM_SHA3=yes
         ;;
       *)
-        AC_MSG_ERROR([Invalid choice of ARM asm inclusions (yes, inline, sha512-crypto): $ENABLED_ARMASM.])
+        AC_MSG_ERROR([Invalid choice of ARM asm inclusions (yes, inline, sha512-crypto, sha3-crypto): $ENABLED_ARMASM.])
         break;;
     esac
   done
@@ -1805,11 +1812,15 @@ then
   case $host_os in
     *darwin*)
       # All known Aarch64 Mac computers support SHA-512 instructions
-      ENABLED_ARMASM_SHA512=yes
+      ENABLED_ARMASM_SHA3=yes
       ;;
     *)
       # +crypto needed for hardware acceleration
-      AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto"
+      if test "$ENABLED_ARMASM_SHA3" = "yes"; then
+        AM_CPPFLAGS="$AM_CPPFLAGS -march=armv8.2-a+crypto+sha3"
+      else
+        AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto"
+      fi
       ;;
   esac
   # Include options.h
@@ -1840,9 +1851,9 @@ then
   esac
 fi
 
-if test "$ENABLED_ARMASM_SHA512" = "yes"; then
-  AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512"
-  AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512"
+if test "$ENABLED_ARMASM_SHA3" = "yes"; then
+  AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512 -DWOLFSSL_ARMASM_CRYPTO_SHA3"
+  AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512 -DWOLFSSL_ARMASM_CRYPTO_SHA3"
 fi
 
 # Xilinx hardened crypto
@@ -7465,6 +7476,7 @@ AM_CONDITIONAL([BUILD_SNIFFTEST],[ test "x$ENABLED_SNIFFTEST" = "xyes"])
 AM_CONDITIONAL([BUILD_AESGCM],[test "x$ENABLED_AESGCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"])
 AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"])
 AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"])
+AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"])
 AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
 AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
 AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])
@@ -7974,8 +7986,12 @@ echo " * PKCS#12: $ENABLED_PKCS12"
 echo " * Cavium Nitrox: $ENABLED_CAVIUM"
 echo " * Cavium Octeon (Sync): $ENABLED_OCTEON_SYNC"
 echo " * Intel Quick Assist: $ENABLED_INTEL_QA"
+if test "$ENABLED_ARMASM_INLINE" = "yes"
+then
+  ENABLED_ARMASM="inline C"
+fi
 echo " * ARM ASM: $ENABLED_ARMASM"
-echo " * ARM ASM SHA512 Crypto $ENABLED_ARMASM_SHA512"
+echo " * ARM ASM SHA512/SHA3 Crypto: $ENABLED_ARMASM_SHA3"
 echo " * AES Key Wrap: $ENABLED_AESKEYWRAP"
 echo " * Write duplicate: $ENABLED_WRITEDUP"
 echo " * Xilinx Hardware Acc.: $ENABLED_XILINX"
diff --git a/src/include.am b/src/include.am
index 28ad1c373..cfb7e7d14 100644
--- a/src/include.am
+++ b/src/include.am
@@ -293,9 +293,14 @@ if BUILD_SHA512
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
 if BUILD_ARMASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
 endif
+endif
 if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512_asm.S
 endif
@@ -303,6 +308,13 @@ endif
 
 if BUILD_SHA3
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha3.c
+if BUILD_ARMASM
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
+else
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S
+endif
+endif
 endif
 
 if BUILD_DH
@@ -479,8 +491,13 @@ if !BUILD_FIPS_CURRENT
 if BUILD_SHA512
 if BUILD_ARMASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
+endif
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
 if BUILD_INTELASM
@@ -493,6 +510,13 @@ endif !BUILD_FIPS_CURRENT
 
 if !BUILD_FIPS_CURRENT
 if BUILD_SHA3
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha3.c
+if BUILD_ARMASM
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
+else
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S
+endif
+endif
 endif
 endif !BUILD_FIPS_CURRENT
@@ -653,8 +677,13 @@ if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
 else
 if BUILD_ARMASM
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
+endif
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
 endif
@@ -672,7 +701,11 @@ if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
 else
 if BUILD_ARMASM
+if BUILD_ARMASM_INLINE
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
+endif
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
 endif
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index 593374121..14d7c01da 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -62,10 +62,6 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
               wolfcrypt/src/port/arm/armv8-aes.c \
               wolfcrypt/src/port/arm/armv8-sha256.c \
               wolfcrypt/src/port/arm/armv8-chacha.c \
-              wolfcrypt/src/port/arm/armv8-curve25519.c \
-              wolfcrypt/src/port/arm/armv8-32-curve25519.c \
-              wolfcrypt/src/port/arm/armv8-sha512-asm.c \
-              wolfcrypt/src/port/arm/armv8-32-sha512-asm.c \
               wolfcrypt/src/port/nxp/ksdk_port.c \
               wolfcrypt/src/port/nxp/dcp_port.c \
               wolfcrypt/src/port/nxp/se050_port.c \
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.c
b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c similarity index 98% rename from wolfcrypt/src/port/arm/armv8-32-curve25519.c rename to wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 1ff277f50..813ec21f8 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -68,10 +68,10 @@ void fe_frombytes(fe out, const unsigned char* in) void fe_tobytes(unsigned char* out, const fe n) { __asm__ __volatile__ ( - "ldrd r2, r3, [%[in]]\n\t" - "ldrd r12, lr, [%[in], #8]\n\t" - "ldrd r4, r5, [%[in], #16]\n\t" - "ldrd r6, r7, [%[in], #24]\n\t" + "ldrd r2, r3, [%[n]]\n\t" + "ldrd r12, lr, [%[n], #8]\n\t" + "ldrd r4, r5, [%[n], #16]\n\t" + "ldrd r6, r7, [%[n], #24]\n\t" "adds r8, r2, #19\n\t" "adcs r8, r3, #0\n\t" "adcs r8, r12, #0\n\t" @@ -3861,23 +3861,26 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con "str %[ry], [sp, #4]\n\t" "str %[rz], [sp, #8]\n\t" "str %[px], [sp, #12]\n\t" - "ldr r2, [sp, #32]\n\t" + "ldr r2, [sp, #28]\n\t" "ldr r1, [sp, #12]\n\t" "ldr r0, [sp]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #24]\n\t" + "ldr r2, [sp, #24]\n\t" + "ldr r1, [sp, #20]\n\t" "ldr r0, [sp, #4]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #28]\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" "ldr r0, [sp, #8]\n\t" "bl fe_mul\n\t" "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px) : : "memory", "lr" ); + (void)py; + (void)pz; + (void)pt; } void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) @@ -3888,27 +3891,31 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe "str %[ry], [sp, #4]\n\t" "str %[rz], [sp, #8]\n\t" "str %[rt], [sp, #12]\n\t" - "ldr r2, [sp, #36]\n\t" - "ldr r1, [sp, #24]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #28]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #36]\n\t" - "ldr r1, [sp, #32]\n\t" - "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #20]\n\t" + "ldr r0, [sp]\n\t" "bl fe_mul\n\t" "ldr r2, [sp, #28]\n\t" "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #24]\n\t" + "ldr r1, [sp, #20]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_mul\n\t" "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : : "memory", "lr" ); + (void)px; + (void)py; + (void)pz; + (void)pt; } void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) @@ -3919,15 +3926,15 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "str %[ry], [sp, #4]\n\t" "str %[rz], [sp, #8]\n\t" "str %[rt], [sp, #12]\n\t" - "ldr r1, [sp, #88]\n\t" + "ldr r1, [sp, #52]\n\t" "ldr r0, [sp]\n\t" "bl fe_sq\n\t" - "ldr r1, [sp, #92]\n\t" + "ldr r1, [sp, #56]\n\t" "ldr r0, [sp, #8]\n\t" "bl fe_sq\n\t" "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #88]\n\t" - "ldr r2, [sp, #92]\n\t" + "ldr r1, [sp, #52]\n\t" + "ldr r2, [sp, #56]\n\t" /* Add */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4114,7 +4121,7 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "strd 
r5, r6, [r0, #8]\n\t" "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" - "ldr r1, [sp, #96]\n\t" + "ldr r1, [sp, #60]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_sq2\n\t" "ldr r0, [sp, #12]\n\t" @@ -4159,10 +4166,13 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); + (void)px; + (void)py; + (void)pz; } void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) @@ -4174,8 +4184,8 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "str %[rz], [sp, #8]\n\t" "str %[rt], [sp, #12]\n\t" "ldr r0, [sp]\n\t" - "ldr r1, [sp, #108]\n\t" - "ldr r2, [sp, #104]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" /* Add */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4216,8 +4226,8 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #108]\n\t" - "ldr r2, [sp, #104]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" /* Sub */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4257,16 +4267,16 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "strd r5, r6, [r0, #8]\n\t" "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" - "ldr r2, [sp, #124]\n\t" + "ldr r2, [sp, #88]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #128]\n\t" + "ldr r2, [sp, #92]\n\t" "ldr r1, [sp, #4]\n\t" "ldr r0, [sp, #4]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #116]\n\t" - "ldr r1, [sp, #120]\n\t" + "ldr r2, [sp, #80]\n\t" + "ldr r1, [sp, #84]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_mul\n\t" "ldr r0, [sp, #4]\n\t" @@ -4372,7 +4382,7 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adc r10, r10, lr\n\t" "strd r9, r10, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #112]\n\t" + "ldr r1, [sp, #76]\n\t" /* Double */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4506,10 +4516,14 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adc r10, r10, lr\n\t" "strd r9, r10, [r1, #24]\n\t" "add sp, sp, #32\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); + (void)px; + (void)py; + (void)pz; + (void)pt; (void)qxy2d; (void)qyplusx; (void)qyminusx; @@ -4524,8 +4538,8 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "str %[rz], [sp, #8]\n\t" "str %[rt], [sp, #12]\n\t" "ldr r0, [sp]\n\t" - "ldr r1, [sp, #108]\n\t" - "ldr r2, [sp, #104]\n\t" + "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" /* Add */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4566,8 +4580,8 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #108]\n\t" - "ldr r2, [sp, #104]\n\t" 
+ "ldr r1, [sp, #72]\n\t" + "ldr r2, [sp, #68]\n\t" /* Sub */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4607,16 +4621,16 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "strd r5, r6, [r0, #8]\n\t" "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" - "ldr r2, [sp, #128]\n\t" + "ldr r2, [sp, #92]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #124]\n\t" + "ldr r2, [sp, #88]\n\t" "ldr r1, [sp, #4]\n\t" "ldr r0, [sp, #4]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #116]\n\t" - "ldr r1, [sp, #120]\n\t" + "ldr r2, [sp, #80]\n\t" + "ldr r1, [sp, #84]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_mul\n\t" "ldr r0, [sp, #4]\n\t" @@ -4722,7 +4736,7 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adc r10, r10, lr\n\t" "strd r9, r10, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #112]\n\t" + "ldr r1, [sp, #76]\n\t" /* Double */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4856,10 +4870,14 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "adc r10, r10, lr\n\t" "strd r9, r10, [r1, #24]\n\t" "add sp, sp, #32\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); + (void)px; + (void)py; + (void)pz; + (void)pt; (void)qxy2d; (void)qyplusx; (void)qyminusx; @@ -4874,8 +4892,8 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "str %[rz], [sp, #8]\n\t" "str %[rt], [sp, #12]\n\t" "ldr r0, [sp]\n\t" - "ldr r1, [sp, #172]\n\t" - "ldr r2, [sp, #168]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" /* Add */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4916,8 +4934,8 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #172]\n\t" - "ldr r2, [sp, #168]\n\t" + "ldr r1, [sp, #136]\n\t" + "ldr r2, [sp, #132]\n\t" /* Sub */ "ldrd %[rt], r4, [r1]\n\t" "ldrd r5, r6, [r1, #8]\n\t" @@ -4957,20 +4975,20 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "strd r5, r6, [r0, #8]\n\t" "strd r7, r8, [r0, #16]\n\t" "strd r9, r10, [r0, #24]\n\t" - "ldr r2, [sp, #192]\n\t" + "ldr r2, [sp, #156]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #196]\n\t" + "ldr r2, [sp, #160]\n\t" "ldr r1, [sp, #4]\n\t" "ldr r0, [sp, #4]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #180]\n\t" - "ldr r1, [sp, #188]\n\t" + "ldr r2, [sp, #144]\n\t" + "ldr r1, [sp, #152]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_mul\n\t" - "ldr r2, [sp, #184]\n\t" - "ldr r1, [sp, #176]\n\t" + "ldr r2, [sp, #148]\n\t" + "ldr r1, [sp, #140]\n\t" "ldr r0, [sp]\n\t" "bl fe_mul\n\t" "add r0, sp, #16\n\t" @@ -5211,10 +5229,14 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "adc r10, r10, lr\n\t" "strd r9, r10, [r1, #24]\n\t" "add sp, sp, #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); + (void)px; + (void)py; + (void)pz; + (void)pt; (void)qz; (void)qt2d; (void)qyplusx; @@ -5230,8 +5252,8 
@@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
         "str %[rz], [sp, #8]\n\t"
         "str %[rt], [sp, #12]\n\t"
         "ldr r0, [sp]\n\t"
-        "ldr r1, [sp, #172]\n\t"
-        "ldr r2, [sp, #168]\n\t"
+        "ldr r1, [sp, #136]\n\t"
+        "ldr r2, [sp, #132]\n\t"
         /* Add */
         "ldrd %[rt], r4, [r1]\n\t"
         "ldrd r5, r6, [r1, #8]\n\t"
@@ -5272,8 +5294,8 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
         "strd r7, r8, [r0, #16]\n\t"
         "strd r9, r10, [r0, #24]\n\t"
         "ldr r0, [sp, #4]\n\t"
-        "ldr r1, [sp, #172]\n\t"
-        "ldr r2, [sp, #168]\n\t"
+        "ldr r1, [sp, #136]\n\t"
+        "ldr r2, [sp, #132]\n\t"
         /* Sub */
         "ldrd %[rt], r4, [r1]\n\t"
         "ldrd r5, r6, [r1, #8]\n\t"
@@ -5313,20 +5335,20 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
         "strd r5, r6, [r0, #8]\n\t"
         "strd r7, r8, [r0, #16]\n\t"
         "strd r9, r10, [r0, #24]\n\t"
-        "ldr r2, [sp, #196]\n\t"
+        "ldr r2, [sp, #160]\n\t"
         "ldr r1, [sp]\n\t"
         "ldr r0, [sp, #8]\n\t"
         "bl fe_mul\n\t"
-        "ldr r2, [sp, #192]\n\t"
+        "ldr r2, [sp, #156]\n\t"
         "ldr r1, [sp, #4]\n\t"
         "ldr r0, [sp, #4]\n\t"
         "bl fe_mul\n\t"
-        "ldr r2, [sp, #180]\n\t"
-        "ldr r1, [sp, #188]\n\t"
+        "ldr r2, [sp, #144]\n\t"
+        "ldr r1, [sp, #152]\n\t"
         "ldr r0, [sp, #12]\n\t"
         "bl fe_mul\n\t"
-        "ldr r2, [sp, #184]\n\t"
-        "ldr r1, [sp, #176]\n\t"
+        "ldr r2, [sp, #148]\n\t"
+        "ldr r1, [sp, #140]\n\t"
         "ldr r0, [sp]\n\t"
         "bl fe_mul\n\t"
         "add r0, sp, #16\n\t"
@@ -5567,10 +5589,14 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz
         "adc r10, r10, lr\n\t"
         "strd r9, r10, [r1, #24]\n\t"
         "add sp, sp, #0x60\n\t"
-        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt)
         :
         : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
     );
+    (void)px;
+    (void)py;
+    (void)pz;
+    (void)pt;
     (void)qz;
     (void)qt2d;
     (void)qyplusx;
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
similarity index 99%
rename from wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
rename to wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
index a46046d7c..1511233d8 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
@@ -120,6 +120,7 @@ static const uint64_t L_SHA512_transform_len_k[] = {
     0x6c44198c4a475817UL,
 };
 
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
 void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
 {
     __asm__ __volatile__ (
@@ -3650,6 +3651,7 @@ static const uint64_t L_SHA512_transform_neon_len_k[] = {
     0x6c44198c4a475817UL,
 };
 
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
 void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
 {
     __asm__ __volatile__ (
@@ -4773,7 +4775,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
         "subs %[len], %[len], #0x80\n\t"
         "bne L_sha512_len_neon_begin_%=\n\t"
         : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
-        : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k), [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
+        : [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
         : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
     );
 }
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c
b/wolfcrypt/src/port/arm/armv8-curve25519_c.c
similarity index 100%
rename from wolfcrypt/src/port/arm/armv8-curve25519.c
rename to wolfcrypt/src/port/arm/armv8-curve25519_c.c
diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-sha3-asm.S
new file mode 100644
index 000000000..ec9c1cc4a
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha3-asm.S
@@ -0,0 +1,215 @@
+/* armv8-sha3-asm
+ *
+ * Copyright (C) 2006-2021 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <wolfssl/wolfcrypt/settings.h>
+
+/* Generated using (from wolfssl):
+ *   cd ../scripts
+ *   ruby ./sha3/sha3.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha3-asm.S
+ */
+#ifdef WOLFSSL_ARMASM
+#ifdef __aarch64__
+#ifdef WOLFSSL_SHA3
+#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
+#ifndef __APPLE__
+    .text
+    .type L_SHA3_transform_crypto_r, %object
+    .section .rodata
+    .size L_SHA3_transform_crypto_r, 192
+#else
+    .section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+    .align 3
+#else
+    .p2align 3
+#endif /* __APPLE__ */
+L_SHA3_transform_crypto_r:
+    .xword 0x1
+    .xword 0x8082
+    .xword 0x800000000000808a
+    .xword 0x8000000080008000
+    .xword 0x808b
+    .xword 0x80000001
+    .xword 0x8000000080008081
+    .xword 0x8000000000008009
+    .xword 0x8a
+    .xword 0x88
+    .xword 0x80008009
+    .xword 0x8000000a
+    .xword 0x8000808b
+    .xword 0x800000000000008b
+    .xword 0x8000000000008089
+    .xword 0x8000000000008003
+    .xword 0x8000000000008002
+    .xword 0x8000000000000080
+    .xword 0x800a
+    .xword 0x800000008000000a
+    .xword 0x8000000080008081
+    .xword 0x8000000000008080
+    .xword 0x80000001
+    .xword 0x8000000080008008
+#ifndef __APPLE__
+.text
+.globl BlockSha3
+.type BlockSha3,@function
+.align 2
+BlockSha3:
+#else
+.section __TEXT,__text
+.globl _BlockSha3
+.p2align 2
+_BlockSha3:
+#endif /* __APPLE__ */
+    stp x29, x30, [sp, #-80]!
+    add x29, sp, #0
+    stp d8, d9, [x29, #16]
+    stp d10, d11, [x29, #32]
+    stp d12, d13, [x29, #48]
+    stp d14, d15, [x29, #64]
+#ifdef __APPLE__
+.arch_extension sha3
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+    adrp x1, L_SHA3_transform_crypto_r
+    add x1, x1, :lo12:L_SHA3_transform_crypto_r
+#else
+    adrp x1, L_SHA3_transform_crypto_r@PAGE
+    add x1, x1, L_SHA3_transform_crypto_r@PAGEOFF
+#endif /* __APPLE__ */
+    ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
+    ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
+    ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
+    ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
+    ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
+    ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
+    ld1 {v24.1d}, [x0]
+    sub x0, x0, #0xc0
+    mov x2, #24
+    # Start of 24 rounds
+L_sha3_crypto_begin:
+    # Col Mix
+    eor3 v31.16b, v0.16b, v5.16b, v10.16b
+    eor3 v27.16b, v1.16b, v6.16b, v11.16b
+    eor3 v28.16b, v2.16b, v7.16b, v12.16b
+    eor3 v29.16b, v3.16b, v8.16b, v13.16b
+    eor3 v30.16b, v4.16b, v9.16b, v14.16b
+    eor3 v31.16b, v31.16b, v15.16b, v20.16b
+    eor3 v27.16b, v27.16b, v16.16b, v21.16b
+    eor3 v28.16b, v28.16b, v17.16b, v22.16b
+    eor3 v29.16b, v29.16b, v18.16b, v23.16b
+    eor3 v30.16b, v30.16b, v19.16b, v24.16b
+    rax1 v25.2d, v30.2d, v27.2d
+    rax1 v26.2d, v31.2d, v28.2d
+    rax1 v27.2d, v27.2d, v29.2d
+    rax1 v28.2d, v28.2d, v30.2d
+    rax1 v29.2d, v29.2d, v31.2d
+    eor v0.16b, v0.16b, v25.16b
+    xar v30.2d, v1.2d, v26.2d, #63
+    xar v1.2d, v6.2d, v26.2d, #20
+    xar v6.2d, v9.2d, v29.2d, #44
+    xar v9.2d, v22.2d, v27.2d, #3
+    xar v22.2d, v14.2d, v29.2d, #25
+    xar v14.2d, v20.2d, v25.2d, #46
+    xar v20.2d, v2.2d, v27.2d, #2
+    xar v2.2d, v12.2d, v27.2d, #21
+    xar v12.2d, v13.2d, v28.2d, #39
+    xar v13.2d, v19.2d, v29.2d, #56
+    xar v19.2d, v23.2d, v28.2d, #8
+    xar v23.2d, v15.2d, v25.2d, #23
+    xar v15.2d, v4.2d, v29.2d, #37
+    xar v4.2d, v24.2d, v29.2d, #50
+    xar v24.2d, v21.2d, v26.2d, #62
+    xar v21.2d, v8.2d, v28.2d, #9
+    xar v8.2d, v16.2d, v26.2d, #19
+    xar v16.2d, v5.2d, v25.2d, #28
+    xar v5.2d, v3.2d, v28.2d, #36
+    xar v3.2d, v18.2d, v28.2d, #43
+    xar v18.2d, v17.2d, v27.2d, #49
+    xar v17.2d, v11.2d, v26.2d, #54
+    xar v11.2d, v7.2d, v27.2d, #58
+    xar v7.2d, v10.2d, v25.2d, #61
+    # Row Mix
+    mov v25.16b, v0.16b
+    mov v26.16b, v1.16b
+    bcax v0.16b, v25.16b, v2.16b, v26.16b
+    bcax v1.16b, v26.16b, v3.16b, v2.16b
+    bcax v2.16b, v2.16b, v4.16b, v3.16b
+    bcax v3.16b, v3.16b, v25.16b, v4.16b
+    bcax v4.16b, v4.16b, v26.16b, v25.16b
+    mov v25.16b, v5.16b
+    mov v26.16b, v6.16b
+    bcax v5.16b, v25.16b, v7.16b, v26.16b
+    bcax v6.16b, v26.16b, v8.16b, v7.16b
+    bcax v7.16b, v7.16b, v9.16b, v8.16b
+    bcax v8.16b, v8.16b, v25.16b, v9.16b
+    bcax v9.16b, v9.16b, v26.16b, v25.16b
+    mov v26.16b, v11.16b
+    bcax v10.16b, v30.16b, v12.16b, v26.16b
+    bcax v11.16b, v26.16b, v13.16b, v12.16b
+    bcax v12.16b, v12.16b, v14.16b, v13.16b
+    bcax v13.16b, v13.16b, v30.16b, v14.16b
+    bcax v14.16b, v14.16b, v26.16b, v30.16b
+    mov v25.16b, v15.16b
+    mov v26.16b, v16.16b
+    bcax v15.16b, v25.16b, v17.16b, v26.16b
+    bcax v16.16b, v26.16b, v18.16b, v17.16b
+    bcax v17.16b, v17.16b, v19.16b, v18.16b
+    bcax v18.16b, v18.16b, v25.16b, v19.16b
+    bcax v19.16b, v19.16b, v26.16b, v25.16b
+    mov v25.16b, v20.16b
+    mov v26.16b, v21.16b
+    bcax v20.16b, v25.16b, v22.16b, v26.16b
+    bcax v21.16b, v26.16b, v23.16b, v22.16b
+    bcax v22.16b, v22.16b, v24.16b, v23.16b
+    bcax v23.16b, v23.16b, v25.16b, v24.16b
+    bcax v24.16b, v24.16b, v26.16b, v25.16b
+    ld1r {v30.2d}, [x1], #8
+    subs x2, x2, #1
+    eor v0.16b, v0.16b, v30.16b
+    bne L_sha3_crypto_begin
+    st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
+    st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
+    st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
+    st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
+    st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
+    st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
+    st1 {v24.1d}, [x0]
+    ldp d8, d9, [x29, #16]
+    ldp d10, d11, [x29, #32]
+    ldp d12, d13, [x29, #48]
+    ldp d14, d15, [x29, #64]
+    ldp x29, x30, [sp], #0x50
+    ret
+#ifndef __APPLE__
+    .size BlockSha3,.-BlockSha3
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */
+#endif /* WOLFSSL_SHA3 */
+#endif /* __aarch64__ */
+#endif /* WOLFSSL_ARMASM */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
new file mode 100644
index 000000000..54c04451d
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c
@@ -0,0 +1,185 @@
+/* armv8-sha3-asm
+ *
+ * Copyright (C) 2006-2021 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <wolfssl/wolfcrypt/settings.h>
+
+/* Generated using (from wolfssl):
+ *   cd ../scripts
+ *   ruby ./sha3/sha3.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha3-asm.c
+ */
+#ifdef WOLFSSL_ARMASM
+#ifdef __aarch64__
+#include <stdint.h>
+
+#ifdef WOLFSSL_SHA3
+#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
+static const uint64_t L_SHA3_transform_crypto_r[] = {
+    0x1UL,
+    0x8082UL,
+    0x800000000000808aUL,
+    0x8000000080008000UL,
+    0x808bUL,
+    0x80000001UL,
+    0x8000000080008081UL,
+    0x8000000000008009UL,
+    0x8aUL,
+    0x88UL,
+    0x80008009UL,
+    0x8000000aUL,
+    0x8000808bUL,
+    0x800000000000008bUL,
+    0x8000000000008089UL,
+    0x8000000000008003UL,
+    0x8000000000008002UL,
+    0x8000000000000080UL,
+    0x800aUL,
+    0x800000008000000aUL,
+    0x8000000080008081UL,
+    0x8000000000008080UL,
+    0x80000001UL,
+    0x8000000080008008UL,
+};
+
+void BlockSha3(unsigned long* state)
+{
+    __asm__ __volatile__ (
+#ifdef __APPLE__
+        ".arch_extension sha3\n\t"
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+        "adrp x1, %[L_SHA3_transform_crypto_r]\n\t"
+        "add x1, x1, :lo12:%[L_SHA3_transform_crypto_r]\n\t"
+#else
+        "adrp x1, %[L_SHA3_transform_crypto_r]@PAGE\n\t"
+        "add x1, x1, %[L_SHA3_transform_crypto_r]@PAGEOFF\n\t"
+#endif /* __APPLE__ */
+        "ld4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t"
+        "ld4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t"
+        "ld4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t"
+        "ld4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t"
+        "ld4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t"
+        "ld4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], #32\n\t"
+        "ld1 {v24.1d}, [%x[state]]\n\t"
+        "sub %x[state], %x[state], #0xc0\n\t"
+        "mov x2, #24\n\t"
+        /* Start of 24 rounds */
+        "\n"
+    "L_sha3_crypto_begin_%=: \n\t"
+        /* Col Mix */
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" + "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" + "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" + "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" + "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" + "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" + "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" + "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" + "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" + "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" + "rax1 v25.2d, v30.2d, v27.2d\n\t" + "rax1 v26.2d, v31.2d, v28.2d\n\t" + "rax1 v27.2d, v27.2d, v29.2d\n\t" + "rax1 v28.2d, v28.2d, v30.2d\n\t" + "rax1 v29.2d, v29.2d, v31.2d\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "xar v30.2d, v1.2d, v26.2d, #63\n\t" + "xar v1.2d, v6.2d, v26.2d, #20\n\t" + "xar v6.2d, v9.2d, v29.2d, #44\n\t" + "xar v9.2d, v22.2d, v27.2d, #3\n\t" + "xar v22.2d, v14.2d, v29.2d, #25\n\t" + "xar v14.2d, v20.2d, v25.2d, #46\n\t" + "xar v20.2d, v2.2d, v27.2d, #2\n\t" + "xar v2.2d, v12.2d, v27.2d, #21\n\t" + "xar v12.2d, v13.2d, v28.2d, #39\n\t" + "xar v13.2d, v19.2d, v29.2d, #56\n\t" + "xar v19.2d, v23.2d, v28.2d, #8\n\t" + "xar v23.2d, v15.2d, v25.2d, #23\n\t" + "xar v15.2d, v4.2d, v29.2d, #37\n\t" + "xar v4.2d, v24.2d, v29.2d, #50\n\t" + "xar v24.2d, v21.2d, v26.2d, #62\n\t" + "xar v21.2d, v8.2d, v28.2d, #9\n\t" + "xar v8.2d, v16.2d, v26.2d, #19\n\t" + "xar v16.2d, v5.2d, v25.2d, #28\n\t" + "xar v5.2d, v3.2d, v28.2d, #36\n\t" + "xar v3.2d, v18.2d, v28.2d, #43\n\t" + "xar v18.2d, v17.2d, v27.2d, #49\n\t" + "xar v17.2d, v11.2d, v26.2d, #54\n\t" + "xar v11.2d, v7.2d, v27.2d, #58\n\t" + "xar v7.2d, v10.2d, v25.2d, #61\n\t" + /* Row Mix */ + "mov v25.16b, v0.16b\n\t" + "mov v26.16b, v1.16b\n\t" + "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" + "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" + "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" + "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" + "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" + "mov v25.16b, v5.16b\n\t" + "mov v26.16b, v6.16b\n\t" + "bcax v5.16b, v25.16b, v7.16b, v26.16b\n\t" + "bcax v6.16b, v26.16b, v8.16b, v7.16b\n\t" + "bcax v7.16b, v7.16b, v9.16b, v8.16b\n\t" + "bcax v8.16b, v8.16b, v25.16b, v9.16b\n\t" + "bcax v9.16b, v9.16b, v26.16b, v25.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "bcax v10.16b, v30.16b, v12.16b, v26.16b\n\t" + "bcax v11.16b, v26.16b, v13.16b, v12.16b\n\t" + "bcax v12.16b, v12.16b, v14.16b, v13.16b\n\t" + "bcax v13.16b, v13.16b, v30.16b, v14.16b\n\t" + "bcax v14.16b, v14.16b, v26.16b, v30.16b\n\t" + "mov v25.16b, v15.16b\n\t" + "mov v26.16b, v16.16b\n\t" + "bcax v15.16b, v25.16b, v17.16b, v26.16b\n\t" + "bcax v16.16b, v26.16b, v18.16b, v17.16b\n\t" + "bcax v17.16b, v17.16b, v19.16b, v18.16b\n\t" + "bcax v18.16b, v18.16b, v25.16b, v19.16b\n\t" + "bcax v19.16b, v19.16b, v26.16b, v25.16b\n\t" + "mov v25.16b, v20.16b\n\t" + "mov v26.16b, v21.16b\n\t" + "bcax v20.16b, v25.16b, v22.16b, v26.16b\n\t" + "bcax v21.16b, v26.16b, v23.16b, v22.16b\n\t" + "bcax v22.16b, v22.16b, v24.16b, v23.16b\n\t" + "bcax v23.16b, v23.16b, v25.16b, v24.16b\n\t" + "bcax v24.16b, v24.16b, v26.16b, v25.16b\n\t" + "ld1r {v30.2d}, [x1], #8\n\t" + "subs x2, x2, #1\n\t" + "eor v0.16b, v0.16b, v30.16b\n\t" + "bne L_sha3_crypto_begin_%=\n\t" + "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" + "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" + "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" + "st4 {v12.d, v13.d, v14.d, v15.d}[0], [%x[state]], #32\n\t" + "st4 {v16.d, v17.d, v18.d, v19.d}[0], [%x[state]], #32\n\t" + "st4 {v20.d, v21.d, v22.d, v23.d}[0], [%x[state]], 
#32\n\t" + "st1 {v24.1d}, [%x[state]]\n\t" + : [state] "+r" (state) + : [L_SHA3_transform_crypto_r] "S" (L_SHA3_transform_crypto_r) + : "memory", "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#endif /* WOLFSSL_SHA3 */ +#endif /* __aarch64__ */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 9c890eed2..452998b0f 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -1201,20 +1201,12 @@ Transform_Sha512_Len_crypto: .p2align 2 _Transform_Sha512_Len_crypto: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-208]! + stp x29, x30, [sp, #-80]! add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] - stp d16, d17, [x29, #80] - stp d18, d19, [x29, #96] - stp d20, d21, [x29, #112] - stp d22, d23, [x29, #128] - stp d24, d25, [x29, #144] - stp d26, d27, [x29, #160] - stp d28, d29, [x29, #176] - stp d30, d31, [x29, #192] #ifdef __APPLE__ .arch_extension sha3 #endif /* __APPLE__ */ @@ -1734,15 +1726,7 @@ L_sha512_len_crypto_begin: ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] - ldp d16, d17, [x29, #80] - ldp d18, d19, [x29, #96] - ldp d20, d21, [x29, #112] - ldp d22, d23, [x29, #128] - ldp d24, d25, [x29, #144] - ldp d26, d27, [x29, #160] - ldp d28, d29, [x29, #176] - ldp d30, d31, [x29, #192] - ldp x29, x30, [sp], #0xd0 + ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size Transform_Sha512_Len_crypto,.-Transform_Sha512_Len_crypto diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c similarity index 100% rename from wolfcrypt/src/port/arm/armv8-sha512-asm.c rename to wolfcrypt/src/port/arm/armv8-sha512-asm_c.c diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index f125c8446..b2e76e8ef 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -51,6 +51,7 @@ #endif +#ifndef WOLFSSL_ARMASM #ifdef WOLFSSL_SHA3_SMALL /* Rotate a 64-bit value left. * @@ -137,8 +138,7 @@ static const word64 hash_keccak_r[24] = * i The index of the loop. */ #define SWAP_ROTL(s, t1, t2, i) \ -do \ -{ \ +do { \ t2 = s[K_I_##i]; s[K_I_##i] = ROTL64(t1, K_R_##i); \ } \ while (0) @@ -151,12 +151,10 @@ while (0) * t Temporary variable. */ #define COL_MIX(s, b, x, t) \ -do \ -{ \ +do { \ for (x = 0; x < 5; x++) \ b[x] = s[x + 0] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; \ - for (x = 0; x < 5; x++) \ - { \ + for (x = 0; x < 5; x++) { \ t = b[(x + 4) % 5] ^ ROTL64(b[(x + 1) % 5], 1); \ s[x + 0] ^= t; \ s[x + 5] ^= t; \ @@ -179,14 +177,12 @@ while (0) * t1 Temporary variable. */ #define ROW_MIX(s, b, y, x, t0, t1) \ -do \ -{ \ - for (y = 0; y < 5; y++) \ - { \ +do { \ + for (y = 0; y < 5; y++) { \ for (x = 0; x < 5; x++) \ b[x] = s[y * 5 + x]; \ for (x = 0; x < 5; x++) \ - s[y * 5 + x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]); \ + s[y * 5 + x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]); \ } \ } \ while (0) @@ -202,10 +198,8 @@ while (0) * t1 Temporary variable. 
*/ #define ROW_MIX(s, b, y, x, t12, t34) \ -do \ -{ \ - for (y = 0; y < 5; y++) \ - { \ +do { \ + for (y = 0; y < 5; y++) { \ for (x = 0; x < 5; x++) \ b[x] = s[y * 5 + x]; \ t12 = (b[1] ^ b[2]); t34 = (b[3] ^ b[4]); \ @@ -351,8 +345,7 @@ static const word64 hash_keccak_r[24] = * t Temporary variable. */ #define COL_MIX(s, b, x, t) \ -do \ -{ \ +do { \ (b)[0] = (s)[0] ^ (s)[5] ^ (s)[10] ^ (s)[15] ^ (s)[20]; \ (b)[1] = (s)[1] ^ (s)[6] ^ (s)[11] ^ (s)[16] ^ (s)[21]; \ (b)[2] = (s)[2] ^ (s)[7] ^ (s)[12] ^ (s)[17] ^ (s)[22]; \ @@ -384,8 +377,7 @@ while (0) * t1 Temporary variable. (Unused) */ #define ROW_MIX(s2, s1, b, t0, t1) \ -do \ -{ \ +do { \ (b)[0] = (s1)[0]; \ (b)[1] = S((s1), 0); \ (b)[2] = S((s1), 1); \ @@ -449,8 +441,7 @@ while (0) * t34 Temporary variable. */ #define ROW_MIX(s2, s1, b, t12, t34) \ -do \ -{ \ +do { \ (b)[0] = (s1)[0]; \ (b)[1] = S((s1), 0); \ (b)[2] = S((s1), 1); \ @@ -536,6 +527,7 @@ static void BlockSha3(word64 *s) } } #endif /* WOLFSSL_SHA3_SMALL */ +#endif /* !WOLFSSL_ARMASM */ static WC_INLINE word64 Load64Unaligned(const unsigned char *a) { @@ -618,8 +610,7 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p) byte l; byte *t; - if (sha3->i > 0) - { + if (sha3->i > 0) { l = p * 8 - sha3->i; if (l > len) { l = (byte)len; @@ -632,16 +623,14 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p) len -= i; sha3->i += i; - if (sha3->i == p * 8) - { + if (sha3->i == p * 8) { for (i = 0; i < p; i++) sha3->s[i] ^= Load64BitBigEndian(sha3->t + 8 * i); BlockSha3(sha3->s); sha3->i = 0; } } - while (len >= ((word32)(p * 8))) - { + while (len >= ((word32)(p * 8))) { for (i = 0; i < p; i++) sha3->s[i] ^= Load64Unaligned(data + 8 * i); BlockSha3(sha3->s); diff --git a/wolfssl/wolfcrypt/sha3.h b/wolfssl/wolfcrypt/sha3.h index e06dc1592..b41524e9e 100644 --- a/wolfssl/wolfcrypt/sha3.h +++ b/wolfssl/wolfcrypt/sha3.h @@ -128,6 +128,9 @@ struct wc_Sha3 { typedef wc_Sha3 wc_Shake; #endif +#ifdef WOLFSSL_ARMASM +WOLFSSL_LOCAL void BlockSha3(word64 *s); +#endif WOLFSSL_API int wc_InitSha3_224(wc_Sha3* sha3, void* heap, int devId); WOLFSSL_API int wc_Sha3_224_Update(wc_Sha3* sha3, const byte* data, word32 len);
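For context on what the new assembly plugs into: the Sha3Update() code shown above absorbs input by XORing each full rate-sized block into the 25-word Keccak state and then running BlockSha3() once per block; with --enable-armasm=sha3-crypto that permutation call resolves to the EOR3/RAX1/XAR/BCAX routine added in this patch. Below is a minimal sketch of that absorb step, for illustration only: sha3_absorb_block is a hypothetical standalone helper, not a wolfSSL API, and a SHA3-256 rate of 136 bytes plus a little-endian host are assumed (matching Load64Unaligned() above).

#include <stdint.h>
#include <string.h>

/* Keccak-f[1600] permutation; supplied by armv8-sha3-asm on Aarch64. */
extern void BlockSha3(uint64_t* s);

#define SHA3_256_RATE_BYTES 136

/* Hypothetical absorb step in the style of Sha3Update(). */
static void sha3_absorb_block(uint64_t s[25], const unsigned char* data)
{
    int i;
    for (i = 0; i < SHA3_256_RATE_BYTES / 8; i++) {
        uint64_t w;
        memcpy(&w, data + 8 * i, 8); /* unaligned little-endian load */
        s[i] ^= w;                   /* XOR message block into state */
    }
    BlockSha3(s);                    /* permute after each full block */
}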