Merge pull request #5650 from SparkiDev/aes_x86_asm

pull/5664/head
Hayden Roche 2022-10-04 16:40:02 -07:00 committed by GitHub
commit 6f4af1581b
13 changed files with 13959 additions and 44 deletions


@@ -576,6 +576,15 @@ then
ENABLED_FASTMATH="yes"
fi
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
fi
if test "$host_cpu" = "x86"
then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_BUILD"
fi
# if sp-math-all is not set, then enable fast math
if test "x$ENABLED_FASTMATH" = "xyes" && test "$enable_sp_math_all" = "" && test "$enable_sp_math" = ""
then
@@ -594,11 +603,6 @@ then
ENABLED_HEAPMATH="no"
ENABLED_SP_MATH_ALL="no"
fi
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
then
# Have settings.h set FP_MAX_BITS higher if user didn't set directly
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
fi
AS_IF([test "x$host_cpu" = "xaarch64"],[AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AARCH64_BUILD"])
if test "$ENABLED_SAKKE" = "yes" && test "$ENABLED_SAKKE_SMALL" != "yes"
@@ -2313,6 +2317,16 @@ then
AM_CFLAGS="$AM_CFLAGS -DUSE_INTEL_SPEEDUP"
ENABLED_AESNI=yes
fi
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
then
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_64_BUILD"
fi
if test "$host_cpu" = "x86"
then
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_BUILD"
ENABLED_X86_ASM=yes
fi
fi
AC_ARG_ENABLE([aligndata],
@@ -6732,7 +6746,7 @@ if test "$ENABLED_SP_MATH_ALL" = "yes" && test "$ENABLED_ASM" != "no"; then
case $host_cpu in
*x86_64* | *amd64*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64 -DWOLFSSL_X86_64_BUILD"
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64"
;;
*x86*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86"
@@ -6817,10 +6831,6 @@ if test "$ENABLED_SP_ASM" = "yes" && test "$ENABLED_SP" = "yes"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_X86_64_ASM"
ENABLED_SP_X86_64_ASM=yes
if test "x$ENABLED_FASTMATH" = "xno"
then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
fi
;;
*)
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
@@ -8097,6 +8107,7 @@ AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"])
AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])
AM_CONDITIONAL([BUILD_X86_ASM],[test "x$ENABLED_X86_ASM" = "xyes"])
AM_CONDITIONAL([BUILD_AFALG],[test "x$ENABLED_AFALG" = "xyes"])
AM_CONDITIONAL([BUILD_KCAPI],[test "x$ENABLED_KCAPI" = "xyes"])
AM_CONDITIONAL([BUILD_DEVCRYPTO],[test "x$ENABLED_DEVCRYPTO" = "xyes"])
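
Note: WOLFSSL_X86_64_BUILD and WOLFSSL_X86_BUILD are plain -D defines added to AM_CFLAGS/AM_CCASFLAGS above, so both C and assembly sources can branch on them directly. A minimal, illustrative sketch (not part of the patch; the messages are purely descriptive):

/* sketch.c - shows how the new build defines can be tested from C.
 * Macro names come from the configure.ac hunks above. */
#include <stdio.h>

int main(void)
{
#if defined(WOLFSSL_X86_64_BUILD)
    puts("x86_64 build: 64-bit AES/ChaCha/Poly1305/SHA-256 assembly paths");
#elif defined(WOLFSSL_X86_BUILD)
    puts("32-bit x86 build: new aes_asm.S / aes_gcm_x86_asm.S paths");
#else
    puts("neither define set: portable C implementations");
#endif
    return 0;
}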


@@ -118,8 +118,12 @@ endif
if BUILD_AESNI
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
if BUILD_X86_ASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
endif
endif
if BUILD_DES3
src_libwolfssl_la_SOURCES += wolfcrypt/src/des3.c
@@ -210,8 +214,12 @@ endif
if BUILD_AESNI
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
if BUILD_X86_ASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
endif
endif
if BUILD_SHA
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha.c
@@ -552,9 +560,13 @@ endif
if !BUILD_FIPS_CURRENT
if BUILD_AESNI
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
if BUILD_X86_ASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
endif
endif
endif
if BUILD_CAMELLIA
src_libwolfssl_la_SOURCES += wolfcrypt/src/camellia.c


@@ -738,7 +738,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
XASM_LINK("AES_CBC_encrypt");
#ifdef HAVE_AES_DECRYPT
#if defined(WOLFSSL_AESNI_BY4)
#if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
unsigned char* ivec, unsigned long length,
const unsigned char* KS, int nr)
@@ -4191,7 +4191,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv)
/* if input and output same will overwrite input iv */
XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#if defined(WOLFSSL_AESNI_BY4)
#if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
AES_CBC_decrypt_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
aes->rounds);
#elif defined(WOLFSSL_AESNI_BY6)
@@ -7867,7 +7867,7 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
}
else
#endif
#ifdef HAVE_INTEL_AVX1
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -8414,7 +8414,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
}
else
#endif
#ifdef HAVE_INTEL_AVX1
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -9035,7 +9035,7 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
extern "C" {
#endif
/* Assembly code implementations in: aes_gcm_asm.S */
/* Assembly code implementations in: aes_gcm_asm.S and aes_gcm_x86_asm.S */
#ifdef HAVE_INTEL_AVX2
extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr,
unsigned char* out, const unsigned char* in, unsigned int nbytes,


@@ -27,6 +27,7 @@
* by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
*/
#ifdef WOLFSSL_X86_64_BUILD
/*
AES_CBC_encrypt (const unsigned char *in,
@@ -1333,6 +1334,893 @@ pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret
#elif defined WOLFSSL_X86_BUILD
/*
AES_CBC_encrypt (const unsigned char *in,
unsigned char *out,
unsigned char ivec[16],
unsigned long length,
const unsigned char *KS,
int nr)
*/
#ifndef __APPLE__
.globl AES_CBC_encrypt
AES_CBC_encrypt:
#else
.globl _AES_CBC_encrypt
_AES_CBC_encrypt:
#endif
# parameter 1: stack[4] => %edi
# parameter 2: stack[8] => %esi
# parameter 3: stack[12] => %edx
# parameter 4: stack[16] => %ecx
# parameter 5: stack[20] => %eax
# parameter 6: stack[24] => %ebx
push %edi
push %esi
push %ebx
push %ebp
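# With the return address plus the four saved registers on the stack
# (5 * 4 = 20 bytes), the cdecl arguments now start at 20(%esp); the
# six loads below pick them up in parameter order.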
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %edx
movl 32(%esp), %ecx
movl 36(%esp), %eax
movl 40(%esp), %ebx
movl %ecx, %ebp
shrl $4, %ecx
shll $60, %ebp
je NO_PARTS
addl $1, %ecx
NO_PARTS:
subl $16, %esi
movdqa (%edx), %xmm1
LOOP:
pxor (%edi), %xmm1
pxor (%eax), %xmm1
addl $16,%esi
addl $16,%edi
cmpl $12, %ebx
aesenc 16(%eax),%xmm1
aesenc 32(%eax),%xmm1
aesenc 48(%eax),%xmm1
aesenc 64(%eax),%xmm1
aesenc 80(%eax),%xmm1
aesenc 96(%eax),%xmm1
aesenc 112(%eax),%xmm1
aesenc 128(%eax),%xmm1
aesenc 144(%eax),%xmm1
movdqa 160(%eax),%xmm2
jb LAST
cmpl $14, %ebx
aesenc 160(%eax),%xmm1
aesenc 176(%eax),%xmm1
movdqa 192(%eax),%xmm2
jb LAST
aesenc 192(%eax),%xmm1
aesenc 208(%eax),%xmm1
movdqa 224(%eax),%xmm2
LAST:
decl %ecx
aesenclast %xmm2,%xmm1
movdqu %xmm1,(%esi)
jne LOOP
pop %ebp
pop %ebx
pop %esi
pop %edi
ret
/*
AES_CBC_decrypt_by4 (const unsigned char *in,
unsigned char *out,
unsigned char ivec[16],
unsigned long length,
const unsigned char *KS,
int nr)
*/
#ifndef __APPLE__
.globl AES_CBC_decrypt_by4
AES_CBC_decrypt_by4:
#else
.globl _AES_CBC_decrypt_by4
_AES_CBC_decrypt_by4:
#endif
# parameter 1: stack[4] => %edi
# parameter 2: stack[8] => %esi
# parameter 3: stack[12] => %edx
# parameter 4: stack[16] => %ecx
# parameter 5: stack[20] => %eax
# parameter 6: stack[24] => %ebx
push %edi
push %esi
push %ebx
push %ebp
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %edx
movl 32(%esp), %ecx
movl 36(%esp), %eax
movl 40(%esp), %ebx
subl $16, %esp
movdqu (%edx), %xmm0
movl %ecx, %ebp
shrl $4, %ecx
shll $60, %ebp
movdqu %xmm0, (%esp)
je DNO_PARTS_4
addl $1, %ecx
DNO_PARTS_4:
movl %ecx, %ebp
shll $62, %ebp
shrl $62, %ebp
shrl $2, %ecx
je DREMAINDER_4
subl $64, %esi
DLOOP_4:
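# Main loop: decrypt four independent CBC blocks per iteration,
# interleaving aesdec over %xmm1-%xmm4 so the blocks pipeline through
# the AES unit; round keys are staged through %xmm5-%xmm7 and %xmm0.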
movdqu (%edi), %xmm1
movdqu 16(%edi), %xmm2
movdqu 32(%edi), %xmm3
movdqu 48(%edi), %xmm4
movdqa (%eax), %xmm5
movdqa 16(%eax), %xmm6
movdqa 32(%eax), %xmm7
movdqa 48(%eax), %xmm0
pxor %xmm5, %xmm1
pxor %xmm5, %xmm2
pxor %xmm5, %xmm3
pxor %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
aesdec %xmm7, %xmm1
aesdec %xmm7, %xmm2
aesdec %xmm7, %xmm3
aesdec %xmm7, %xmm4
aesdec %xmm0, %xmm1
aesdec %xmm0, %xmm2
aesdec %xmm0, %xmm3
aesdec %xmm0, %xmm4
movdqa 64(%eax), %xmm5
movdqa 80(%eax), %xmm6
movdqa 96(%eax), %xmm7
movdqa 112(%eax), %xmm0
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
aesdec %xmm7, %xmm1
aesdec %xmm7, %xmm2
aesdec %xmm7, %xmm3
aesdec %xmm7, %xmm4
aesdec %xmm0, %xmm1
aesdec %xmm0, %xmm2
aesdec %xmm0, %xmm3
aesdec %xmm0, %xmm4
movdqa 128(%eax), %xmm5
movdqa 144(%eax), %xmm6
movdqa 160(%eax), %xmm7
cmpl $12, %ebx
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
jb DLAST_4
movdqa 160(%eax), %xmm5
movdqa 176(%eax), %xmm6
movdqa 192(%eax), %xmm7
cmpl $14, %ebx
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
jb DLAST_4
movdqa 192(%eax), %xmm5
movdqa 208(%eax), %xmm6
movdqa 224(%eax), %xmm7
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
DLAST_4:
addl $64, %esi
aesdeclast %xmm7, %xmm1
aesdeclast %xmm7, %xmm2
aesdeclast %xmm7, %xmm3
aesdeclast %xmm7, %xmm4
movdqu (%esp), %xmm0
movdqu (%edi), %xmm5
movdqu 16(%edi), %xmm6
movdqu 32(%edi), %xmm7
pxor %xmm0, %xmm1
pxor %xmm5, %xmm2
pxor %xmm6, %xmm3
pxor %xmm7, %xmm4
movdqu 48(%edi), %xmm0
movdqu %xmm1, (%esi)
movdqu %xmm2, 16(%esi)
movdqu %xmm3, 32(%esi)
movdqu %xmm4, 48(%esi)
movdqu %xmm0, (%esp)
addl $64, %edi
decl %ecx
jne DLOOP_4
addl $64, %esi
DREMAINDER_4:
cmpl $0, %ebp
je DEND_4
DLOOP_4_2:
movdqu (%edi), %xmm1
movdqa %xmm1, %xmm5
addl $16, %edi
pxor (%eax), %xmm1
movdqu 160(%eax), %xmm2
cmpl $12, %ebx
aesdec 16(%eax), %xmm1
aesdec 32(%eax), %xmm1
aesdec 48(%eax), %xmm1
aesdec 64(%eax), %xmm1
aesdec 80(%eax), %xmm1
aesdec 96(%eax), %xmm1
aesdec 112(%eax), %xmm1
aesdec 128(%eax), %xmm1
aesdec 144(%eax), %xmm1
jb DLAST_4_2
movdqu 192(%eax), %xmm2
cmpl $14, %ebx
aesdec 160(%eax), %xmm1
aesdec 176(%eax), %xmm1
jb DLAST_4_2
movdqu 224(%eax), %xmm2
aesdec 192(%eax), %xmm1
aesdec 208(%eax), %xmm1
DLAST_4_2:
aesdeclast %xmm2, %xmm1
pxor %xmm0, %xmm1
movdqa %xmm5, %xmm0
movdqu %xmm1, (%esi)
addl $16, %esi
decl %ebp
jne DLOOP_4_2
DEND_4:
addl $16, %esp
pop %ebp
pop %ebx
pop %esi
pop %edi
ret
/*
AES_ECB_encrypt (const unsigned char *in,
unsigned char *out,
unsigned long length,
const unsigned char *KS,
int nr)
*/
#ifndef __APPLE__
.globl AES_ECB_encrypt
AES_ECB_encrypt:
#else
.globl _AES_ECB_encrypt
_AES_ECB_encrypt:
#endif
# parameter 1: stack[4] => %edi
# parameter 2: stack[8] => %esi
# parameter 3: stack[12] => %edx
# parameter 4: stack[16] => %ecx
# parameter 5: stack[20] => %eax
push %edi
push %esi
push %ebx
movl 16(%esp), %edi
movl 20(%esp), %esi
movl 24(%esp), %edx
movl 28(%esp), %ecx
movl 32(%esp), %eax
movl %edx, %ebx
shrl $4, %edx
shll $60, %ebx
je EECB_NO_PARTS_4
addl $1, %edx
EECB_NO_PARTS_4:
movl %edx, %ebx
shll $62, %ebx
shrl $62, %ebx
shrl $2, %edx
je EECB_REMAINDER_4
subl $64, %esi
EECB_LOOP_4:
movdqu (%edi), %xmm1
movdqu 16(%edi), %xmm2
movdqu 32(%edi), %xmm3
movdqu 48(%edi), %xmm4
movdqa (%ecx), %xmm5
movdqa 16(%ecx), %xmm6
movdqa 32(%ecx), %xmm7
movdqa 48(%ecx), %xmm0
pxor %xmm5, %xmm1
pxor %xmm5, %xmm2
pxor %xmm5, %xmm3
pxor %xmm5, %xmm4
aesenc %xmm6, %xmm1
aesenc %xmm6, %xmm2
aesenc %xmm6, %xmm3
aesenc %xmm6, %xmm4
aesenc %xmm7, %xmm1
aesenc %xmm7, %xmm2
aesenc %xmm7, %xmm3
aesenc %xmm7, %xmm4
aesenc %xmm0, %xmm1
aesenc %xmm0, %xmm2
aesenc %xmm0, %xmm3
aesenc %xmm0, %xmm4
movdqa 64(%ecx), %xmm5
movdqa 80(%ecx), %xmm6
movdqa 96(%ecx), %xmm7
movdqa 112(%ecx), %xmm0
aesenc %xmm5, %xmm1
aesenc %xmm5, %xmm2
aesenc %xmm5, %xmm3
aesenc %xmm5, %xmm4
aesenc %xmm6, %xmm1
aesenc %xmm6, %xmm2
aesenc %xmm6, %xmm3
aesenc %xmm6, %xmm4
aesenc %xmm7, %xmm1
aesenc %xmm7, %xmm2
aesenc %xmm7, %xmm3
aesenc %xmm7, %xmm4
aesenc %xmm0, %xmm1
aesenc %xmm0, %xmm2
aesenc %xmm0, %xmm3
aesenc %xmm0, %xmm4
movdqa 128(%ecx), %xmm5
movdqa 144(%ecx), %xmm6
movdqa 160(%ecx), %xmm7
cmpl $12, %eax
aesenc %xmm5, %xmm1
aesenc %xmm5, %xmm2
aesenc %xmm5, %xmm3
aesenc %xmm5, %xmm4
aesenc %xmm6, %xmm1
aesenc %xmm6, %xmm2
aesenc %xmm6, %xmm3
aesenc %xmm6, %xmm4
jb EECB_LAST_4
movdqa 160(%ecx), %xmm5
movdqa 176(%ecx), %xmm6
movdqa 192(%ecx), %xmm7
cmpl $14, %eax
aesenc %xmm5, %xmm1
aesenc %xmm5, %xmm2
aesenc %xmm5, %xmm3
aesenc %xmm5, %xmm4
aesenc %xmm6, %xmm1
aesenc %xmm6, %xmm2
aesenc %xmm6, %xmm3
aesenc %xmm6, %xmm4
jb EECB_LAST_4
movdqa 192(%ecx), %xmm5
movdqa 208(%ecx), %xmm6
movdqa 224(%ecx), %xmm7
aesenc %xmm5, %xmm1
aesenc %xmm5, %xmm2
aesenc %xmm5, %xmm3
aesenc %xmm5, %xmm4
aesenc %xmm6, %xmm1
aesenc %xmm6, %xmm2
aesenc %xmm6, %xmm3
aesenc %xmm6, %xmm4
EECB_LAST_4:
addl $64, %edi
addl $64, %esi
decl %edx
aesenclast %xmm7, %xmm1
aesenclast %xmm7, %xmm2
aesenclast %xmm7, %xmm3
aesenclast %xmm7, %xmm4
movdqu %xmm1, (%esi)
movdqu %xmm2, 16(%esi)
movdqu %xmm3, 32(%esi)
movdqu %xmm4, 48(%esi)
jne EECB_LOOP_4
addl $64, %esi
EECB_REMAINDER_4:
cmpl $0, %ebx
je EECB_END_4
EECB_LOOP_4_2:
movdqu (%edi), %xmm1
addl $16, %edi
pxor (%ecx), %xmm1
movdqu 160(%ecx), %xmm2
aesenc 16(%ecx), %xmm1
aesenc 32(%ecx), %xmm1
aesenc 48(%ecx), %xmm1
aesenc 64(%ecx), %xmm1
aesenc 80(%ecx), %xmm1
aesenc 96(%ecx), %xmm1
aesenc 112(%ecx), %xmm1
aesenc 128(%ecx), %xmm1
aesenc 144(%ecx), %xmm1
cmpl $12, %eax
jb EECB_LAST_4_2
movdqu 192(%ecx), %xmm2
aesenc 160(%ecx), %xmm1
aesenc 176(%ecx), %xmm1
cmpl $14, %eax
jb EECB_LAST_4_2
movdqu 224(%ecx), %xmm2
aesenc 192(%ecx), %xmm1
aesenc 208(%ecx), %xmm1
EECB_LAST_4_2:
aesenclast %xmm2, %xmm1
movdqu %xmm1, (%esi)
addl $16, %esi
decl %ebx
jne EECB_LOOP_4_2
EECB_END_4:
pop %ebx
pop %esi
pop %edi
ret
/*
AES_ECB_decrypt (const unsigned char *in,
unsigned char *out,
unsigned long length,
const unsigned char *KS,
int nr)
*/
#ifndef __APPLE__
.globl AES_ECB_decrypt
AES_ECB_decrypt:
#else
.globl _AES_ECB_decrypt
_AES_ECB_decrypt:
#endif
# parameter 1: stack[4] => %edi
# parameter 2: stack[8] => %esi
# parameter 3: stack[12] => %edx
# parameter 4: stack[16] => %ecx
# parameter 5: stack[20] => %eax
push %edi
push %esi
push %ebx
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %edx
movl 32(%esp), %ecx
movl 36(%esp), %eax
movl %edx, %ebx
shrl $4, %edx
shll $60, %ebx
je DECB_NO_PARTS_4
addl $1, %edx
DECB_NO_PARTS_4:
movl %edx, %ebx
shll $62, %ebx
shrl $62, %ebx
shrl $2, %edx
je DECB_REMAINDER_4
subl $64, %esi
DECB_LOOP_4:
movdqu (%edi), %xmm1
movdqu 16(%edi), %xmm2
movdqu 32(%edi), %xmm3
movdqu 48(%edi), %xmm4
movdqa (%ecx), %xmm5
movdqa 16(%ecx), %xmm6
movdqa 32(%ecx), %xmm7
movdqa 48(%ecx), %xmm0
pxor %xmm5, %xmm1
pxor %xmm5, %xmm2
pxor %xmm5, %xmm3
pxor %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
aesdec %xmm7, %xmm1
aesdec %xmm7, %xmm2
aesdec %xmm7, %xmm3
aesdec %xmm7, %xmm4
aesdec %xmm0, %xmm1
aesdec %xmm0, %xmm2
aesdec %xmm0, %xmm3
aesdec %xmm0, %xmm4
movdqa 64(%ecx), %xmm5
movdqa 80(%ecx), %xmm6
movdqa 96(%ecx), %xmm7
movdqa 112(%ecx), %xmm0
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
aesdec %xmm7, %xmm1
aesdec %xmm7, %xmm2
aesdec %xmm7, %xmm3
aesdec %xmm7, %xmm4
aesdec %xmm0, %xmm1
aesdec %xmm0, %xmm2
aesdec %xmm0, %xmm3
aesdec %xmm0, %xmm4
movdqa 128(%ecx), %xmm5
movdqa 144(%ecx), %xmm6
movdqa 160(%ecx), %xmm7
cmpl $12, %eax
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
jb DECB_LAST_4
movdqa 160(%ecx), %xmm5
movdqa 176(%ecx), %xmm6
movdqa 192(%ecx), %xmm7
cmpl $14, %eax
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
jb DECB_LAST_4
movdqa 192(%ecx), %xmm5
movdqa 208(%ecx), %xmm6
movdqa 224(%ecx), %xmm7
aesdec %xmm5, %xmm1
aesdec %xmm5, %xmm2
aesdec %xmm5, %xmm3
aesdec %xmm5, %xmm4
aesdec %xmm6, %xmm1
aesdec %xmm6, %xmm2
aesdec %xmm6, %xmm3
aesdec %xmm6, %xmm4
DECB_LAST_4:
addl $64, %edi
addl $64, %esi
decl %edx
aesdeclast %xmm7, %xmm1
aesdeclast %xmm7, %xmm2
aesdeclast %xmm7, %xmm3
aesdeclast %xmm7, %xmm4
movdqu %xmm1, (%esi)
movdqu %xmm2, 16(%esi)
movdqu %xmm3, 32(%esi)
movdqu %xmm4, 48(%esi)
jne DECB_LOOP_4
addl $64, %esi
DECB_REMAINDER_4:
cmpl $0, %ebx
je DECB_END_4
DECB_LOOP_4_2:
movdqu (%edi), %xmm1
addl $16, %edi
pxor (%ecx), %xmm1
movdqu 160(%ecx), %xmm2
cmpl $12, %eax
aesdec 16(%ecx), %xmm1
aesdec 32(%ecx), %xmm1
aesdec 48(%ecx), %xmm1
aesdec 64(%ecx), %xmm1
aesdec 80(%ecx), %xmm1
aesdec 96(%ecx), %xmm1
aesdec 112(%ecx), %xmm1
aesdec 128(%ecx), %xmm1
aesdec 144(%ecx), %xmm1
jb DECB_LAST_4_2
cmpl $14, %eax
movdqu 192(%ecx), %xmm2
aesdec 160(%ecx), %xmm1
aesdec 176(%ecx), %xmm1
jb DECB_LAST_4_2
movdqu 224(%ecx), %xmm2
aesdec 192(%ecx), %xmm1
aesdec 208(%ecx), %xmm1
DECB_LAST_4_2:
aesdeclast %xmm2, %xmm1
movdqu %xmm1, (%esi)
addl $16, %esi
decl %ebx
jne DECB_LOOP_4_2
DECB_END_4:
pop %ebx
pop %esi
pop %edi
ret
/*
void AES_128_Key_Expansion(const unsigned char* userkey,
unsigned char* key_schedule);
*/
.align 16,0x90
#ifndef __APPLE__
.globl AES_128_Key_Expansion
AES_128_Key_Expansion:
#else
.globl _AES_128_Key_Expansion
_AES_128_Key_Expansion:
#endif
# parameter 1: stack[4] => %eax
# parameter 2: stack[8] => %edx
movl 4(%esp), %eax
movl 8(%esp), %edx
movl $10, 240(%edx)
movdqu (%eax), %xmm1
movdqa %xmm1, (%edx)
ASSISTS:
aeskeygenassist $1, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 16(%edx)
aeskeygenassist $2, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 32(%edx)
aeskeygenassist $4, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 48(%edx)
aeskeygenassist $8, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 64(%edx)
aeskeygenassist $16, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 80(%edx)
aeskeygenassist $32, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 96(%edx)
aeskeygenassist $64, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 112(%edx)
aeskeygenassist $0x80, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 128(%edx)
aeskeygenassist $0x1b, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 144(%edx)
aeskeygenassist $0x36, %xmm1, %xmm2
call PREPARE_ROUNDKEY_128
movdqa %xmm1, 160(%edx)
ret
PREPARE_ROUNDKEY_128:
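# Round-key helper: %xmm2 holds the aeskeygenassist result and %xmm1 the
# previous round key. Broadcast the assist word, accumulate the prefix
# XOR of the previous key words via pslldq/pxor, then mix in the assist
# word to form the next round key.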
pshufd $255, %xmm2, %xmm2
movdqa %xmm1, %xmm3
pslldq $4, %xmm3
pxor %xmm3, %xmm1
pslldq $4, %xmm3
pxor %xmm3, %xmm1
pslldq $4, %xmm3
pxor %xmm3, %xmm1
pxor %xmm2, %xmm1
ret
/*
void AES_192_Key_Expansion (const unsigned char *userkey,
unsigned char *key)
*/
#ifndef __APPLE__
.globl AES_192_Key_Expansion
AES_192_Key_Expansion:
#else
.globl _AES_192_Key_Expansion
_AES_192_Key_Expansion:
#endif
# parameter 1: stack[4] => %eax
# parameter 2: stack[8] => %edx
movl 4(%esp), %eax
movl 8(%esp), %edx
movdqu (%eax), %xmm1
movq 16(%eax), %xmm3
movdqa %xmm1, (%edx)
movdqa %xmm3, %xmm5
aeskeygenassist $0x1, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
shufpd $0, %xmm1, %xmm5
movdqa %xmm5, 16(%edx)
movdqa %xmm1, %xmm6
shufpd $1, %xmm3, %xmm6
movdqa %xmm6, 32(%edx)
aeskeygenassist $0x2, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
movdqa %xmm1, 48(%edx)
movdqa %xmm3, %xmm5
aeskeygenassist $0x4, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
shufpd $0, %xmm1, %xmm5
movdqa %xmm5, 64(%edx)
movdqa %xmm1, %xmm6
shufpd $1, %xmm3, %xmm6
movdqa %xmm6, 80(%edx)
aeskeygenassist $0x8, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
movdqa %xmm1, 96(%edx)
movdqa %xmm3, %xmm5
aeskeygenassist $0x10, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
shufpd $0, %xmm1, %xmm5
movdqa %xmm5, 112(%edx)
movdqa %xmm1, %xmm6
shufpd $1, %xmm3, %xmm6
movdqa %xmm6, 128(%edx)
aeskeygenassist $0x20, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
movdqa %xmm1, 144(%edx)
movdqa %xmm3, %xmm5
aeskeygenassist $0x40, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
shufpd $0, %xmm1, %xmm5
movdqa %xmm5, 160(%edx)
movdqa %xmm1, %xmm6
shufpd $1, %xmm3, %xmm6
movdqa %xmm6, 176(%edx)
aeskeygenassist $0x80, %xmm3, %xmm2
call PREPARE_ROUNDKEY_192
movdqa %xmm1, 192(%edx)
movdqa %xmm3, 208(%edx)
ret
PREPARE_ROUNDKEY_192:
pshufd $0x55, %xmm2, %xmm2
movdqu %xmm1, %xmm4
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pxor %xmm2, %xmm1
pshufd $0xff, %xmm1, %xmm2
movdqu %xmm3, %xmm4
pslldq $4, %xmm4
pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret
/*
void AES_256_Key_Expansion (const unsigned char *userkey,
unsigned char *key)
*/
#ifndef __APPLE__
.globl AES_256_Key_Expansion
AES_256_Key_Expansion:
#else
.globl _AES_256_Key_Expansion
_AES_256_Key_Expansion:
#endif
# parameter 1: stack[4] => %eax
# parameter 2: stack[8] => %edx
movl 4(%esp), %eax
movl 8(%esp), %edx
movdqu (%eax), %xmm1
movdqu 16(%eax), %xmm3
movdqa %xmm1, (%edx)
movdqa %xmm3, 16(%edx)
aeskeygenassist $0x1, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 32(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 48(%edx)
aeskeygenassist $0x2, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 64(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 80(%edx)
aeskeygenassist $0x4, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 96(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 112(%edx)
aeskeygenassist $0x8, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 128(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 144(%edx)
aeskeygenassist $0x10, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 160(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 176(%edx)
aeskeygenassist $0x20, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 192(%edx)
aeskeygenassist $0x0, %xmm1, %xmm2
call MAKE_RK256_b
movdqa %xmm3, 208(%edx)
aeskeygenassist $0x40, %xmm3, %xmm2
call MAKE_RK256_a
movdqa %xmm1, 224(%edx)
ret
MAKE_RK256_a:
pshufd $0xff, %xmm2, %xmm2
movdqa %xmm1, %xmm4
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pslldq $4, %xmm4
pxor %xmm4, %xmm1
pxor %xmm2, %xmm1
ret
MAKE_RK256_b:
pshufd $0xaa, %xmm2, %xmm2
movdqa %xmm3, %xmm4
pslldq $4, %xmm4
pxor %xmm4, %xmm3
pslldq $4, %xmm4
pxor %xmm4, %xmm3
pslldq $4, %xmm4
pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
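
For reference, the C prototypes documented in the comment blocks of the new 32-bit routines above, collected in one place (a sketch; parameter names follow the assembly comments):

void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
                     unsigned char ivec[16], unsigned long length,
                     const unsigned char* KS, int nr);
void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
                         unsigned char ivec[16], unsigned long length,
                         const unsigned char* KS, int nr);
void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
                     unsigned long length, const unsigned char* KS, int nr);
void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
                     unsigned long length, const unsigned char* KS, int nr);
void AES_128_Key_Expansion(const unsigned char* userkey,
                           unsigned char* key_schedule);
void AES_192_Key_Expansion(const unsigned char* userkey, unsigned char* key);
void AES_256_Key_Expansion(const unsigned char* userkey, unsigned char* key);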


@@ -30,6 +30,7 @@
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifndef __APPLE__
.data
#else
@@ -15833,6 +15834,7 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
#endif /* __APPLE__ */
#endif /* WOLFSSL_AESGCM_STREAM */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits

File diff suppressed because it is too large.


@@ -30,6 +30,7 @@
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifndef __APPLE__
.text
.globl chacha_encrypt_x64
@@ -1430,6 +1431,7 @@ L_chacha20_avx2_end256:
.size chacha_encrypt_avx2,.-chacha_encrypt_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits


@@ -55,7 +55,7 @@ and Daniel J. Bernstein
#pragma warning(disable: 4127)
#endif
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#include <emmintrin.h>
#include <immintrin.h>
@@ -77,12 +77,13 @@ and Daniel J. Bernstein
#endif
#endif
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
static word32 intel_flags = 0;
static word32 cpu_flags_set = 0;
#endif
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
#if (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) || \
defined(POLY130564)
#if defined(_MSC_VER)
#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
@@ -122,7 +123,7 @@ static word32 cpu_flags_set = 0;
#endif
#endif
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#ifdef __cplusplus
extern "C" {
#endif
@@ -265,7 +266,7 @@ with a given ctx pointer to a Poly1305 structure.
static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
size_t bytes)
{
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
/* AVX2 is handled in wc_Poly1305Update. */
SAVE_VECTOR_REGISTERS(return _svr_ret;);
poly1305_blocks_avx(ctx, m, bytes);
@@ -399,7 +400,7 @@ number of bytes is less than the block size.
*/
static int poly1305_block(Poly1305* ctx, const unsigned char *m)
{
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
/* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
SAVE_VECTOR_REGISTERS(return _svr_ret;);
poly1305_block_avx(ctx, m);
@@ -414,7 +415,8 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m)
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
#if defined(POLY130564) && \
!(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP))
word64 t0,t1;
#endif
@@ -435,7 +437,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
if (keySz != 32 || ctx == NULL)
return BAD_FUNC_ARG;
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
if (!cpu_flags_set) {
intel_flags = cpuid_get_flags();
cpu_flags_set = 1;
@@ -502,7 +504,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#elif defined(POLY130564)
word64 h0,h1,h2,c;
@@ -521,7 +523,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
if (ctx == NULL || mac == NULL)
return BAD_FUNC_ARG;
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
SAVE_VECTOR_REGISTERS(return _svr_ret;);
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags))
@@ -707,7 +709,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
printf("\n");
#endif
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);


@@ -30,6 +30,7 @@
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
@@ -1107,6 +1108,7 @@ L_poly1305_avx2_final_cmp_copy:
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits


@@ -174,7 +174,7 @@ on the specific device platform.
#endif
#if defined(USE_INTEL_SPEEDUP)
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#if defined(__GNUC__) && ((__GNUC__ < 4) || \
(__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
#undef NO_AVX2_SUPPORT
@@ -194,7 +194,7 @@ on the specific device platform.
#else
#undef HAVE_INTEL_AVX1
#undef HAVE_INTEL_AVX2
#endif /* USE_INTEL_SPEEDUP */
#endif /* WOLFSSL_X86_64_BUILD && USE_INTEL_SPEEDUP */
#if defined(HAVE_INTEL_AVX2)
#define HAVE_INTEL_RORX
@@ -253,8 +253,8 @@ static int InitSha256(wc_Sha256* sha256)
/* Hardware Acceleration */
#if defined(USE_INTEL_SPEEDUP) && (defined(HAVE_INTEL_AVX1) || \
defined(HAVE_INTEL_AVX2))
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
/* in case intel instructions aren't available, plus we need the K[] global */
#define NEED_SOFT_SHA256
@@ -1072,7 +1072,8 @@ static int InitSha256(wc_Sha256* sha256)
if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && \
defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
@@ -1107,7 +1108,7 @@ static int InitSha256(wc_Sha256* sha256)
/* process blocks */
#ifdef XTRANSFORM_LEN
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (Transform_Sha256_Len_p != NULL)
#endif
@@ -1123,13 +1124,14 @@ static int InitSha256(wc_Sha256* sha256)
len -= blocksLen;
}
}
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
else
#endif
#endif /* XTRANSFORM_LEN */
#if !defined(XTRANSFORM_LEN) || (defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
#if !defined(XTRANSFORM_LEN) || \
(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
{
while (len >= WC_SHA256_BLOCK_SIZE) {
word32* local32 = sha256->buffer;
@@ -1137,7 +1139,8 @@ static int InitSha256(wc_Sha256* sha256)
/* Intel transform function requires use of sha256->buffer */
/* Little Endian requires byte swap, so can't use data directly */
#if defined(WC_HASH_DATA_ALIGNMENT) && !defined(LITTLE_ENDIAN_ORDER) && \
!(defined(USE_INTEL_SPEEDUP) && \
!(defined(WOLFSSL_X86_64_BUILD) && \
defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
if (((wc_ptr_t)data % WC_HASH_DATA_ALIGNMENT) == 0) {
local32 = (word32*)data;
@@ -1152,7 +1155,8 @@ static int InitSha256(wc_Sha256* sha256)
len -= WC_SHA256_BLOCK_SIZE;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && \
defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
@@ -1245,7 +1249,7 @@ static int InitSha256(wc_Sha256* sha256)
sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
@@ -1283,7 +1287,7 @@ static int InitSha256(wc_Sha256* sha256)
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
@@ -1297,10 +1301,11 @@ static int InitSha256(wc_Sha256* sha256)
XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
sizeof(word32));
#if defined(FREESCALE_MMCAU_SHA) || (defined(USE_INTEL_SPEEDUP) && \
#if defined(FREESCALE_MMCAU_SHA) || \
(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
/* Kinetis requires only these bytes reversed */
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
#endif
@@ -1532,7 +1537,7 @@ static int InitSha256(wc_Sha256* sha256)
sha224->loLen = 0;
sha224->hiLen = 0;
#if defined(USE_INTEL_SPEEDUP) && \
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
/* choose best Transform function under this runtime environment */
Sha256_SetTransform();


@@ -30,6 +30,7 @@
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.data
@@ -22655,6 +22656,7 @@ L_sha256_len_avx2_rorx_done:
.size Transform_Sha256_AVX2_RORX_Len,.-Transform_Sha256_AVX2_RORX_Len
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits


@@ -11321,6 +11321,33 @@ WOLFSSL_TEST_SUBROUTINE int aesgcm_test(void)
ERROR_OUT(-6394, out);
}
#endif /* HAVE_AES_DECRYPT */
#ifdef BENCH_AESGCM_LARGE
/* setup test buffer */
result = wc_AesGcmEncryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
if (result != 0)
ERROR_OUT(-6360, out);
result = wc_AesGcmEncryptUpdate(enc, large_output, large_input,
BENCH_AESGCM_LARGE, a, sizeof(a));
if (result != 0)
ERROR_OUT(-6361, out);
result = wc_AesGcmEncryptFinal(enc, resultT, sizeof(t1));
if (result != 0)
ERROR_OUT(-6362, out);
#ifdef HAVE_AES_DECRYPT
result = wc_AesGcmDecryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
if (result != 0)
ERROR_OUT(-6363, out);
result = wc_AesGcmDecryptUpdate(enc, large_outdec, large_output,
BENCH_AESGCM_LARGE, a, sizeof(a));
if (result != 0)
ERROR_OUT(-6364, out);
result = wc_AesGcmDecryptFinal(enc, resultT, sizeof(t1));
if (result != 0)
ERROR_OUT(-6365, out);
if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE))
ERROR_OUT(-6366, out);
#endif /* HAVE_AES_DECRYPT */
#endif /* BENCH_AESGCM_LARGE */
#endif /* WOLFSSL_AESGCM_STREAM */
#endif /* WOLFSSL_AES_256 */
#endif /* !WOLFSSL_AFALG_XILINX_AES && !WOLFSSL_XILINX_CRYPT */
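
The new BENCH_AESGCM_LARGE block above exercises the streaming AES-GCM API. A minimal sketch of the same call sequence in isolation (error handling condensed; assumes a build with WOLFSSL_AESGCM_STREAM enabled):

#include <wolfssl/wolfcrypt/aes.h>

/* One-shot helper built on the streaming API; returns 0 on success. */
static int gcm_stream_encrypt(const byte* key, word32 keySz,
                              const byte* iv, word32 ivSz,
                              const byte* aad, word32 aadSz,
                              const byte* msg, word32 msgSz,
                              byte* out, byte* tag, word32 tagSz)
{
    Aes aes;
    int ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
    if (ret == 0)
        ret = wc_AesGcmEncryptInit(&aes, key, keySz, iv, ivSz);
    if (ret == 0)
        ret = wc_AesGcmEncryptUpdate(&aes, out, msg, msgSz, aad, aadSz);
    if (ret == 0)
        ret = wc_AesGcmEncryptFinal(&aes, tag, tagSz);
    wc_AesFree(&aes);
    return ret;
}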


@@ -48,7 +48,7 @@
#define WC_HAS_GCC_4_4_64BIT
#endif
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \
defined(WC_HAS_GCC_4_4_64BIT))
#define POLY130564
@@ -67,7 +67,7 @@ enum {
/* Poly1305 state */
typedef struct Poly1305 {
#ifdef USE_INTEL_SPEEDUP
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
word64 r[3];
word64 h[3];
word64 pad[2];