mirror of https://github.com/wolfSSL/wolfssl.git
Merge pull request #5650 from SparkiDev/aes_x86_asm
commit
6f4af1581b
31
configure.ac
31
configure.ac
|
@ -576,6 +576,15 @@ then
|
|||
ENABLED_FASTMATH="yes"
|
||||
fi
|
||||
|
||||
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
|
||||
then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
|
||||
fi
|
||||
if test "$host_cpu" = "x86"
|
||||
then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_BUILD"
|
||||
fi
|
||||
|
||||
# if sp-math-all is not set, then enable fast math
|
||||
if test "x$ENABLED_FASTMATH" = "xyes" && test "$enable_sp_math_all" = "" && test "$enable_sp_math" = ""
|
||||
then
|
||||
|
@ -594,11 +603,6 @@ then
|
|||
ENABLED_HEAPMATH="no"
|
||||
ENABLED_SP_MATH_ALL="no"
|
||||
fi
|
||||
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
|
||||
then
|
||||
# Have settings.h set FP_MAX_BITS higher if user didn't set directly
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
|
||||
fi
|
||||
AS_IF([test "x$host_cpu" = "xaarch64"],[AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AARCH64_BUILD"])
|
||||
|
||||
if test "$ENABLED_SAKKE" = "yes" && test "$ENABLED_SAKKE_SMALL" != "yes"
|
||||
|
@ -2313,6 +2317,16 @@ then
|
|||
AM_CFLAGS="$AM_CFLAGS -DUSE_INTEL_SPEEDUP"
|
||||
ENABLED_AESNI=yes
|
||||
fi
|
||||
|
||||
if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
|
||||
then
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_64_BUILD"
|
||||
fi
|
||||
if test "$host_cpu" = "x86"
|
||||
then
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_BUILD"
|
||||
ENABLED_X86_ASM=yes
|
||||
fi
|
||||
fi
|
||||
|
||||
AC_ARG_ENABLE([aligndata],
|
||||
|
@ -6732,7 +6746,7 @@ if test "$ENABLED_SP_MATH_ALL" = "yes" && test "$ENABLED_ASM" != "no"; then
|
|||
|
||||
case $host_cpu in
|
||||
*x86_64* | *amd64*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64 -DWOLFSSL_X86_64_BUILD"
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64"
|
||||
;;
|
||||
*x86*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86"
|
||||
|
@ -6817,10 +6831,6 @@ if test "$ENABLED_SP_ASM" = "yes" && test "$ENABLED_SP" = "yes"; then
|
|||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_X86_64_ASM"
|
||||
ENABLED_SP_X86_64_ASM=yes
|
||||
if test "x$ENABLED_FASTMATH" = "xno"
|
||||
then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
|
||||
|
@ -8097,6 +8107,7 @@ AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"])
|
|||
AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_X86_ASM],[test "x$ENABLED_X86_ASM" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_AFALG],[test "x$ENABLED_AFALG" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_KCAPI],[test "x$ENABLED_KCAPI" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_DEVCRYPTO],[test "x$ENABLED_DEVCRYPTO" = "xyes"])
|
||||
|
|
|
@ -118,8 +118,12 @@ endif
|
|||
|
||||
if BUILD_AESNI
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
|
||||
if BUILD_X86_ASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
|
||||
else
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
|
||||
endif
|
||||
endif
|
||||
|
||||
if BUILD_DES3
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/des3.c
|
||||
|
@ -210,8 +214,12 @@ endif
|
|||
|
||||
if BUILD_AESNI
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
|
||||
if BUILD_X86_ASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
|
||||
else
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
|
||||
endif
|
||||
endif
|
||||
|
||||
if BUILD_SHA
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha.c
|
||||
|
@ -552,9 +560,13 @@ endif
|
|||
if !BUILD_FIPS_CURRENT
|
||||
if BUILD_AESNI
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
|
||||
if BUILD_X86_ASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
|
||||
else
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
if BUILD_CAMELLIA
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/camellia.c
|
||||
|
|
|
@ -738,7 +738,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
|
|||
XASM_LINK("AES_CBC_encrypt");
|
||||
|
||||
#ifdef HAVE_AES_DECRYPT
|
||||
#if defined(WOLFSSL_AESNI_BY4)
|
||||
#if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
|
||||
void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
|
||||
unsigned char* ivec, unsigned long length,
|
||||
const unsigned char* KS, int nr)
|
||||
|
@ -4191,7 +4191,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv)
|
|||
/* if input and output same will overwrite input iv */
|
||||
XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
#if defined(WOLFSSL_AESNI_BY4)
|
||||
#if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
|
||||
AES_CBC_decrypt_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
|
||||
aes->rounds);
|
||||
#elif defined(WOLFSSL_AESNI_BY6)
|
||||
|
@ -7867,7 +7867,7 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
|||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
#if defined(HAVE_INTEL_AVX1)
|
||||
if (IS_INTEL_AVX1(intel_flags)) {
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
|
||||
|
@ -8414,7 +8414,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
|
|||
}
|
||||
else
|
||||
#endif
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
#if defined(HAVE_INTEL_AVX1)
|
||||
if (IS_INTEL_AVX1(intel_flags)) {
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
|
||||
|
@ -9035,7 +9035,7 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Assembly code implementations in: aes_gcm_asm.S */
|
||||
/* Assembly code implementations in: aes_gcm_asm.S and aes_gcm_x86_asm.S */
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr,
|
||||
unsigned char* out, const unsigned char* in, unsigned int nbytes,
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
* by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
|
||||
*/
|
||||
|
||||
#ifdef WOLFSSL_X86_64_BUILD
|
||||
|
||||
/*
|
||||
AES_CBC_encrypt (const unsigned char *in,
|
||||
|
@ -1333,6 +1334,893 @@ pxor %xmm4, %xmm3
|
|||
pxor %xmm2, %xmm3
|
||||
ret
|
||||
|
||||
#elif defined WOLFSSL_X86_BUILD
|
||||
|
||||
/*
|
||||
AES_CBC_encrypt (const unsigned char *in,
|
||||
unsigned char *out,
|
||||
unsigned char ivec[16],
|
||||
unsigned long length,
|
||||
const unsigned char *KS,
|
||||
int nr)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_CBC_encrypt
|
||||
AES_CBC_encrypt:
|
||||
#else
|
||||
.globl _AES_CBC_encrypt
|
||||
_AES_CBC_encrypt:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %edi
|
||||
# parameter 2: stack[8] => %esi
|
||||
# parameter 3: stack[12] => %edx
|
||||
# parameter 4: stack[16] => %ecx
|
||||
# parameter 5: stack[20] => %eax
|
||||
# parameter 6: stack[24] => %ebx
|
||||
push %edi
|
||||
push %esi
|
||||
push %ebx
|
||||
push %ebp
|
||||
movl 20(%esp), %edi
|
||||
movl 24(%esp), %esi
|
||||
movl 28(%esp), %edx
|
||||
movl 32(%esp), %ecx
|
||||
movl 36(%esp), %eax
|
||||
movl 40(%esp), %ebx
|
||||
|
||||
movl %ecx, %ebp
|
||||
shrl $4, %ecx
|
||||
shll $60, %ebp
|
||||
je NO_PARTS
|
||||
addl $1, %ecx
|
||||
NO_PARTS:
|
||||
subl $16, %esi
|
||||
movdqa (%edx), %xmm1
|
||||
LOOP:
|
||||
pxor (%edi), %xmm1
|
||||
pxor (%eax), %xmm1
|
||||
addl $16,%esi
|
||||
addl $16,%edi
|
||||
cmpl $12, %ebx
|
||||
aesenc 16(%eax),%xmm1
|
||||
aesenc 32(%eax),%xmm1
|
||||
aesenc 48(%eax),%xmm1
|
||||
aesenc 64(%eax),%xmm1
|
||||
aesenc 80(%eax),%xmm1
|
||||
aesenc 96(%eax),%xmm1
|
||||
aesenc 112(%eax),%xmm1
|
||||
aesenc 128(%eax),%xmm1
|
||||
aesenc 144(%eax),%xmm1
|
||||
movdqa 160(%eax),%xmm2
|
||||
jb LAST
|
||||
cmpl $14, %ebx
|
||||
|
||||
aesenc 160(%eax),%xmm1
|
||||
aesenc 176(%eax),%xmm1
|
||||
movdqa 192(%eax),%xmm2
|
||||
jb LAST
|
||||
aesenc 192(%eax),%xmm1
|
||||
aesenc 208(%eax),%xmm1
|
||||
movdqa 224(%eax),%xmm2
|
||||
LAST:
|
||||
decl %ecx
|
||||
aesenclast %xmm2,%xmm1
|
||||
movdqu %xmm1,(%esi)
|
||||
jne LOOP
|
||||
|
||||
pop %ebp
|
||||
pop %ebx
|
||||
pop %esi
|
||||
pop %edi
|
||||
ret
|
||||
|
||||
|
||||
/*
|
||||
AES_CBC_decrypt_by4 (const unsigned char *in,
|
||||
unsigned char *out,
|
||||
unsigned char ivec[16],
|
||||
unsigned long length,
|
||||
const unsigned char *KS,
|
||||
int nr)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_CBC_decrypt_by4
|
||||
AES_CBC_decrypt_by4:
|
||||
#else
|
||||
.globl _AES_CBC_decrypt_by4
|
||||
_AES_CBC_decrypt_by4:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %edi
|
||||
# parameter 2: stack[8] => %esi
|
||||
# parameter 3: stack[12] => %edx
|
||||
# parameter 4: stack[16] => %ecx
|
||||
# parameter 5: stack[20] => %eax
|
||||
# parameter 6: stack[24] => %ebx
|
||||
push %edi
|
||||
push %esi
|
||||
push %ebx
|
||||
push %ebp
|
||||
movl 20(%esp), %edi
|
||||
movl 24(%esp), %esi
|
||||
movl 28(%esp), %edx
|
||||
movl 32(%esp), %ecx
|
||||
movl 36(%esp), %eax
|
||||
movl 40(%esp), %ebx
|
||||
subl $16, %esp
|
||||
|
||||
movdqu (%edx), %xmm0
|
||||
movl %ecx, %ebp
|
||||
shrl $4, %ecx
|
||||
shll $60, %ebp
|
||||
movdqu %xmm0, (%esp)
|
||||
je DNO_PARTS_4
|
||||
addl $1, %ecx
|
||||
DNO_PARTS_4:
|
||||
movl %ecx, %ebp
|
||||
shll $62, %ebp
|
||||
shrl $62, %ebp
|
||||
shrl $2, %ecx
|
||||
je DREMAINDER_4
|
||||
subl $64, %esi
|
||||
DLOOP_4:
|
||||
movdqu (%edi), %xmm1
|
||||
movdqu 16(%edi), %xmm2
|
||||
movdqu 32(%edi), %xmm3
|
||||
movdqu 48(%edi), %xmm4
|
||||
movdqa (%eax), %xmm5
|
||||
movdqa 16(%eax), %xmm6
|
||||
movdqa 32(%eax), %xmm7
|
||||
movdqa 48(%eax), %xmm0
|
||||
pxor %xmm5, %xmm1
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm5, %xmm3
|
||||
pxor %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
aesdec %xmm7, %xmm1
|
||||
aesdec %xmm7, %xmm2
|
||||
aesdec %xmm7, %xmm3
|
||||
aesdec %xmm7, %xmm4
|
||||
aesdec %xmm0, %xmm1
|
||||
aesdec %xmm0, %xmm2
|
||||
aesdec %xmm0, %xmm3
|
||||
aesdec %xmm0, %xmm4
|
||||
movdqa 64(%eax), %xmm5
|
||||
movdqa 80(%eax), %xmm6
|
||||
movdqa 96(%eax), %xmm7
|
||||
movdqa 112(%eax), %xmm0
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
aesdec %xmm7, %xmm1
|
||||
aesdec %xmm7, %xmm2
|
||||
aesdec %xmm7, %xmm3
|
||||
aesdec %xmm7, %xmm4
|
||||
aesdec %xmm0, %xmm1
|
||||
aesdec %xmm0, %xmm2
|
||||
aesdec %xmm0, %xmm3
|
||||
aesdec %xmm0, %xmm4
|
||||
movdqa 128(%eax), %xmm5
|
||||
movdqa 144(%eax), %xmm6
|
||||
movdqa 160(%eax), %xmm7
|
||||
cmpl $12, %ebx
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
jb DLAST_4
|
||||
movdqa 160(%eax), %xmm5
|
||||
movdqa 176(%eax), %xmm6
|
||||
movdqa 192(%eax), %xmm7
|
||||
cmpl $14, %ebx
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
jb DLAST_4
|
||||
movdqa 192(%eax), %xmm5
|
||||
movdqa 208(%eax), %xmm6
|
||||
movdqa 224(%eax), %xmm7
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
DLAST_4:
|
||||
addl $64, %esi
|
||||
aesdeclast %xmm7, %xmm1
|
||||
aesdeclast %xmm7, %xmm2
|
||||
aesdeclast %xmm7, %xmm3
|
||||
aesdeclast %xmm7, %xmm4
|
||||
movdqu (%esp), %xmm0
|
||||
movdqu (%edi), %xmm5
|
||||
movdqu 16(%edi), %xmm6
|
||||
movdqu 32(%edi), %xmm7
|
||||
pxor %xmm0, %xmm1
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm6, %xmm3
|
||||
pxor %xmm7, %xmm4
|
||||
movdqu 48(%edi), %xmm0
|
||||
movdqu %xmm1, (%esi)
|
||||
movdqu %xmm2, 16(%esi)
|
||||
movdqu %xmm3, 32(%esi)
|
||||
movdqu %xmm4, 48(%esi)
|
||||
movdqu %xmm0, (%esp)
|
||||
addl $64, %edi
|
||||
decl %ecx
|
||||
jne DLOOP_4
|
||||
addl $64, %esi
|
||||
DREMAINDER_4:
|
||||
cmpl $0, %ebp
|
||||
je DEND_4
|
||||
DLOOP_4_2:
|
||||
movdqu (%edi), %xmm1
|
||||
movdqa %xmm1, %xmm5
|
||||
addl $16, %edi
|
||||
pxor (%eax), %xmm1
|
||||
movdqu 160(%eax), %xmm2
|
||||
cmpl $12, %ebx
|
||||
aesdec 16(%eax), %xmm1
|
||||
aesdec 32(%eax), %xmm1
|
||||
aesdec 48(%eax), %xmm1
|
||||
aesdec 64(%eax), %xmm1
|
||||
aesdec 80(%eax), %xmm1
|
||||
aesdec 96(%eax), %xmm1
|
||||
aesdec 112(%eax), %xmm1
|
||||
aesdec 128(%eax), %xmm1
|
||||
aesdec 144(%eax), %xmm1
|
||||
jb DLAST_4_2
|
||||
movdqu 192(%eax), %xmm2
|
||||
cmpl $14, %ebx
|
||||
aesdec 160(%eax), %xmm1
|
||||
aesdec 176(%eax), %xmm1
|
||||
jb DLAST_4_2
|
||||
movdqu 224(%eax), %xmm2
|
||||
aesdec 192(%eax), %xmm1
|
||||
aesdec 208(%eax), %xmm1
|
||||
DLAST_4_2:
|
||||
aesdeclast %xmm2, %xmm1
|
||||
pxor %xmm0, %xmm1
|
||||
movdqa %xmm5, %xmm0
|
||||
movdqu %xmm1, (%esi)
|
||||
addl $16, %esi
|
||||
decl %ebp
|
||||
jne DLOOP_4_2
|
||||
DEND_4:
|
||||
|
||||
addl $16, %esp
|
||||
pop %ebp
|
||||
pop %ebx
|
||||
pop %esi
|
||||
pop %edi
|
||||
ret
|
||||
|
||||
/*
|
||||
AES_ECB_encrypt (const unsigned char *in,
|
||||
unsigned char *out,
|
||||
unsigned long length,
|
||||
const unsigned char *KS,
|
||||
int nr)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_ECB_encrypt
|
||||
AES_ECB_encrypt:
|
||||
#else
|
||||
.globl _AES_ECB_encrypt
|
||||
_AES_ECB_encrypt:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %edi
|
||||
# parameter 2: stack[8] => %esi
|
||||
# parameter 3: stack[12] => %edx
|
||||
# parameter 4: stack[16] => %ecx
|
||||
# parameter 5: stack[20] => %eax
|
||||
push %edi
|
||||
push %esi
|
||||
push %ebx
|
||||
movl 16(%esp), %edi
|
||||
movl 20(%esp), %esi
|
||||
movl 24(%esp), %edx
|
||||
movl 28(%esp), %ecx
|
||||
movl 32(%esp), %eax
|
||||
|
||||
movl %edx, %ebx
|
||||
shrl $4, %edx
|
||||
shll $60, %ebx
|
||||
je EECB_NO_PARTS_4
|
||||
addl $1, %edx
|
||||
EECB_NO_PARTS_4:
|
||||
movl %edx, %ebx
|
||||
shll $62, %ebx
|
||||
shrl $62, %ebx
|
||||
shrl $2, %edx
|
||||
je EECB_REMAINDER_4
|
||||
subl $64, %esi
|
||||
EECB_LOOP_4:
|
||||
movdqu (%edi), %xmm1
|
||||
movdqu 16(%edi), %xmm2
|
||||
movdqu 32(%edi), %xmm3
|
||||
movdqu 48(%edi), %xmm4
|
||||
movdqa (%ecx), %xmm5
|
||||
movdqa 16(%ecx), %xmm6
|
||||
movdqa 32(%ecx), %xmm7
|
||||
movdqa 48(%ecx), %xmm0
|
||||
pxor %xmm5, %xmm1
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm5, %xmm3
|
||||
pxor %xmm5, %xmm4
|
||||
aesenc %xmm6, %xmm1
|
||||
aesenc %xmm6, %xmm2
|
||||
aesenc %xmm6, %xmm3
|
||||
aesenc %xmm6, %xmm4
|
||||
aesenc %xmm7, %xmm1
|
||||
aesenc %xmm7, %xmm2
|
||||
aesenc %xmm7, %xmm3
|
||||
aesenc %xmm7, %xmm4
|
||||
aesenc %xmm0, %xmm1
|
||||
aesenc %xmm0, %xmm2
|
||||
aesenc %xmm0, %xmm3
|
||||
aesenc %xmm0, %xmm4
|
||||
movdqa 64(%ecx), %xmm5
|
||||
movdqa 80(%ecx), %xmm6
|
||||
movdqa 96(%ecx), %xmm7
|
||||
movdqa 112(%ecx), %xmm0
|
||||
aesenc %xmm5, %xmm1
|
||||
aesenc %xmm5, %xmm2
|
||||
aesenc %xmm5, %xmm3
|
||||
aesenc %xmm5, %xmm4
|
||||
aesenc %xmm6, %xmm1
|
||||
aesenc %xmm6, %xmm2
|
||||
aesenc %xmm6, %xmm3
|
||||
aesenc %xmm6, %xmm4
|
||||
aesenc %xmm7, %xmm1
|
||||
aesenc %xmm7, %xmm2
|
||||
aesenc %xmm7, %xmm3
|
||||
aesenc %xmm7, %xmm4
|
||||
aesenc %xmm0, %xmm1
|
||||
aesenc %xmm0, %xmm2
|
||||
aesenc %xmm0, %xmm3
|
||||
aesenc %xmm0, %xmm4
|
||||
movdqa 128(%ecx), %xmm5
|
||||
movdqa 144(%ecx), %xmm6
|
||||
movdqa 160(%ecx), %xmm7
|
||||
cmpl $12, %eax
|
||||
aesenc %xmm5, %xmm1
|
||||
aesenc %xmm5, %xmm2
|
||||
aesenc %xmm5, %xmm3
|
||||
aesenc %xmm5, %xmm4
|
||||
aesenc %xmm6, %xmm1
|
||||
aesenc %xmm6, %xmm2
|
||||
aesenc %xmm6, %xmm3
|
||||
aesenc %xmm6, %xmm4
|
||||
jb EECB_LAST_4
|
||||
movdqa 160(%ecx), %xmm5
|
||||
movdqa 176(%ecx), %xmm6
|
||||
movdqa 192(%ecx), %xmm7
|
||||
cmpl $14, %eax
|
||||
aesenc %xmm5, %xmm1
|
||||
aesenc %xmm5, %xmm2
|
||||
aesenc %xmm5, %xmm3
|
||||
aesenc %xmm5, %xmm4
|
||||
aesenc %xmm6, %xmm1
|
||||
aesenc %xmm6, %xmm2
|
||||
aesenc %xmm6, %xmm3
|
||||
aesenc %xmm6, %xmm4
|
||||
jb EECB_LAST_4
|
||||
movdqa 192(%ecx), %xmm5
|
||||
movdqa 208(%ecx), %xmm6
|
||||
movdqa 224(%ecx), %xmm7
|
||||
aesenc %xmm5, %xmm1
|
||||
aesenc %xmm5, %xmm2
|
||||
aesenc %xmm5, %xmm3
|
||||
aesenc %xmm5, %xmm4
|
||||
aesenc %xmm6, %xmm1
|
||||
aesenc %xmm6, %xmm2
|
||||
aesenc %xmm6, %xmm3
|
||||
aesenc %xmm6, %xmm4
|
||||
EECB_LAST_4:
|
||||
addl $64, %edi
|
||||
addl $64, %esi
|
||||
decl %edx
|
||||
aesenclast %xmm7, %xmm1
|
||||
aesenclast %xmm7, %xmm2
|
||||
aesenclast %xmm7, %xmm3
|
||||
aesenclast %xmm7, %xmm4
|
||||
movdqu %xmm1, (%esi)
|
||||
movdqu %xmm2, 16(%esi)
|
||||
movdqu %xmm3, 32(%esi)
|
||||
movdqu %xmm4, 48(%esi)
|
||||
jne EECB_LOOP_4
|
||||
addl $64, %esi
|
||||
EECB_REMAINDER_4:
|
||||
cmpl $0, %ebx
|
||||
je EECB_END_4
|
||||
EECB_LOOP_4_2:
|
||||
movdqu (%edi), %xmm1
|
||||
addl $16, %edi
|
||||
pxor (%ecx), %xmm1
|
||||
movdqu 160(%ecx), %xmm2
|
||||
aesenc 16(%ecx), %xmm1
|
||||
aesenc 32(%ecx), %xmm1
|
||||
aesenc 48(%ecx), %xmm1
|
||||
aesenc 64(%ecx), %xmm1
|
||||
aesenc 80(%ecx), %xmm1
|
||||
aesenc 96(%ecx), %xmm1
|
||||
aesenc 112(%ecx), %xmm1
|
||||
aesenc 128(%ecx), %xmm1
|
||||
aesenc 144(%ecx), %xmm1
|
||||
cmpl $12, %eax
|
||||
jb EECB_LAST_4_2
|
||||
movdqu 192(%ecx), %xmm2
|
||||
aesenc 160(%ecx), %xmm1
|
||||
aesenc 176(%ecx), %xmm1
|
||||
cmpl $14, %eax
|
||||
jb EECB_LAST_4_2
|
||||
movdqu 224(%ecx), %xmm2
|
||||
aesenc 192(%ecx), %xmm1
|
||||
aesenc 208(%ecx), %xmm1
|
||||
EECB_LAST_4_2:
|
||||
aesenclast %xmm2, %xmm1
|
||||
movdqu %xmm1, (%esi)
|
||||
addl $16, %esi
|
||||
decl %ebx
|
||||
jne EECB_LOOP_4_2
|
||||
EECB_END_4:
|
||||
|
||||
pop %ebx
|
||||
pop %esi
|
||||
pop %edi
|
||||
ret
|
||||
|
||||
|
||||
/*
|
||||
AES_ECB_decrypt (const unsigned char *in,
|
||||
unsigned char *out,
|
||||
unsigned long length,
|
||||
const unsigned char *KS,
|
||||
int nr)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_ECB_decrypt
|
||||
AES_ECB_decrypt:
|
||||
#else
|
||||
.globl _AES_ECB_decrypt
|
||||
_AES_ECB_decrypt:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %edi
|
||||
# parameter 2: stack[8] => %esi
|
||||
# parameter 3: stack[12] => %edx
|
||||
# parameter 4: stack[16] => %ecx
|
||||
# parameter 5: stack[20] => %eax
|
||||
push %edi
|
||||
push %esi
|
||||
push %ebx
|
||||
movl 20(%esp), %edi
|
||||
movl 24(%esp), %esi
|
||||
movl 28(%esp), %edx
|
||||
movl 32(%esp), %ecx
|
||||
movl 36(%esp), %eax
|
||||
|
||||
|
||||
movl %edx, %ebx
|
||||
shrl $4, %edx
|
||||
shll $60, %ebx
|
||||
je DECB_NO_PARTS_4
|
||||
addl $1, %edx
|
||||
DECB_NO_PARTS_4:
|
||||
movl %edx, %ebx
|
||||
shll $62, %ebx
|
||||
shrl $62, %ebx
|
||||
shrl $2, %edx
|
||||
je DECB_REMAINDER_4
|
||||
subl $64, %esi
|
||||
DECB_LOOP_4:
|
||||
movdqu (%edi), %xmm1
|
||||
movdqu 16(%edi), %xmm2
|
||||
movdqu 32(%edi), %xmm3
|
||||
movdqu 48(%edi), %xmm4
|
||||
movdqa (%ecx), %xmm5
|
||||
movdqa 16(%ecx), %xmm6
|
||||
movdqa 32(%ecx), %xmm7
|
||||
movdqa 48(%ecx), %xmm0
|
||||
pxor %xmm5, %xmm1
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm5, %xmm3
|
||||
pxor %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
aesdec %xmm7, %xmm1
|
||||
aesdec %xmm7, %xmm2
|
||||
aesdec %xmm7, %xmm3
|
||||
aesdec %xmm7, %xmm4
|
||||
aesdec %xmm0, %xmm1
|
||||
aesdec %xmm0, %xmm2
|
||||
aesdec %xmm0, %xmm3
|
||||
aesdec %xmm0, %xmm4
|
||||
movdqa 64(%ecx), %xmm5
|
||||
movdqa 80(%ecx), %xmm6
|
||||
movdqa 96(%ecx), %xmm7
|
||||
movdqa 112(%ecx), %xmm0
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
aesdec %xmm7, %xmm1
|
||||
aesdec %xmm7, %xmm2
|
||||
aesdec %xmm7, %xmm3
|
||||
aesdec %xmm7, %xmm4
|
||||
aesdec %xmm0, %xmm1
|
||||
aesdec %xmm0, %xmm2
|
||||
aesdec %xmm0, %xmm3
|
||||
aesdec %xmm0, %xmm4
|
||||
movdqa 128(%ecx), %xmm5
|
||||
movdqa 144(%ecx), %xmm6
|
||||
movdqa 160(%ecx), %xmm7
|
||||
cmpl $12, %eax
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
jb DECB_LAST_4
|
||||
movdqa 160(%ecx), %xmm5
|
||||
movdqa 176(%ecx), %xmm6
|
||||
movdqa 192(%ecx), %xmm7
|
||||
cmpl $14, %eax
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
jb DECB_LAST_4
|
||||
movdqa 192(%ecx), %xmm5
|
||||
movdqa 208(%ecx), %xmm6
|
||||
movdqa 224(%ecx), %xmm7
|
||||
aesdec %xmm5, %xmm1
|
||||
aesdec %xmm5, %xmm2
|
||||
aesdec %xmm5, %xmm3
|
||||
aesdec %xmm5, %xmm4
|
||||
aesdec %xmm6, %xmm1
|
||||
aesdec %xmm6, %xmm2
|
||||
aesdec %xmm6, %xmm3
|
||||
aesdec %xmm6, %xmm4
|
||||
DECB_LAST_4:
|
||||
addl $64, %edi
|
||||
addl $64, %esi
|
||||
decl %edx
|
||||
aesdeclast %xmm7, %xmm1
|
||||
aesdeclast %xmm7, %xmm2
|
||||
aesdeclast %xmm7, %xmm3
|
||||
aesdeclast %xmm7, %xmm4
|
||||
movdqu %xmm1, (%esi)
|
||||
movdqu %xmm2, 16(%esi)
|
||||
movdqu %xmm3, 32(%esi)
|
||||
movdqu %xmm4, 48(%esi)
|
||||
jne DECB_LOOP_4
|
||||
addl $64, %esi
|
||||
DECB_REMAINDER_4:
|
||||
cmpl $0, %ebx
|
||||
je DECB_END_4
|
||||
DECB_LOOP_4_2:
|
||||
movdqu (%edi), %xmm1
|
||||
addl $16, %edi
|
||||
pxor (%ecx), %xmm1
|
||||
movdqu 160(%ecx), %xmm2
|
||||
cmpl $12, %eax
|
||||
aesdec 16(%ecx), %xmm1
|
||||
aesdec 32(%ecx), %xmm1
|
||||
aesdec 48(%ecx), %xmm1
|
||||
aesdec 64(%ecx), %xmm1
|
||||
aesdec 80(%ecx), %xmm1
|
||||
aesdec 96(%ecx), %xmm1
|
||||
aesdec 112(%ecx), %xmm1
|
||||
aesdec 128(%ecx), %xmm1
|
||||
aesdec 144(%ecx), %xmm1
|
||||
jb DECB_LAST_4_2
|
||||
cmpl $14, %eax
|
||||
movdqu 192(%ecx), %xmm2
|
||||
aesdec 160(%ecx), %xmm1
|
||||
aesdec 176(%ecx), %xmm1
|
||||
jb DECB_LAST_4_2
|
||||
movdqu 224(%ecx), %xmm2
|
||||
aesdec 192(%ecx), %xmm1
|
||||
aesdec 208(%ecx), %xmm1
|
||||
DECB_LAST_4_2:
|
||||
aesdeclast %xmm2, %xmm1
|
||||
movdqu %xmm1, (%esi)
|
||||
addl $16, %esi
|
||||
decl %ebx
|
||||
jne DECB_LOOP_4_2
|
||||
DECB_END_4:
|
||||
pop %ebx
|
||||
pop %esi
|
||||
pop %edi
|
||||
ret
|
||||
|
||||
|
||||
|
||||
/*
|
||||
void AES_128_Key_Expansion(const unsigned char* userkey,
|
||||
unsigned char* key_schedule);
|
||||
*/
|
||||
.align 16,0x90
|
||||
#ifndef __APPLE__
|
||||
.globl AES_128_Key_Expansion
|
||||
AES_128_Key_Expansion:
|
||||
#else
|
||||
.globl _AES_128_Key_Expansion
|
||||
_AES_128_Key_Expansion:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %eax
|
||||
# parameter 2: stack[8] => %edx
|
||||
movl 4(%esp), %eax
|
||||
movl 8(%esp), %edx
|
||||
|
||||
movl $10, 240(%edx)
|
||||
|
||||
movdqu (%eax), %xmm1
|
||||
movdqa %xmm1, (%edx)
|
||||
|
||||
|
||||
ASSISTS:
|
||||
aeskeygenassist $1, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 16(%edx)
|
||||
aeskeygenassist $2, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 32(%edx)
|
||||
aeskeygenassist $4, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 48(%edx)
|
||||
aeskeygenassist $8, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 64(%edx)
|
||||
aeskeygenassist $16, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 80(%edx)
|
||||
aeskeygenassist $32, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 96(%edx)
|
||||
aeskeygenassist $64, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 112(%edx)
|
||||
aeskeygenassist $0x80, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 128(%edx)
|
||||
aeskeygenassist $0x1b, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 144(%edx)
|
||||
aeskeygenassist $0x36, %xmm1, %xmm2
|
||||
call PREPARE_ROUNDKEY_128
|
||||
movdqa %xmm1, 160(%edx)
|
||||
ret
|
||||
|
||||
PREPARE_ROUNDKEY_128:
|
||||
pshufd $255, %xmm2, %xmm2
|
||||
movdqa %xmm1, %xmm3
|
||||
pslldq $4, %xmm3
|
||||
pxor %xmm3, %xmm1
|
||||
pslldq $4, %xmm3
|
||||
pxor %xmm3, %xmm1
|
||||
pslldq $4, %xmm3
|
||||
pxor %xmm3, %xmm1
|
||||
pxor %xmm2, %xmm1
|
||||
ret
|
||||
|
||||
|
||||
/*
|
||||
void AES_192_Key_Expansion (const unsigned char *userkey,
|
||||
unsigned char *key)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_192_Key_Expansion
|
||||
AES_192_Key_Expansion:
|
||||
#else
|
||||
.globl _AES_192_Key_Expansion
|
||||
_AES_192_Key_Expansion:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %eax
|
||||
# parameter 2: stack[8] => %edx
|
||||
movl 4(%esp), %eax
|
||||
movl 8(%esp), %edx
|
||||
|
||||
movdqu (%eax), %xmm1
|
||||
movq 16(%eax), %xmm3
|
||||
movdqa %xmm1, (%edx)
|
||||
movdqa %xmm3, %xmm5
|
||||
|
||||
aeskeygenassist $0x1, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
shufpd $0, %xmm1, %xmm5
|
||||
movdqa %xmm5, 16(%edx)
|
||||
movdqa %xmm1, %xmm6
|
||||
shufpd $1, %xmm3, %xmm6
|
||||
movdqa %xmm6, 32(%edx)
|
||||
|
||||
aeskeygenassist $0x2, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
movdqa %xmm1, 48(%edx)
|
||||
movdqa %xmm3, %xmm5
|
||||
|
||||
aeskeygenassist $0x4, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
shufpd $0, %xmm1, %xmm5
|
||||
movdqa %xmm5, 64(%edx)
|
||||
movdqa %xmm1, %xmm6
|
||||
shufpd $1, %xmm3, %xmm6
|
||||
movdqa %xmm6, 80(%edx)
|
||||
|
||||
aeskeygenassist $0x8, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
movdqa %xmm1, 96(%edx)
|
||||
movdqa %xmm3, %xmm5
|
||||
|
||||
aeskeygenassist $0x10, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
shufpd $0, %xmm1, %xmm5
|
||||
movdqa %xmm5, 112(%edx)
|
||||
movdqa %xmm1, %xmm6
|
||||
shufpd $1, %xmm3, %xmm6
|
||||
movdqa %xmm6, 128(%edx)
|
||||
|
||||
aeskeygenassist $0x20, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
movdqa %xmm1, 144(%edx)
|
||||
movdqa %xmm3, %xmm5
|
||||
|
||||
aeskeygenassist $0x40, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
shufpd $0, %xmm1, %xmm5
|
||||
movdqa %xmm5, 160(%edx)
|
||||
movdqa %xmm1, %xmm6
|
||||
shufpd $1, %xmm3, %xmm6
|
||||
movdqa %xmm6, 176(%edx)
|
||||
|
||||
aeskeygenassist $0x80, %xmm3, %xmm2
|
||||
call PREPARE_ROUNDKEY_192
|
||||
movdqa %xmm1, 192(%edx)
|
||||
movdqa %xmm3, 208(%edx)
|
||||
ret
|
||||
|
||||
PREPARE_ROUNDKEY_192:
|
||||
pshufd $0x55, %xmm2, %xmm2
|
||||
movdqu %xmm1, %xmm4
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
pxor %xmm2, %xmm1
|
||||
pshufd $0xff, %xmm1, %xmm2
|
||||
movdqu %xmm3, %xmm4
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm3
|
||||
pxor %xmm2, %xmm3
|
||||
ret
|
||||
|
||||
|
||||
/*
|
||||
void AES_256_Key_Expansion (const unsigned char *userkey,
|
||||
unsigned char *key)
|
||||
*/
|
||||
#ifndef __APPLE__
|
||||
.globl AES_256_Key_Expansion
|
||||
AES_256_Key_Expansion:
|
||||
#else
|
||||
.globl _AES_256_Key_Expansion
|
||||
_AES_256_Key_Expansion:
|
||||
#endif
|
||||
# parameter 1: stack[4] => %eax
|
||||
# parameter 2: stack[8] => %edx
|
||||
movl 4(%esp), %eax
|
||||
movl 8(%esp), %edx
|
||||
|
||||
movdqu (%eax), %xmm1
|
||||
movdqu 16(%eax), %xmm3
|
||||
movdqa %xmm1, (%edx)
|
||||
movdqa %xmm3, 16(%edx)
|
||||
|
||||
aeskeygenassist $0x1, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 32(%edx)
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 48(%edx)
|
||||
aeskeygenassist $0x2, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 64(%edx)
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 80(%edx)
|
||||
aeskeygenassist $0x4, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 96(%edx)
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 112(%edx)
|
||||
aeskeygenassist $0x8, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 128(%edx)
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 144(%edx)
|
||||
aeskeygenassist $0x10, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 160(%edx)
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 176(%edx)
|
||||
aeskeygenassist $0x20, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 192(%edx)
|
||||
|
||||
aeskeygenassist $0x0, %xmm1, %xmm2
|
||||
call MAKE_RK256_b
|
||||
movdqa %xmm3, 208(%edx)
|
||||
aeskeygenassist $0x40, %xmm3, %xmm2
|
||||
call MAKE_RK256_a
|
||||
movdqa %xmm1, 224(%edx)
|
||||
|
||||
ret
|
||||
|
||||
MAKE_RK256_a:
|
||||
pshufd $0xff, %xmm2, %xmm2
|
||||
movdqa %xmm1, %xmm4
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm1
|
||||
pxor %xmm2, %xmm1
|
||||
ret
|
||||
|
||||
MAKE_RK256_b:
|
||||
pshufd $0xaa, %xmm2, %xmm2
|
||||
movdqa %xmm3, %xmm4
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm3
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm3
|
||||
pslldq $4, %xmm4
|
||||
pxor %xmm4, %xmm3
|
||||
pxor %xmm2, %xmm3
|
||||
ret
|
||||
|
||||
#endif /* WOLFSSL_X86_64_BUILD */
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#define HAVE_INTEL_AVX2
|
||||
#endif /* NO_AVX2_SUPPORT */
|
||||
|
||||
#ifdef WOLFSSL_X86_64_BUILD
|
||||
#ifndef __APPLE__
|
||||
.data
|
||||
#else
|
||||
|
@ -15833,6 +15834,7 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
|
|||
#endif /* __APPLE__ */
|
||||
#endif /* WOLFSSL_AESGCM_STREAM */
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
#endif /* WOLFSSL_X86_64_BUILD */
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -30,6 +30,7 @@
|
|||
#define HAVE_INTEL_AVX2
|
||||
#endif /* NO_AVX2_SUPPORT */
|
||||
|
||||
#ifdef WOLFSSL_X86_64_BUILD
|
||||
#ifndef __APPLE__
|
||||
.text
|
||||
.globl chacha_encrypt_x64
|
||||
|
@ -1430,6 +1431,7 @@ L_chacha20_avx2_end256:
|
|||
.size chacha_encrypt_avx2,.-chacha_encrypt_avx2
|
||||
#endif /* __APPLE__ */
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
#endif /* WOLFSSL_X86_64_BUILD */
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -55,7 +55,7 @@ and Daniel J. Bernstein
|
|||
#pragma warning(disable: 4127)
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
|
@ -77,12 +77,13 @@ and Daniel J. Bernstein
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
static word32 intel_flags = 0;
|
||||
static word32 cpu_flags_set = 0;
|
||||
#endif
|
||||
|
||||
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
|
||||
#if (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) || \
|
||||
defined(POLY130564)
|
||||
#if defined(_MSC_VER)
|
||||
#define POLY1305_NOINLINE __declspec(noinline)
|
||||
#elif defined(__GNUC__)
|
||||
|
@ -122,7 +123,7 @@ static word32 cpu_flags_set = 0;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -265,7 +266,7 @@ with a given ctx pointer to a Poly1305 structure.
|
|||
static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
size_t bytes)
|
||||
{
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
/* AVX2 is handled in wc_Poly1305Update. */
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
poly1305_blocks_avx(ctx, m, bytes);
|
||||
|
@ -399,7 +400,7 @@ number of bytes is less than the block size.
|
|||
*/
|
||||
static int poly1305_block(Poly1305* ctx, const unsigned char *m)
|
||||
{
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
/* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
poly1305_block_avx(ctx, m);
|
||||
|
@ -414,7 +415,8 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m)
|
|||
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
|
||||
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
{
|
||||
#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
|
||||
#if defined(POLY130564) && \
|
||||
!(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP))
|
||||
word64 t0,t1;
|
||||
#endif
|
||||
|
||||
|
@ -435,7 +437,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
|||
if (keySz != 32 || ctx == NULL)
|
||||
return BAD_FUNC_ARG;
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
if (!cpu_flags_set) {
|
||||
intel_flags = cpuid_get_flags();
|
||||
cpu_flags_set = 1;
|
||||
|
@ -502,7 +504,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
|||
|
||||
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
||||
{
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#elif defined(POLY130564)
|
||||
|
||||
word64 h0,h1,h2,c;
|
||||
|
@ -521,7 +523,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
|||
if (ctx == NULL || mac == NULL)
|
||||
return BAD_FUNC_ARG;
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
if (IS_INTEL_AVX2(intel_flags))
|
||||
|
@ -707,7 +709,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
|
|||
printf("\n");
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
if (IS_INTEL_AVX2(intel_flags)) {
|
||||
SAVE_VECTOR_REGISTERS(return _svr_ret;);
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#define HAVE_INTEL_AVX2
|
||||
#endif /* NO_AVX2_SUPPORT */
|
||||
|
||||
#ifdef WOLFSSL_X86_64_BUILD
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
#ifndef __APPLE__
|
||||
.text
|
||||
|
@ -1107,6 +1108,7 @@ L_poly1305_avx2_final_cmp_copy:
|
|||
.size poly1305_final_avx2,.-poly1305_final_avx2
|
||||
#endif /* __APPLE__ */
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
#endif /* WOLFSSL_X86_64_BUILD */
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -174,7 +174,7 @@ on the specific device platform.
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(USE_INTEL_SPEEDUP)
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#if defined(__GNUC__) && ((__GNUC__ < 4) || \
|
||||
(__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
|
||||
#undef NO_AVX2_SUPPORT
|
||||
|
@ -194,7 +194,7 @@ on the specific device platform.
|
|||
#else
|
||||
#undef HAVE_INTEL_AVX1
|
||||
#undef HAVE_INTEL_AVX2
|
||||
#endif /* USE_INTEL_SPEEDUP */
|
||||
#endif /* WOLFSSL_X86_64_BUILD && USE_INTEL_SPEEDUP */
|
||||
|
||||
#if defined(HAVE_INTEL_AVX2)
|
||||
#define HAVE_INTEL_RORX
|
||||
|
@ -253,8 +253,8 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
|
||||
|
||||
/* Hardware Acceleration */
|
||||
#if defined(USE_INTEL_SPEEDUP) && (defined(HAVE_INTEL_AVX1) || \
|
||||
defined(HAVE_INTEL_AVX2))
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
|
||||
/* in case intel instructions aren't available, plus we need the K[] global */
|
||||
#define NEED_SOFT_SHA256
|
||||
|
@ -1072,7 +1072,8 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
|
||||
if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
|
||||
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && \
|
||||
defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
|
||||
#endif
|
||||
|
@ -1107,7 +1108,7 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
|
||||
/* process blocks */
|
||||
#ifdef XTRANSFORM_LEN
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (Transform_Sha256_Len_p != NULL)
|
||||
#endif
|
||||
|
@ -1123,13 +1124,14 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
len -= blocksLen;
|
||||
}
|
||||
}
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
else
|
||||
#endif
|
||||
#endif /* XTRANSFORM_LEN */
|
||||
#if !defined(XTRANSFORM_LEN) || (defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
|
||||
#if !defined(XTRANSFORM_LEN) || \
|
||||
(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
|
||||
{
|
||||
while (len >= WC_SHA256_BLOCK_SIZE) {
|
||||
word32* local32 = sha256->buffer;
|
||||
|
@ -1137,7 +1139,8 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
/* Intel transform function requires use of sha256->buffer */
|
||||
/* Little Endian requires byte swap, so can't use data directly */
|
||||
#if defined(WC_HASH_DATA_ALIGNMENT) && !defined(LITTLE_ENDIAN_ORDER) && \
|
||||
!(defined(USE_INTEL_SPEEDUP) && \
|
||||
!(defined(WOLFSSL_X86_64_BUILD) && \
|
||||
defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
|
||||
if (((wc_ptr_t)data % WC_HASH_DATA_ALIGNMENT) == 0) {
|
||||
local32 = (word32*)data;
|
||||
|
@ -1152,7 +1155,8 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
len -= WC_SHA256_BLOCK_SIZE;
|
||||
|
||||
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && \
|
||||
defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
|
||||
#endif
|
||||
|
@ -1245,7 +1249,7 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
|
||||
|
||||
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
|
||||
#endif
|
||||
|
@ -1283,7 +1287,7 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
|
||||
/* store lengths */
|
||||
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
|
||||
#endif
|
||||
|
@ -1297,10 +1301,11 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
|
||||
sizeof(word32));
|
||||
|
||||
#if defined(FREESCALE_MMCAU_SHA) || (defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(FREESCALE_MMCAU_SHA) || \
|
||||
(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
|
||||
/* Kinetis requires only these bytes reversed */
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
|
||||
#endif
|
||||
|
@ -1532,7 +1537,7 @@ static int InitSha256(wc_Sha256* sha256)
|
|||
sha224->loLen = 0;
|
||||
sha224->hiLen = 0;
|
||||
|
||||
#if defined(USE_INTEL_SPEEDUP) && \
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
|
||||
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
|
||||
/* choose best Transform function under this runtime environment */
|
||||
Sha256_SetTransform();
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#define HAVE_INTEL_AVX2
|
||||
#endif /* NO_AVX2_SUPPORT */
|
||||
|
||||
#ifdef WOLFSSL_X86_64_BUILD
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
#ifndef __APPLE__
|
||||
.data
|
||||
|
@ -22655,6 +22656,7 @@ L_sha256_len_avx2_rorx_done:
|
|||
.size Transform_Sha256_AVX2_RORX_Len,.-Transform_Sha256_AVX2_RORX_Len
|
||||
#endif /* __APPLE__ */
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
#endif /* WOLFSSL_X86_64_BUILD */
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -11321,6 +11321,33 @@ WOLFSSL_TEST_SUBROUTINE int aesgcm_test(void)
|
|||
ERROR_OUT(-6394, out);
|
||||
}
|
||||
#endif /* HAVE_AES_DECRYPT */
|
||||
#ifdef BENCH_AESGCM_LARGE
|
||||
/* setup test buffer */
|
||||
result = wc_AesGcmEncryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6360, out);
|
||||
result = wc_AesGcmEncryptUpdate(enc, large_output, large_input,
|
||||
BENCH_AESGCM_LARGE, a, sizeof(a));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6361, out);
|
||||
result = wc_AesGcmEncryptFinal(enc, resultT, sizeof(t1));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6362, out);
|
||||
#ifdef HAVE_AES_DECRYPT
|
||||
result = wc_AesGcmDecryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6363, out);
|
||||
result = wc_AesGcmDecryptUpdate(enc, large_outdec, large_output,
|
||||
BENCH_AESGCM_LARGE, a, sizeof(a));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6364, out);
|
||||
result = wc_AesGcmDecryptFinal(enc, resultT, sizeof(t1));
|
||||
if (result != 0)
|
||||
ERROR_OUT(-6365, out);
|
||||
if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE))
|
||||
ERROR_OUT(-6366, out);
|
||||
#endif /* HAVE_AES_DECRYPT */
|
||||
#endif /* BENCH_AESGCM_LARGE */
|
||||
#endif /* WOLFSSL_AESGCM_STREAM */
|
||||
#endif /* WOLFSSL_AES_256 */
|
||||
#endif /* !WOLFSSL_AFALG_XILINX_AES && !WOLFSSL_XILINX_CRYPT */
|
||||
|
|
|
@ -48,7 +48,7 @@
|
|||
#define WC_HAS_GCC_4_4_64BIT
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
#elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \
|
||||
defined(WC_HAS_GCC_4_4_64BIT))
|
||||
#define POLY130564
|
||||
|
@ -67,7 +67,7 @@ enum {
|
|||
|
||||
/* Poly1305 state */
|
||||
typedef struct Poly1305 {
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
|
||||
word64 r[3];
|
||||
word64 h[3];
|
||||
word64 pad[2];
|
||||
|
|
Loading…
Reference in New Issue