Assembly optimization for AES-NI, and AVX1 and AVX2

Unroll the loop by a factor of 8.
Use new optimized maths.
Fix SHA-384 to use SHA-512 assembly code.
Only perform CPU id check in one place.
pull/1030/head
Sean Parkinson 2017-07-18 10:14:17 +10:00
parent 36c2ee92dc
commit bde6a35ac4
13 changed files with 3212 additions and 706 deletions

View File

@ -789,6 +789,11 @@ AC_ARG_ENABLE([intelasm],
[ ENABLED_INTELASM=no ]
)
if test "$ENABLED_AESNI" = "small"
then
AM_CFLAGS="$AM_CFLAGS -DAES_GCM_AESNI_NO_UNROLL"
ENABLED_AESNI=yes
fi
if test "$ENABLED_AESNI" = "yes" || test "$ENABLED_INTELASM" = "yes"
then
@ -799,7 +804,7 @@ then
# opt levels greater than 2 may cause problems on systems w/o aesni
if test "$CC" != "icc"
then
AM_CFLAGS="$AM_CFLAGS -maes -msse4"
AM_CFLAGS="$AM_CFLAGS -maes -msse4 -mpclmul"
fi
fi
AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"])

View File

@ -61,7 +61,8 @@ endif
src_libwolfssl_la_SOURCES += \
wolfcrypt/src/hmac.c \
wolfcrypt/src/hash.c
wolfcrypt/src/hash.c \
wolfcrypt/src/cpuid.c
if BUILD_RNG
src_libwolfssl_la_SOURCES += wolfcrypt/src/random.c

View File

@ -137,8 +137,8 @@
#define BEGIN_INTEL_CYCLES total_cycles = get_intel_cycles();
#define END_INTEL_CYCLES total_cycles = get_intel_cycles() - total_cycles;
#define SHOW_INTEL_CYCLES printf(" Cycles per byte = %6.2f", \
count == 0 ? 0 : \
(float)total_cycles / (count*BENCH_SIZE));
count == 0 ? 0 : \
(float)total_cycles / ((word64)count*BENCH_SIZE));
#elif defined(LINUX_CYCLE_COUNT)
#include <linux/perf_event.h>
#include <sys/syscall.h>
@ -579,7 +579,7 @@ static void bench_stats_sym_finish(const char* desc, int doAsync, int count, dou
persec = (1 / total) * blocks;
}
printf("%-8s%s %5.0f %s took %5.3f seconds, %8.3f %s/s",
printf("%-12s%s %5.0f %s took %5.3f seconds, %8.3f %s/s",
desc, BENCH_ASYNC_GET_NAME(doAsync), blocks, blockType, total,
persec, blockType);
SHOW_INTEL_CYCLES
@ -1275,7 +1275,31 @@ void bench_aesgcm(int doAsync)
count += times;
} while (bench_stats_sym_check(start));
exit_aes_gcm:
bench_stats_sym_finish("AES-GCM", doAsync, count, start);
bench_stats_sym_finish("AES-GCM-Enc", doAsync, count, start);
/* GCM uses same routine in backend for both encrypt and decrypt */
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks || BENCH_ASYNC_IS_PEND(); ) {
bench_async_poll();
/* while free pending slots in queue, submit ops */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times, numBlocks)) {
ret = wc_AesGcmDecrypt(&enc[i], bench_plain,
bench_cipher, BENCH_SIZE,
bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ,
bench_additional, AES_AUTH_ADD_SZ);
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times)) {
goto exit_aes_gcm_dec;
}
}
} /* for i */
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
exit_aes_gcm_dec:
bench_stats_sym_finish("AES-GCM-Dec", doAsync, count, start);
exit:

File diff suppressed because it is too large Load Diff

View File

@ -1502,100 +1502,4 @@ MAKE_RK256_b:
pxor xmm3,xmm2
ret
; See Intel® Carry-Less Multiplication Instruction
; and its Usage for Computing the GCM Mode White Paper
; by Shay Gueron, Intel Mobility Group, Israel Development Center;
; and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
; void gfmul(__m128i a, __m128i b, __m128i* out);
; .globl gfmul
;-----------------------------------------------------------------------
; void gfmul(__m128i a, __m128i b, __m128i* out);
; ABI:   Microsoft x64 (MASM). __m128i args arrive by reference:
;        rcx -> a, rdx -> b, r8 -> out.
; Out:   *r8 = GHASH product a*b in GF(2^128)  (result also in xmm6
;        just before the register restore).
; Clobb: xmm0-xmm5, flags. xmm6-xmm9 are non-volatile on Windows and
;        are saved/restored on the stack below.
;-----------------------------------------------------------------------
gfmul PROC
    ; xmm0 holds operand a (128 bits)
    ; xmm1 holds operand b (128 bits)
    ; r8 holds the pointer to output (128 bits)
    ; load the by-reference args into registers, matching the AT&T
    ; (SysV) version where a/b arrive directly in xmm0/xmm1
    movdqa xmm0, [rcx]
    movdqa xmm1, [rdx]
    ; on Microsoft x64 xmm6-xmm15 are non-volatile; save the ones we
    ; use (xmm6-xmm9) on the stack and restore them at the end.
    ; Entry rsp % 16 == 8 (return address pushed), so subtracting
    ; 8 + 64 re-aligns rsp to 16 for the aligned movdqa spills.
    sub rsp,8+4*16                 ; 8 = align stack, 4 regs * 16 bytes
    movdqa [rsp+0], xmm6
    movdqa [rsp+16], xmm7
    movdqa [rsp+32], xmm8
    movdqa [rsp+48], xmm9
    ; schoolbook carry-less multiply of the two 64-bit halves
    movdqa xmm3, xmm0
    pclmulqdq xmm3, xmm1, 0        ; xmm3 = a0*b0
    movdqa xmm4, xmm0
    pclmulqdq xmm4, xmm1, 16       ; xmm4 = a0*b1
    movdqa xmm5, xmm0
    pclmulqdq xmm5, xmm1, 1        ; xmm5 = a1*b0
    movdqa xmm6, xmm0
    pclmulqdq xmm6, xmm1, 17       ; xmm6 = a1*b1
    pxor xmm4, xmm5                ; xmm4 = a0*b1 + a1*b0 (middle terms)
    ; split the middle term across the 128-bit halves
    movdqa xmm5, xmm4
    psrldq xmm4, 8                 ; high 64 bits of middle term
    pslldq xmm5, 8                 ; low 64 bits of middle term
    pxor xmm3, xmm5
    pxor xmm6, xmm4                ; <xmm6:xmm3> = 256-bit carry-less
                                   ; product of xmm0 by xmm1
    ; shift the 256-bit result left by one bit to compensate for the
    ; reversed (reflected) bit order used by GHASH
    movdqa xmm7, xmm3
    movdqa xmm8, xmm6
    pslld xmm3, 1                  ; shift each dword left by 1 ...
    pslld xmm6, 1
    psrld xmm7, 31                 ; ... and carry the dropped top bits
    psrld xmm8, 31                 ;     into the next dword up
    movdqa xmm9, xmm7
    pslldq xmm8, 4
    pslldq xmm7, 4
    psrldq xmm9, 12                ; carry from low half into high half
    por xmm3, xmm7
    por xmm6, xmm8
    por xmm6, xmm9
    ; first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1
    movdqa xmm7, xmm3
    movdqa xmm8, xmm3
    movdqa xmm9, xmm3
    pslld xmm7, 31                 ; packed left shift by 31
    pslld xmm8, 30                 ; packed left shift by 30
    pslld xmm9, 25                 ; packed left shift by 25
    pxor xmm7, xmm8                ; xor the shifted versions
    pxor xmm7, xmm9
    movdqa xmm8, xmm7
    pslldq xmm7, 12
    psrldq xmm8, 4
    pxor xmm3, xmm7                ; first phase of the reduction done
    ; second phase of the reduction
    movdqa xmm2, xmm3
    movdqa xmm4, xmm3
    movdqa xmm5, xmm3
    psrld xmm2, 1                  ; packed right shift by 1
    psrld xmm4, 2                  ; packed right shift by 2
    psrld xmm5, 7                  ; packed right shift by 7
    pxor xmm2, xmm4                ; xor the shifted versions
    pxor xmm2, xmm5
    pxor xmm2, xmm8
    pxor xmm3, xmm2
    pxor xmm6, xmm3                ; the result is in xmm6
    movdqu [r8],xmm6               ; store the result to *out
    ; restore the non-volatile xmm registers from the stack
    movdqa xmm6, [rsp+0]
    movdqa xmm7, [rsp+16]
    movdqa xmm8, [rsp+32]
    movdqa xmm9, [rsp+48]
    add rsp,8+4*16                 ; undo the save area / realignment
    ret
gfmul ENDP
END

View File

@ -1288,91 +1288,6 @@ pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret
#ifdef HAVE_AESGCM
/* See Intel® Carry-Less Multiplication Instruction
* and its Usage for Computing the GCM Mode White Paper
* by Shay Gueron, Intel Mobility Group, Israel Development Center;
* and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
*
* This is for use with the C code.
*/
/* Figure 6. Code Sample - Performing Ghash Using Algorithms 1 and 5 */
/*
* void gfmul(__m128i a, __m128i b, __m128i* out);
*/
#-----------------------------------------------------------------------
# void gfmul(__m128i a, __m128i b, __m128i* out);
# ABI:   System V AMD64 (GAS, AT&T syntax).
# In:    xmm0 = a, xmm1 = b, rdi = out pointer.
# Out:   *rdi = GHASH product a*b in GF(2^128) (also left in xmm6).
# Clobb: xmm2-xmm9, flags. All xmm registers are volatile under SysV,
#        so nothing needs saving.
#-----------------------------------------------------------------------
.globl gfmul
gfmul:
    #xmm0 holds operand a (128 bits)
    #xmm1 holds operand b (128 bits)
    #rdi holds the pointer to output (128 bits)
    # schoolbook carry-less multiply of the two 64-bit halves
    movdqa %xmm0, %xmm3
    pclmulqdq $0, %xmm1, %xmm3    # xmm3 = a0*b0
    movdqa %xmm0, %xmm4
    pclmulqdq $16, %xmm1, %xmm4   # xmm4 = a0*b1
    movdqa %xmm0, %xmm5
    pclmulqdq $1, %xmm1, %xmm5    # xmm5 = a1*b0
    movdqa %xmm0, %xmm6
    pclmulqdq $17, %xmm1, %xmm6   # xmm6 = a1*b1
    pxor %xmm5, %xmm4             # xmm4 = a0*b1 + a1*b0 (middle terms)
    # split the middle term across the 128-bit halves
    movdqa %xmm4, %xmm5
    psrldq $8, %xmm4              # high 64 bits of middle term
    pslldq $8, %xmm5              # low 64 bits of middle term
    pxor %xmm5, %xmm3
    pxor %xmm4, %xmm6             # <xmm6:xmm3> = 256-bit carry-less
                                  # product of xmm0 by xmm1
    # shift the 256-bit result left by one bit to compensate for the
    # reversed (reflected) bit order used by GHASH
    movdqa %xmm3, %xmm7
    movdqa %xmm6, %xmm8
    pslld $1, %xmm3               # shift each dword left by 1 ...
    pslld $1, %xmm6
    psrld $31, %xmm7              # ... and carry the dropped top bits
    psrld $31, %xmm8              #     into the next dword up
    movdqa %xmm7, %xmm9
    pslldq $4, %xmm8
    pslldq $4, %xmm7
    psrldq $12, %xmm9             # carry from low half into high half
    por %xmm7, %xmm3
    por %xmm8, %xmm6
    por %xmm9, %xmm6
    # first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1
    movdqa %xmm3, %xmm7
    movdqa %xmm3, %xmm8
    movdqa %xmm3, %xmm9
    pslld $31, %xmm7              # packed left shift by 31
    pslld $30, %xmm8              # packed left shift by 30
    pslld $25, %xmm9              # packed left shift by 25
    pxor %xmm8, %xmm7             # xor the shifted versions
    pxor %xmm9, %xmm7
    movdqa %xmm7, %xmm8
    pslldq $12, %xmm7
    psrldq $4, %xmm8
    pxor %xmm7, %xmm3             # first phase of the reduction done
    # second phase of the reduction
    movdqa %xmm3,%xmm2
    movdqa %xmm3,%xmm4
    movdqa %xmm3,%xmm5
    psrld $1, %xmm2               # packed right shift by 1
    psrld $2, %xmm4               # packed right shift by 2
    psrld $7, %xmm5               # packed right shift by 7
    pxor %xmm4, %xmm2             # xor the shifted versions
    pxor %xmm5, %xmm2
    pxor %xmm8, %xmm2
    pxor %xmm2, %xmm3
    pxor %xmm3, %xmm6             # the result is in xmm6
    movdqu %xmm6, (%rdi)          # store the result to *out
    ret
#endif /* HAVE_AESGCM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

View File

@ -0,0 +1,99 @@
/* cpuid.c
*
* Copyright (C) 2006-2016 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef WOLFSSL_X86_64_BUILD
/* Each platform needs to query info type 1 from cpuid to see if aesni is
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
/* Query one feature bit via the CPUID instruction.
 *
 * leaf  CPUID leaf (EAX input value).
 * sub   CPUID sub-leaf (ECX input value).
 * num   index of the register to inspect: EAX, EBX, ECX or EDX.
 * bit   bit position within that register.
 *
 * Returns 1 when running on a GenuineIntel CPU and the requested bit is
 * set, 0 otherwise (including on non-Intel CPUs).
 */
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit)
{
    /* Plain automatic locals: the originals were needlessly `static`,
     * which made this helper non-reentrant for no benefit - both values
     * are fully recomputed on every call. */
    int got_intel_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0';
    /* Leaf 0 returns the vendor id string; "GenuineIntel" is split
     * across EBX ("Genu"), EDX ("ineI") and ECX ("ntel"). */
    cpuid(reg, 0, 0);
    if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return ((reg[num] >> bit) & 0x1);
    }
    return 0;
}
/* Probe the CPU once and cache the detected capabilities in
 * cpuid_flags. Subsequent calls are no-ops (guarded by cpuid_check).
 * Not thread-safe; callers are expected to serialize first use.
 */
void cpuid_set_flags(void)
{
    if (cpuid_check)
        return;                       /* already probed */
    cpuid_check = 1;

    /* leaf/sub-leaf/register/bit positions per the CPUID instruction */
    if (cpuid_flag(1, 0, ECX, 28)) cpuid_flags |= CPUID_AVX1;
    if (cpuid_flag(7, 0, EBX,  5)) cpuid_flags |= CPUID_AVX2;
    if (cpuid_flag(7, 0, EBX,  8)) cpuid_flags |= CPUID_BMI2;
    if (cpuid_flag(1, 0, ECX, 30)) cpuid_flags |= CPUID_RDRAND;
    if (cpuid_flag(7, 0, EBX, 18)) cpuid_flags |= CPUID_RDSEED;
    if (cpuid_flag(1, 0, ECX, 26)) cpuid_flags |= CPUID_AESNI;
}
/* Return the cached CPU capability flags, probing lazily on first use.
 * cpuid_set_flags() is internally guarded by cpuid_check, so calling it
 * unconditionally is a no-op after the first probe.
 */
word32 cpuid_get_flags(void)
{
    cpuid_set_flags();
    return cpuid_flags;
}
#endif

View File

@ -32,6 +32,7 @@
*/
#include <wolfssl/wolfcrypt/random.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef HAVE_FIPS
@ -141,12 +142,6 @@ int wc_RNG_GenerateByte(WC_RNG* rng, byte* b)
#ifdef HAVE_INTEL_RDRAND
static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz);
#endif
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
#endif
/* Start NIST DRBG code */
@ -540,7 +535,7 @@ int wc_InitRng_ex(WC_RNG* rng, void* heap, int devId)
#ifdef HAVE_INTEL_RDRAND
/* if CPU supports RDRAND, use it directly and by-pass DRBG init */
if (IS_INTEL_RDRAND)
if (IS_INTEL_RDRAND(cpuid_get_flags()))
return 0;
#endif
@ -610,7 +605,7 @@ int wc_RNG_GenerateBlock(WC_RNG* rng, byte* output, word32 sz)
return BAD_FUNC_ARG;
#ifdef HAVE_INTEL_RDRAND
if (IS_INTEL_RDRAND)
if (IS_INTEL_RDRAND(cpuid_get_flags()))
return wc_GenerateRand_IntelRD(NULL, output, sz);
#endif
@ -982,52 +977,8 @@ int wc_FreeNetRandom(void)
#if defined(HAVE_INTEL_RDRAND) || defined(HAVE_INTEL_RDSEED)
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu = 0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0)
{
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static void wc_InitRng_IntelRD(void) {
if (cpuid_check==0) {
if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND; }
if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; }
cpuid_check = 1;
}
cpuid_set_flags();
}
#ifdef WOLFSSL_ASYNC_CRYPT
@ -1067,7 +1018,7 @@ static int wc_GenerateSeed_IntelRD(OS_Seed* os, byte* output, word32 sz)
(void)os;
if (!IS_INTEL_RDSEED)
if (!IS_INTEL_RDSEED(cpuid_get_flags()))
return -1;
for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64),
@ -1122,7 +1073,7 @@ static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz)
(void)os;
if (!IS_INTEL_RDRAND)
if (!IS_INTEL_RDRAND(cpuid_get_flags()))
return -1;
for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64),
@ -1702,7 +1653,7 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz)
int ret = 0;
#ifdef HAVE_INTEL_RDSEED
if (IS_INTEL_RDSEED) {
if (IS_INTEL_RDSEED(cpuid_get_flags())) {
ret = wc_GenerateSeed_IntelRD(NULL, output, sz);
if (ret == 0) {
/* success, we're done */

View File

@ -32,6 +32,7 @@
#include <wolfssl/wolfcrypt/sha256.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
/* fips wrapper calls, user can call direct */
#ifdef HAVE_FIPS
@ -177,77 +178,14 @@ static int InitSha256(Sha256* sha256)
More granular Stitched Message Sched/Round
}
#endif
*/
/* Each platform needs to query info type 1 from cpuid to see if aesni is
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
#define CPUID_AVX1 0x1
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu=0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static int set_cpuid_flags(void) {
if (cpuid_check==0) {
if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; }
if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; }
if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; }
if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; }
if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; }
cpuid_check = 1;
return 0;
}
return 1;
}
/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
static int Transform(Sha256* sha256);
#if defined(HAVE_INTEL_AVX1)
@ -258,22 +196,31 @@ static int InitSha256(Sha256* sha256)
static int Transform_AVX1_RORX(Sha256 *sha256);
#endif
static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
static int transform_check = 0;
#define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
static void set_Transform(void) {
if (set_cpuid_flags()) return;
static void set_Transform(void)
{
word32 intel_flags;
cpuid_set_flags();
if (transform_check)
return;
transform_check = 1;
intel_flags = cpuid_get_flags();
#if defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
Transform_p = Transform_AVX1_RORX; return;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
Transform_p = Transform_AVX1_RORX; return;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
#endif
#if defined(HAVE_INTEL_AVX1)
Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return;
Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
Transform); return;
#endif
Transform_p = Transform; return;
Transform_p = Transform; return;
}
/* Dummy for saving MM_REGs on behalf of Transform */
@ -519,6 +466,11 @@ static int InitSha256(Sha256* sha256)
{
int ret = 0;
byte* local;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha256 == NULL || (data == NULL && len > 0)) {
return BAD_FUNC_ARG;
@ -552,7 +504,7 @@ static int InitSha256(Sha256* sha256)
if (sha256->buffLen == SHA256_BLOCK_SIZE) {
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
@ -582,6 +534,11 @@ static int InitSha256(Sha256* sha256)
int ret;
byte* local = (byte*)sha256->buffer;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha256 == NULL) {
return BAD_FUNC_ARG;
@ -598,15 +555,15 @@ static int InitSha256(Sha256* sha256)
SHA256_BLOCK_SIZE - sha256->buffLen);
sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
SHA256_BLOCK_SIZE);
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
ByteReverseWords(sha256->buffer, sha256->buffer,
SHA256_BLOCK_SIZE);
#endif
}
#endif
ret = XTRANSFORM(sha256, local);
if (ret != 0)
@ -624,7 +581,7 @@ static int InitSha256(Sha256* sha256)
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
@ -640,7 +597,7 @@ static int InitSha256(Sha256* sha256)
defined(HAVE_INTEL_AVX2)
/* Kinetis requires only these bytes reversed */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(

View File

@ -27,10 +27,9 @@
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_SHA512
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/sha512.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
/* fips wrapper calls, user can call direct */
#ifdef HAVE_FIPS
@ -261,74 +260,6 @@ static int InitSha512(Sha512* sha512)
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
#define CPUID_AVX1 0x1
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu = 0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static int set_cpuid_flags() {
if(cpuid_check ==0) {
if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
cpuid_check = 1 ;
return 0 ;
}
return 1 ;
}
#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha512 *sha512);
#endif
@ -340,6 +271,7 @@ static int InitSha512(Sha512* sha512)
#endif
static int _Transform(Sha512 *sha512);
static int (*Transform_p)(Sha512* sha512) = _Transform;
static int transform_check = 0;
#define Transform(sha512) (*Transform_p)(sha512)
/* Dummy for saving MM_REGs on behalf of Transform */
@ -353,6 +285,28 @@ static int InitSha512(Sha512* sha512)
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
#endif
/* Select the SHA-512 block transform implementation once, based on the
 * CPU features reported by cpuid_get_flags(). Guarded by
 * transform_check so the selection runs only on first call.
 * Shared by SHA-512 and SHA-384 (SHA-384 uses the SHA-512 transform).
 */
static void Sha512_SetTransform()
{
    word32 intel_flags;

    if (transform_check)
        return;                       /* already selected */
    transform_check = 1;

    intel_flags = cpuid_get_flags();

#if defined(HAVE_INTEL_AVX2)
    /* AVX2 path requires BMI2 (RORX) as well. */
    if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
        Transform_p = Transform_AVX1_RORX; return;
        /* NOTE: the next assignment is intentionally unreachable; it
         * only references Transform_AVX2 to suppress an unused-function
         * warning. */
        Transform_p = Transform_AVX2;
        /* for avoiding warning,"not used" */
    }
#endif
#if defined(HAVE_INTEL_AVX1)
    Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
                   _Transform); return;
#endif
    /* Fallback: plain C transform. */
    Transform_p = _Transform;
}
int wc_InitSha512_ex(Sha512* sha512, void* heap, int devId)
{
@ -361,20 +315,7 @@ static int InitSha512(Sha512* sha512)
(void)heap;
(void)devId;
if (set_cpuid_flags())
return ret;
#if defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
Transform_p = Transform_AVX1_RORX; return ret;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
#endif
#if defined(HAVE_INTEL_AVX1)
Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform); return ret;
#endif
Transform_p = _Transform;
Sha512_SetTransform();
return ret;
}
@ -554,6 +495,11 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len)
int ret = 0;
/* do block size increments */
byte* local = (byte*)sha512->buffer;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha512 == NULL || (data == NULL && len > 0)) {
return BAD_FUNC_ARG;
@ -570,16 +516,18 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len)
XMEMCPY(&local[sha512->buffLen], data, add);
sha512->buffLen += add;
data += add;
len -= add;
data += add;
len -= add;
if (sha512->buffLen == SHA512_BLOCK_SIZE) {
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer, sha512->buffer,
SHA512_BLOCK_SIZE);
SHA512_BLOCK_SIZE);
}
#endif
ret = Transform(sha512);
if (ret != 0)
@ -615,6 +563,11 @@ static INLINE int Sha512Final(Sha512* sha512)
{
byte* local = (byte*)sha512->buffer;
int ret;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha512 == NULL) {
return BAD_FUNC_ARG;
@ -629,13 +582,15 @@ static INLINE int Sha512Final(Sha512* sha512)
if (sha512->buffLen > SHA512_PAD_SIZE) {
XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen);
sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE);
#endif /* LITTLE_ENDIAN_ORDER */
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer,sha512->buffer,
SHA512_BLOCK_SIZE);
}
#endif /* LITTLE_ENDIAN_ORDER */
ret = Transform(sha512);
if (ret != 0)
return ret;
@ -651,17 +606,19 @@ static INLINE int Sha512Final(Sha512* sha512)
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
}
#endif
/* ! length ordering dependent on digest endian type ! */
sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
SHA512_BLOCK_SIZE - SHA512_PAD_SIZE);
@ -1470,6 +1427,21 @@ int wc_Sha384Final(Sha384* sha384, byte* hash)
}
/* Hardware Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
/* Initialize a SHA-384 context (hardware-accelerated build).
 * SHA-384 reuses the SHA-512 transform, so also make sure the best
 * SHA-512 transform for this CPU has been selected.
 * heap/devId are accepted for API symmetry and ignored here.
 */
int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
{
    int ret;

    (void)heap;
    (void)devId;

    ret = InitSha384(sha384);
    Sha512_SetTransform();

    return ret;
}
#else
int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
{
int ret;
@ -1492,6 +1464,7 @@ int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
return ret;
}
#endif
int wc_InitSha384(Sha384* sha384)
{

View File

@ -195,6 +195,12 @@ static int devId = INVALID_DEVID;
const char* wnrConfigFile = "wnr-example.conf";
#endif
#ifdef HAVE_AESGCM
#define LARGE_BUFFER_SIZE 1024
static byte large_input[LARGE_BUFFER_SIZE];
static byte large_output[LARGE_BUFFER_SIZE];
static byte large_outdec[LARGE_BUFFER_SIZE];
#endif
typedef struct testVector {
const char* input;
@ -375,6 +381,9 @@ int wolfcrypt_test(void* args)
#endif
{
int ret;
#ifdef HAVE_AESGCM
int i;
#endif
((func_args*)args)->return_code = -1; /* error state */
@ -665,6 +674,8 @@ int wolfcrypt_test(void* args)
printf( "AES256 test passed!\n");
#ifdef HAVE_AESGCM
for (i=0; i<LARGE_BUFFER_SIZE; i++)
large_input[i] = i;
if ( (ret = aesgcm_test()) != 0)
return err_sys("AES-GCM test failed!\n", ret);
else
@ -4594,6 +4605,10 @@ int aesgcm_test(void)
byte resultP[sizeof(p)];
byte resultC[sizeof(p)];
int result;
#if !defined(HAVE_FIPS) && !defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO)
int ivlen;
#endif
int alen, plen;
XMEMSET(resultT, 0, sizeof(resultT));
XMEMSET(resultC, 0, sizeof(resultC));
@ -4630,6 +4645,87 @@ int aesgcm_test(void)
if (XMEMCMP(p, resultP, sizeof(resultP)))
return -4306;
/* Large buffer test */
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, large_output, large_input,
LARGE_BUFFER_SIZE, iv1, sizeof(iv1),
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4307;
result = wc_AesGcmDecrypt(&enc, large_outdec, large_output,
LARGE_BUFFER_SIZE, iv1, sizeof(iv1), resultT,
sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4308;
if (XMEMCMP(large_input, large_outdec, LARGE_BUFFER_SIZE))
return -4309;
#if !defined(HAVE_FIPS) && !defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO)
/* Variable IV length test */
for (ivlen=0; ivlen<(int)sizeof(k1); ivlen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, sizeof(p), k1, ivlen,
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4310;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC), k1,
ivlen, resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4311;
}
#endif
/* Variable authenticated data length test */
for (alen=0; alen<(int)sizeof(p); alen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, sizeof(p), iv1,
sizeof(iv1), resultT, sizeof(resultT), p, alen);
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4312;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC), iv1,
sizeof(iv1), resultT, sizeof(resultT), p, alen);
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4313;
}
/* Variable plain text length test */
for (plen=1; plen<(int)sizeof(p); plen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, plen, iv1, sizeof(iv1),
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4314;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, plen, iv1,
sizeof(iv1), resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4315;
}
/* FIPS, QAT and STM32F2/4 HW Crypto only support 12-byte IV */
#if !defined(HAVE_FIPS) && !defined(HAVE_INTEL_QA) && \
!defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO) && \
@ -4646,11 +4742,11 @@ int aesgcm_test(void)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4307;
return -4316;
if (XMEMCMP(c2, resultC, sizeof(resultC)))
return -4308;
return -4317;
if (XMEMCMP(t2, resultT, sizeof(resultT)))
return -4309;
return -4318;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC),
iv2, sizeof(iv2), resultT, sizeof(resultT), a, sizeof(a));
@ -4658,9 +4754,9 @@ int aesgcm_test(void)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4310;
return -4319;
if (XMEMCMP(p, resultP, sizeof(resultP)))
return -4311;
return -4320;
#endif /* !HAVE_FIPS && !HAVE_INTEL_QA && !STM32F2_CRYPTO && !STM32F4_CRYPTO */
wc_AesFree(&enc);

View File

@ -0,0 +1,59 @@
/* cpuid.h
*
* Copyright (C) 2006-2016 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifndef WOLF_CRYPT_CPUID_H
#define WOLF_CRYPT_CPUID_H

#include <wolfssl/wolfcrypt/types.h>

#ifdef __cplusplus
    extern "C" {
#endif

#ifdef WOLFSSL_X86_64_BUILD
    /* One bit per detected CPU capability, stored in the word32
     * returned by cpuid_get_flags(). */
    #define CPUID_AVX1   0x0001
    #define CPUID_AVX2   0x0002
    #define CPUID_RDRAND 0x0004
    #define CPUID_RDSEED 0x0008
    #define CPUID_BMI2   0x0010   /* MULX, RORX */
    #define CPUID_AESNI  0x0020

    /* Convenience tests; f is the flags word from cpuid_get_flags(). */
    #define IS_INTEL_AVX1(f)    ((f) & CPUID_AVX1)
    #define IS_INTEL_AVX2(f)    ((f) & CPUID_AVX2)
    #define IS_INTEL_RDRAND(f)  ((f) & CPUID_RDRAND)
    #define IS_INTEL_RDSEED(f)  ((f) & CPUID_RDSEED)
    #define IS_INTEL_BMI2(f)    ((f) & CPUID_BMI2)
    #define IS_INTEL_AESNI(f)   ((f) & CPUID_AESNI)

    /* Probe the CPU once and cache the capability flags. */
    void cpuid_set_flags(void);
    /* Return the cached flags, probing lazily on first use. */
    word32 cpuid_get_flags(void);
#endif

#ifdef __cplusplus
    }   /* extern "C" */
#endif

#endif /* WOLF_CRYPT_CPUID_H */

View File

@ -60,7 +60,8 @@ nobase_include_HEADERS+= \
wolfssl/wolfcrypt/wolfevent.h \
wolfssl/wolfcrypt/pkcs12.h \
wolfssl/wolfcrypt/wolfmath.h \
wolfssl/wolfcrypt/sha3.h
wolfssl/wolfcrypt/sha3.h \
wolfssl/wolfcrypt/cpuid.h
noinst_HEADERS+= \
wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h \