Assembly optimization for AES-NI, and AVX1 and AVX2

Unroll the loop by a factor of 8.
Use new optimized maths.
Fix SHA-384 to use SHA-512 assembly code.
Only perform CPU id check in one place.
pull/1030/head
Sean Parkinson 2017-07-18 10:14:17 +10:00
parent 36c2ee92dc
commit bde6a35ac4
13 changed files with 3212 additions and 706 deletions

View File

@ -789,6 +789,11 @@ AC_ARG_ENABLE([intelasm],
[ ENABLED_INTELASM=no ]
)
if test "$ENABLED_AESNI" = "small"
then
AM_CFLAGS="$AM_CFLAGS -DAES_GCM_AESNI_NO_UNROLL"
ENABLED_AESNI=yes
fi
if test "$ENABLED_AESNI" = "yes" || test "$ENABLED_INTELASM" = "yes"
then
@ -799,7 +804,7 @@ then
# opt levels greater than 2 may cause problems on systems w/o aesni
if test "$CC" != "icc"
then
AM_CFLAGS="$AM_CFLAGS -maes -msse4"
AM_CFLAGS="$AM_CFLAGS -maes -msse4 -mpclmul"
fi
fi
AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"])

View File

@ -61,7 +61,8 @@ endif
src_libwolfssl_la_SOURCES += \
wolfcrypt/src/hmac.c \
wolfcrypt/src/hash.c
wolfcrypt/src/hash.c \
wolfcrypt/src/cpuid.c
if BUILD_RNG
src_libwolfssl_la_SOURCES += wolfcrypt/src/random.c

View File

@ -137,8 +137,8 @@
#define BEGIN_INTEL_CYCLES total_cycles = get_intel_cycles();
#define END_INTEL_CYCLES total_cycles = get_intel_cycles() - total_cycles;
#define SHOW_INTEL_CYCLES printf(" Cycles per byte = %6.2f", \
count == 0 ? 0 : \
(float)total_cycles / (count*BENCH_SIZE));
count == 0 ? 0 : \
(float)total_cycles / ((word64)count*BENCH_SIZE));
#elif defined(LINUX_CYCLE_COUNT)
#include <linux/perf_event.h>
#include <sys/syscall.h>
@ -579,7 +579,7 @@ static void bench_stats_sym_finish(const char* desc, int doAsync, int count, dou
persec = (1 / total) * blocks;
}
printf("%-8s%s %5.0f %s took %5.3f seconds, %8.3f %s/s",
printf("%-12s%s %5.0f %s took %5.3f seconds, %8.3f %s/s",
desc, BENCH_ASYNC_GET_NAME(doAsync), blocks, blockType, total,
persec, blockType);
SHOW_INTEL_CYCLES
@ -1275,7 +1275,31 @@ void bench_aesgcm(int doAsync)
count += times;
} while (bench_stats_sym_check(start));
exit_aes_gcm:
bench_stats_sym_finish("AES-GCM", doAsync, count, start);
bench_stats_sym_finish("AES-GCM-Enc", doAsync, count, start);
/* GCM uses same routine in backend for both encrypt and decrypt */
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks || BENCH_ASYNC_IS_PEND(); ) {
bench_async_poll();
/* while free pending slots in queue, submit ops */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times, numBlocks)) {
ret = wc_AesGcmDecrypt(&enc[i], bench_plain,
bench_cipher, BENCH_SIZE,
bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ,
bench_additional, AES_AUTH_ADD_SZ);
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times)) {
goto exit_aes_gcm_dec;
}
}
} /* for i */
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
exit_aes_gcm_dec:
bench_stats_sym_finish("AES-GCM-Dec", doAsync, count, start);
exit:

File diff suppressed because it is too large Load Diff

View File

@ -1502,100 +1502,4 @@ MAKE_RK256_b:
pxor xmm3,xmm2
ret
; See Intel® Carry-Less Multiplication Instruction
; and its Usage for Computing the GCM Mode White Paper
; by Shay Gueron, Intel Mobility Group, Israel Development Center;
; and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
; void gfmul(__m128i a, __m128i b, __m128i* out);
; .globl gfmul
;-----------------------------------------------------------------------
; void gfmul(__m128i a, __m128i b, __m128i* out);
; ABI:   Microsoft x64 (MASM). __m128i args arrive by reference:
;        rcx -> a, rdx -> b, r8 -> out.
; Out:   *r8 = GHASH product a*b in GF(2^128)  (result also in xmm6
;        just before the register restore).
; Clobb: xmm0-xmm5, flags. xmm6-xmm9 are non-volatile on Windows and
;        are saved/restored on the stack below.
;-----------------------------------------------------------------------
gfmul PROC
    ; xmm0 holds operand a (128 bits)
    ; xmm1 holds operand b (128 bits)
    ; r8 holds the pointer to output (128 bits)
    ; load the by-reference args into registers, matching the AT&T
    ; (SysV) version where a/b arrive directly in xmm0/xmm1
    movdqa xmm0, [rcx]
    movdqa xmm1, [rdx]
    ; on Microsoft x64 xmm6-xmm15 are non-volatile; save the ones we
    ; use (xmm6-xmm9) on the stack and restore them at the end.
    ; Entry rsp % 16 == 8 (return address pushed), so subtracting
    ; 8 + 64 re-aligns rsp to 16 for the aligned movdqa spills.
    sub rsp,8+4*16                 ; 8 = align stack, 4 regs * 16 bytes
    movdqa [rsp+0], xmm6
    movdqa [rsp+16], xmm7
    movdqa [rsp+32], xmm8
    movdqa [rsp+48], xmm9
    ; schoolbook carry-less multiply of the two 64-bit halves
    movdqa xmm3, xmm0
    pclmulqdq xmm3, xmm1, 0        ; xmm3 = a0*b0
    movdqa xmm4, xmm0
    pclmulqdq xmm4, xmm1, 16       ; xmm4 = a0*b1
    movdqa xmm5, xmm0
    pclmulqdq xmm5, xmm1, 1        ; xmm5 = a1*b0
    movdqa xmm6, xmm0
    pclmulqdq xmm6, xmm1, 17       ; xmm6 = a1*b1
    pxor xmm4, xmm5                ; xmm4 = a0*b1 + a1*b0 (middle terms)
    ; split the middle term across the 128-bit halves
    movdqa xmm5, xmm4
    psrldq xmm4, 8                 ; high 64 bits of middle term
    pslldq xmm5, 8                 ; low 64 bits of middle term
    pxor xmm3, xmm5
    pxor xmm6, xmm4                ; <xmm6:xmm3> = 256-bit carry-less
                                   ; product of xmm0 by xmm1
    ; shift the 256-bit result left by one bit to compensate for the
    ; reversed (reflected) bit order used by GHASH
    movdqa xmm7, xmm3
    movdqa xmm8, xmm6
    pslld xmm3, 1                  ; shift each dword left by 1 ...
    pslld xmm6, 1
    psrld xmm7, 31                 ; ... and carry the dropped top bits
    psrld xmm8, 31                 ;     into the next dword up
    movdqa xmm9, xmm7
    pslldq xmm8, 4
    pslldq xmm7, 4
    psrldq xmm9, 12                ; carry from low half into high half
    por xmm3, xmm7
    por xmm6, xmm8
    por xmm6, xmm9
    ; first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1
    movdqa xmm7, xmm3
    movdqa xmm8, xmm3
    movdqa xmm9, xmm3
    pslld xmm7, 31                 ; packed left shift by 31
    pslld xmm8, 30                 ; packed left shift by 30
    pslld xmm9, 25                 ; packed left shift by 25
    pxor xmm7, xmm8                ; xor the shifted versions
    pxor xmm7, xmm9
    movdqa xmm8, xmm7
    pslldq xmm7, 12
    psrldq xmm8, 4
    pxor xmm3, xmm7                ; first phase of the reduction done
    ; second phase of the reduction
    movdqa xmm2, xmm3
    movdqa xmm4, xmm3
    movdqa xmm5, xmm3
    psrld xmm2, 1                  ; packed right shift by 1
    psrld xmm4, 2                  ; packed right shift by 2
    psrld xmm5, 7                  ; packed right shift by 7
    pxor xmm2, xmm4                ; xor the shifted versions
    pxor xmm2, xmm5
    pxor xmm2, xmm8
    pxor xmm3, xmm2
    pxor xmm6, xmm3                ; the result is in xmm6
    movdqu [r8],xmm6               ; store the result to *out
    ; restore the non-volatile xmm registers from the stack
    movdqa xmm6, [rsp+0]
    movdqa xmm7, [rsp+16]
    movdqa xmm8, [rsp+32]
    movdqa xmm9, [rsp+48]
    add rsp,8+4*16                 ; undo the save area / realignment
    ret
gfmul ENDP
END

View File

@ -1288,91 +1288,6 @@ pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret
#ifdef HAVE_AESGCM
/* See Intel® Carry-Less Multiplication Instruction
* and its Usage for Computing the GCM Mode White Paper
* by Shay Gueron, Intel Mobility Group, Israel Development Center;
* and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
*
* This is for use with the C code.
*/
/* Figure 6. Code Sample - Performing Ghash Using Algorithms 1 and 5 */
/*
* void gfmul(__m128i a, __m128i b, __m128i* out);
*/
#-----------------------------------------------------------------------
# void gfmul(__m128i a, __m128i b, __m128i* out);
# ABI:   System V AMD64 (GAS, AT&T syntax).
# In:    xmm0 = a, xmm1 = b, rdi = out pointer.
# Out:   *rdi = GHASH product a*b in GF(2^128) (also left in xmm6).
# Clobb: xmm2-xmm9, flags. All xmm registers are volatile under SysV,
#        so nothing needs saving.
#-----------------------------------------------------------------------
.globl gfmul
gfmul:
    #xmm0 holds operand a (128 bits)
    #xmm1 holds operand b (128 bits)
    #rdi holds the pointer to output (128 bits)
    # schoolbook carry-less multiply of the two 64-bit halves
    movdqa %xmm0, %xmm3
    pclmulqdq $0, %xmm1, %xmm3    # xmm3 = a0*b0
    movdqa %xmm0, %xmm4
    pclmulqdq $16, %xmm1, %xmm4   # xmm4 = a0*b1
    movdqa %xmm0, %xmm5
    pclmulqdq $1, %xmm1, %xmm5    # xmm5 = a1*b0
    movdqa %xmm0, %xmm6
    pclmulqdq $17, %xmm1, %xmm6   # xmm6 = a1*b1
    pxor %xmm5, %xmm4             # xmm4 = a0*b1 + a1*b0 (middle terms)
    # split the middle term across the 128-bit halves
    movdqa %xmm4, %xmm5
    psrldq $8, %xmm4              # high 64 bits of middle term
    pslldq $8, %xmm5              # low 64 bits of middle term
    pxor %xmm5, %xmm3
    pxor %xmm4, %xmm6             # <xmm6:xmm3> = 256-bit carry-less
                                  # product of xmm0 by xmm1
    # shift the 256-bit result left by one bit to compensate for the
    # reversed (reflected) bit order used by GHASH
    movdqa %xmm3, %xmm7
    movdqa %xmm6, %xmm8
    pslld $1, %xmm3               # shift each dword left by 1 ...
    pslld $1, %xmm6
    psrld $31, %xmm7              # ... and carry the dropped top bits
    psrld $31, %xmm8              #     into the next dword up
    movdqa %xmm7, %xmm9
    pslldq $4, %xmm8
    pslldq $4, %xmm7
    psrldq $12, %xmm9             # carry from low half into high half
    por %xmm7, %xmm3
    por %xmm8, %xmm6
    por %xmm9, %xmm6
    # first phase of the reduction modulo x^128 + x^7 + x^2 + x + 1
    movdqa %xmm3, %xmm7
    movdqa %xmm3, %xmm8
    movdqa %xmm3, %xmm9
    pslld $31, %xmm7              # packed left shift by 31
    pslld $30, %xmm8              # packed left shift by 30
    pslld $25, %xmm9              # packed left shift by 25
    pxor %xmm8, %xmm7             # xor the shifted versions
    pxor %xmm9, %xmm7
    movdqa %xmm7, %xmm8
    pslldq $12, %xmm7
    psrldq $4, %xmm8
    pxor %xmm7, %xmm3             # first phase of the reduction done
    # second phase of the reduction
    movdqa %xmm3,%xmm2
    movdqa %xmm3,%xmm4
    movdqa %xmm3,%xmm5
    psrld $1, %xmm2               # packed right shift by 1
    psrld $2, %xmm4               # packed right shift by 2
    psrld $7, %xmm5               # packed right shift by 7
    pxor %xmm4, %xmm2             # xor the shifted versions
    pxor %xmm5, %xmm2
    pxor %xmm8, %xmm2
    pxor %xmm2, %xmm3
    pxor %xmm3, %xmm6             # the result is in xmm6
    movdqu %xmm6, (%rdi)          # store the result to *out
    ret
#endif /* HAVE_AESGCM */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

View File

@ -0,0 +1,99 @@
/* cpuid.c
*
* Copyright (C) 2006-2016 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef WOLFSSL_X86_64_BUILD
/* Each platform needs to query info type 1 from cpuid to see if aesni is
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
/* Query one feature bit via the CPUID instruction.
 *
 * leaf  CPUID leaf (EAX input value).
 * sub   CPUID sub-leaf (ECX input value).
 * num   index of the register to inspect: EAX, EBX, ECX or EDX.
 * bit   bit position within that register.
 *
 * Returns 1 when running on a GenuineIntel CPU and the requested bit is
 * set, 0 otherwise (including on non-Intel CPUs).
 */
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit)
{
    /* Plain automatic locals: the originals were needlessly `static`,
     * which made this helper non-reentrant for no benefit - both values
     * are fully recomputed on every call. */
    int got_intel_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0';
    /* Leaf 0 returns the vendor id string; "GenuineIntel" is split
     * across EBX ("Genu"), EDX ("ineI") and ECX ("ntel"). */
    cpuid(reg, 0, 0);
    if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return ((reg[num] >> bit) & 0x1);
    }
    return 0;
}
/* Probe the CPU once and cache the detected capabilities in
 * cpuid_flags. Subsequent calls are no-ops (guarded by cpuid_check).
 * Not thread-safe; callers are expected to serialize first use.
 */
void cpuid_set_flags(void)
{
    if (cpuid_check)
        return;                       /* already probed */
    cpuid_check = 1;

    /* leaf/sub-leaf/register/bit positions per the CPUID instruction */
    if (cpuid_flag(1, 0, ECX, 28)) cpuid_flags |= CPUID_AVX1;
    if (cpuid_flag(7, 0, EBX,  5)) cpuid_flags |= CPUID_AVX2;
    if (cpuid_flag(7, 0, EBX,  8)) cpuid_flags |= CPUID_BMI2;
    if (cpuid_flag(1, 0, ECX, 30)) cpuid_flags |= CPUID_RDRAND;
    if (cpuid_flag(7, 0, EBX, 18)) cpuid_flags |= CPUID_RDSEED;
    if (cpuid_flag(1, 0, ECX, 26)) cpuid_flags |= CPUID_AESNI;
}
/* Return the cached CPU capability flags, probing lazily on first use.
 * cpuid_set_flags() is internally guarded by cpuid_check, so calling it
 * unconditionally is a no-op after the first probe.
 */
word32 cpuid_get_flags(void)
{
    cpuid_set_flags();
    return cpuid_flags;
}
#endif

View File

@ -32,6 +32,7 @@
*/
#include <wolfssl/wolfcrypt/random.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef HAVE_FIPS
@ -141,12 +142,6 @@ int wc_RNG_GenerateByte(WC_RNG* rng, byte* b)
#ifdef HAVE_INTEL_RDRAND
static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz);
#endif
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
#endif
/* Start NIST DRBG code */
@ -540,7 +535,7 @@ int wc_InitRng_ex(WC_RNG* rng, void* heap, int devId)
#ifdef HAVE_INTEL_RDRAND
/* if CPU supports RDRAND, use it directly and by-pass DRBG init */
if (IS_INTEL_RDRAND)
if (IS_INTEL_RDRAND(cpuid_get_flags()))
return 0;
#endif
@ -610,7 +605,7 @@ int wc_RNG_GenerateBlock(WC_RNG* rng, byte* output, word32 sz)
return BAD_FUNC_ARG;
#ifdef HAVE_INTEL_RDRAND
if (IS_INTEL_RDRAND)
if (IS_INTEL_RDRAND(cpuid_get_flags()))
return wc_GenerateRand_IntelRD(NULL, output, sz);
#endif
@ -982,52 +977,8 @@ int wc_FreeNetRandom(void)
#if defined(HAVE_INTEL_RDRAND) || defined(HAVE_INTEL_RDSEED)
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu = 0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0)
{
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static void wc_InitRng_IntelRD(void) {
if (cpuid_check==0) {
if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND; }
if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; }
cpuid_check = 1;
}
cpuid_set_flags();
}
#ifdef WOLFSSL_ASYNC_CRYPT
@ -1067,7 +1018,7 @@ static int wc_GenerateSeed_IntelRD(OS_Seed* os, byte* output, word32 sz)
(void)os;
if (!IS_INTEL_RDSEED)
if (!IS_INTEL_RDSEED(cpuid_get_flags()))
return -1;
for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64),
@ -1122,7 +1073,7 @@ static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz)
(void)os;
if (!IS_INTEL_RDRAND)
if (!IS_INTEL_RDRAND(cpuid_get_flags()))
return -1;
for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64),
@ -1702,7 +1653,7 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz)
int ret = 0;
#ifdef HAVE_INTEL_RDSEED
if (IS_INTEL_RDSEED) {
if (IS_INTEL_RDSEED(cpuid_get_flags())) {
ret = wc_GenerateSeed_IntelRD(NULL, output, sz);
if (ret == 0) {
/* success, we're done */

View File

@ -32,6 +32,7 @@
#include <wolfssl/wolfcrypt/sha256.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
/* fips wrapper calls, user can call direct */
#ifdef HAVE_FIPS
@ -177,77 +178,14 @@ static int InitSha256(Sha256* sha256)
More granular Stitched Message Sched/Round
}
#endif
*/
/* Each platform needs to query info type 1 from cpuid to see if aesni is
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
#define CPUID_AVX1 0x1
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu=0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static int set_cpuid_flags(void) {
if (cpuid_check==0) {
if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; }
if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; }
if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; }
if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; }
if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; }
cpuid_check = 1;
return 0;
}
return 1;
}
/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
static int Transform(Sha256* sha256);
#if defined(HAVE_INTEL_AVX1)
@ -258,22 +196,31 @@ static int InitSha256(Sha256* sha256)
static int Transform_AVX1_RORX(Sha256 *sha256);
#endif
static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
static int transform_check = 0;
#define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
static void set_Transform(void) {
if (set_cpuid_flags()) return;
static void set_Transform(void)
{
word32 intel_flags;
cpuid_set_flags();
if (transform_check)
return;
transform_check = 1;
intel_flags = cpuid_get_flags();
#if defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
Transform_p = Transform_AVX1_RORX; return;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
Transform_p = Transform_AVX1_RORX; return;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
#endif
#if defined(HAVE_INTEL_AVX1)
Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return;
Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
Transform); return;
#endif
Transform_p = Transform; return;
Transform_p = Transform; return;
}
/* Dummy for saving MM_REGs on behalf of Transform */
@ -519,6 +466,11 @@ static int InitSha256(Sha256* sha256)
{
int ret = 0;
byte* local;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha256 == NULL || (data == NULL && len > 0)) {
return BAD_FUNC_ARG;
@ -552,7 +504,7 @@ static int InitSha256(Sha256* sha256)
if (sha256->buffLen == SHA256_BLOCK_SIZE) {
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
@ -582,6 +534,11 @@ static int InitSha256(Sha256* sha256)
int ret;
byte* local = (byte*)sha256->buffer;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha256 == NULL) {
return BAD_FUNC_ARG;
@ -598,15 +555,15 @@ static int InitSha256(Sha256* sha256)
SHA256_BLOCK_SIZE - sha256->buffLen);
sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
SHA256_BLOCK_SIZE);
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
ByteReverseWords(sha256->buffer, sha256->buffer,
SHA256_BLOCK_SIZE);
#endif
}
#endif
ret = XTRANSFORM(sha256, local);
if (ret != 0)
@ -624,7 +581,7 @@ static int InitSha256(Sha256* sha256)
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(sha256->buffer, sha256->buffer,
@ -640,7 +597,7 @@ static int InitSha256(Sha256* sha256)
defined(HAVE_INTEL_AVX2)
/* Kinetis requires only these bytes reversed */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords(

View File

@ -27,10 +27,9 @@
#include <wolfssl/wolfcrypt/settings.h>
#ifdef WOLFSSL_SHA512
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/sha512.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
/* fips wrapper calls, user can call direct */
#ifdef HAVE_FIPS
@ -261,74 +260,6 @@ static int InitSha512(Sha512* sha512)
* supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
*/
#ifndef _MSC_VER
#define cpuid(reg, leaf, sub)\
__asm__ __volatile__ ("cpuid":\
"=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
"a" (leaf), "c"(sub));
#define XASM_LINK(f) asm(f)
#else
#include <intrin.h>
#define cpuid(a,b) __cpuid((int*)a,b)
#define XASM_LINK(f)
#endif /* _MSC_VER */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3
#define CPUID_AVX1 0x1
#define CPUID_AVX2 0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2 0x10 /* MULX, RORX */
#define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
int got_intel_cpu = 0;
unsigned int reg[5];
reg[4] = '\0';
cpuid(reg, 0, 0);
if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
got_intel_cpu = 1;
}
if (got_intel_cpu) {
cpuid(reg, leaf, sub);
return ((reg[num] >> bit) & 0x1);
}
return 0;
}
static int set_cpuid_flags() {
if(cpuid_check ==0) {
if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
cpuid_check = 1 ;
return 0 ;
}
return 1 ;
}
#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha512 *sha512);
#endif
@ -340,6 +271,7 @@ static int InitSha512(Sha512* sha512)
#endif
static int _Transform(Sha512 *sha512);
static int (*Transform_p)(Sha512* sha512) = _Transform;
static int transform_check = 0;
#define Transform(sha512) (*Transform_p)(sha512)
/* Dummy for saving MM_REGs on behalf of Transform */
@ -353,6 +285,28 @@ static int InitSha512(Sha512* sha512)
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
#endif
/* Select the SHA-512 block transform implementation once, based on the
 * CPU features reported by cpuid_get_flags(). Guarded by
 * transform_check so the selection runs only on first call.
 * Shared by SHA-512 and SHA-384 (SHA-384 uses the SHA-512 transform).
 */
static void Sha512_SetTransform()
{
    word32 intel_flags;

    if (transform_check)
        return;                       /* already selected */
    transform_check = 1;

    intel_flags = cpuid_get_flags();

#if defined(HAVE_INTEL_AVX2)
    /* AVX2 path requires BMI2 (RORX) as well. */
    if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
        Transform_p = Transform_AVX1_RORX; return;
        /* NOTE: the next assignment is intentionally unreachable; it
         * only references Transform_AVX2 to suppress an unused-function
         * warning. */
        Transform_p = Transform_AVX2;
        /* for avoiding warning,"not used" */
    }
#endif
#if defined(HAVE_INTEL_AVX1)
    Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
                   _Transform); return;
#endif
    /* Fallback: plain C transform. */
    Transform_p = _Transform;
}
int wc_InitSha512_ex(Sha512* sha512, void* heap, int devId)
{
@ -361,20 +315,7 @@ static int InitSha512(Sha512* sha512)
(void)heap;
(void)devId;
if (set_cpuid_flags())
return ret;
#if defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
Transform_p = Transform_AVX1_RORX; return ret;
Transform_p = Transform_AVX2;
/* for avoiding warning,"not used" */
}
#endif
#if defined(HAVE_INTEL_AVX1)
Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform); return ret;
#endif
Transform_p = _Transform;
Sha512_SetTransform();
return ret;
}
@ -554,6 +495,11 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len)
int ret = 0;
/* do block size increments */
byte* local = (byte*)sha512->buffer;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha512 == NULL || (data == NULL && len > 0)) {
return BAD_FUNC_ARG;
@ -570,16 +516,18 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len)
XMEMCPY(&local[sha512->buffLen], data, add);
sha512->buffLen += add;
data += add;
len -= add;
data += add;
len -= add;
if (sha512->buffLen == SHA512_BLOCK_SIZE) {
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer, sha512->buffer,
SHA512_BLOCK_SIZE);
SHA512_BLOCK_SIZE);
}
#endif
ret = Transform(sha512);
if (ret != 0)
@ -615,6 +563,11 @@ static INLINE int Sha512Final(Sha512* sha512)
{
byte* local = (byte*)sha512->buffer;
int ret;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
word32 intel_flags = cpuid_get_flags();
#endif
#endif
if (sha512 == NULL) {
return BAD_FUNC_ARG;
@ -629,13 +582,15 @@ static INLINE int Sha512Final(Sha512* sha512)
if (sha512->buffLen > SHA512_PAD_SIZE) {
XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen);
sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen;
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE);
#endif /* LITTLE_ENDIAN_ORDER */
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer,sha512->buffer,
SHA512_BLOCK_SIZE);
}
#endif /* LITTLE_ENDIAN_ORDER */
ret = Transform(sha512);
if (ret != 0)
return ret;
@ -651,17 +606,19 @@ static INLINE int Sha512Final(Sha512* sha512)
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
#endif
{
ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
}
#endif
/* ! length ordering dependent on digest endian type ! */
sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
SHA512_BLOCK_SIZE - SHA512_PAD_SIZE);
@ -1470,6 +1427,21 @@ int wc_Sha384Final(Sha384* sha384, byte* hash)
}
/* Hardware Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
/* Initialize a SHA-384 context (hardware-accelerated build).
 * SHA-384 reuses the SHA-512 transform, so also make sure the best
 * SHA-512 transform for this CPU has been selected.
 * heap/devId are accepted for API symmetry and ignored here.
 */
int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
{
    int ret;

    (void)heap;
    (void)devId;

    ret = InitSha384(sha384);
    Sha512_SetTransform();

    return ret;
}
#else
int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
{
int ret;
@ -1492,6 +1464,7 @@ int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
return ret;
}
#endif
int wc_InitSha384(Sha384* sha384)
{

View File

@ -195,6 +195,12 @@ static int devId = INVALID_DEVID;
const char* wnrConfigFile = "wnr-example.conf";
#endif
#ifdef HAVE_AESGCM
#define LARGE_BUFFER_SIZE 1024
static byte large_input[LARGE_BUFFER_SIZE];
static byte large_output[LARGE_BUFFER_SIZE];
static byte large_outdec[LARGE_BUFFER_SIZE];
#endif
typedef struct testVector {
const char* input;
@ -375,6 +381,9 @@ int wolfcrypt_test(void* args)
#endif
{
int ret;
#ifdef HAVE_AESGCM
int i;
#endif
((func_args*)args)->return_code = -1; /* error state */
@ -665,6 +674,8 @@ int wolfcrypt_test(void* args)
printf( "AES256 test passed!\n");
#ifdef HAVE_AESGCM
for (i=0; i<LARGE_BUFFER_SIZE; i++)
large_input[i] = i;
if ( (ret = aesgcm_test()) != 0)
return err_sys("AES-GCM test failed!\n", ret);
else
@ -4594,6 +4605,10 @@ int aesgcm_test(void)
byte resultP[sizeof(p)];
byte resultC[sizeof(p)];
int result;
#if !defined(HAVE_FIPS) && !defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO)
int ivlen;
#endif
int alen, plen;
XMEMSET(resultT, 0, sizeof(resultT));
XMEMSET(resultC, 0, sizeof(resultC));
@ -4630,6 +4645,87 @@ int aesgcm_test(void)
if (XMEMCMP(p, resultP, sizeof(resultP)))
return -4306;
/* Large buffer test */
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, large_output, large_input,
LARGE_BUFFER_SIZE, iv1, sizeof(iv1),
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4307;
result = wc_AesGcmDecrypt(&enc, large_outdec, large_output,
LARGE_BUFFER_SIZE, iv1, sizeof(iv1), resultT,
sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4308;
if (XMEMCMP(large_input, large_outdec, LARGE_BUFFER_SIZE))
return -4309;
#if !defined(HAVE_FIPS) && !defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO)
/* Variable IV length test */
for (ivlen=0; ivlen<(int)sizeof(k1); ivlen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, sizeof(p), k1, ivlen,
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4310;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC), k1,
ivlen, resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4311;
}
#endif
/* Variable authenticated data length test */
for (alen=0; alen<(int)sizeof(p); alen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, sizeof(p), iv1,
sizeof(iv1), resultT, sizeof(resultT), p, alen);
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4312;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC), iv1,
sizeof(iv1), resultT, sizeof(resultT), p, alen);
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4313;
}
/* Variable plain text length test */
for (plen=1; plen<(int)sizeof(p); plen++) {
/* AES-GCM encrypt and decrypt both use AES encrypt internally */
result = wc_AesGcmEncrypt(&enc, resultC, p, plen, iv1, sizeof(iv1),
resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4314;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, plen, iv1,
sizeof(iv1), resultT, sizeof(resultT), a, sizeof(a));
#if defined(WOLFSSL_ASYNC_CRYPT)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4315;
}
/* FIPS, QAT and STM32F2/4 HW Crypto only support 12-byte IV */
#if !defined(HAVE_FIPS) && !defined(HAVE_INTEL_QA) && \
!defined(STM32F2_CRYPTO) && !defined(STM32F4_CRYPTO) && \
@ -4646,11 +4742,11 @@ int aesgcm_test(void)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4307;
return -4316;
if (XMEMCMP(c2, resultC, sizeof(resultC)))
return -4308;
return -4317;
if (XMEMCMP(t2, resultT, sizeof(resultT)))
return -4309;
return -4318;
result = wc_AesGcmDecrypt(&enc, resultP, resultC, sizeof(resultC),
iv2, sizeof(iv2), resultT, sizeof(resultT), a, sizeof(a));
@ -4658,9 +4754,9 @@ int aesgcm_test(void)
result = wc_AsyncWait(result, &enc.asyncDev, WC_ASYNC_FLAG_NONE);
#endif
if (result != 0)
return -4310;
return -4319;
if (XMEMCMP(p, resultP, sizeof(resultP)))
return -4311;
return -4320;
#endif /* !HAVE_FIPS && !HAVE_INTEL_QA && !STM32F2_CRYPTO && !STM32F4_CRYPTO */
wc_AesFree(&enc);

View File

@ -0,0 +1,59 @@
/* cpuid.h
*
* Copyright (C) 2006-2016 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifndef WOLF_CRYPT_CPUID_H
#define WOLF_CRYPT_CPUID_H

#include <wolfssl/wolfcrypt/types.h>

#ifdef __cplusplus
    extern "C" {
#endif

#ifdef WOLFSSL_X86_64_BUILD
    /* One bit per detected CPU capability, stored in the word32
     * returned by cpuid_get_flags(). */
    #define CPUID_AVX1   0x0001
    #define CPUID_AVX2   0x0002
    #define CPUID_RDRAND 0x0004
    #define CPUID_RDSEED 0x0008
    #define CPUID_BMI2   0x0010   /* MULX, RORX */
    #define CPUID_AESNI  0x0020

    /* Convenience tests; f is the flags word from cpuid_get_flags(). */
    #define IS_INTEL_AVX1(f)    ((f) & CPUID_AVX1)
    #define IS_INTEL_AVX2(f)    ((f) & CPUID_AVX2)
    #define IS_INTEL_RDRAND(f)  ((f) & CPUID_RDRAND)
    #define IS_INTEL_RDSEED(f)  ((f) & CPUID_RDSEED)
    #define IS_INTEL_BMI2(f)    ((f) & CPUID_BMI2)
    #define IS_INTEL_AESNI(f)   ((f) & CPUID_AESNI)

    /* Probe the CPU once and cache the capability flags. */
    void cpuid_set_flags(void);
    /* Return the cached flags, probing lazily on first use. */
    word32 cpuid_get_flags(void);
#endif

#ifdef __cplusplus
    }   /* extern "C" */
#endif

#endif /* WOLF_CRYPT_CPUID_H */

View File

@ -60,7 +60,8 @@ nobase_include_HEADERS+= \
wolfssl/wolfcrypt/wolfevent.h \
wolfssl/wolfcrypt/pkcs12.h \
wolfssl/wolfcrypt/wolfmath.h \
wolfssl/wolfcrypt/sha3.h
wolfssl/wolfcrypt/sha3.h \
wolfssl/wolfcrypt/cpuid.h
noinst_HEADERS+= \
wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h \