diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index cf9b3109a..6eabf575a 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -651,6 +651,7 @@ WOLFSSL_HARDEN_TLS_ALLOW_OLD_TLS WOLFSSL_HARDEN_TLS_ALLOW_TRUNCATED_HMAC WOLFSSL_HARDEN_TLS_NO_PKEY_CHECK WOLFSSL_HARDEN_TLS_NO_SCR_CHECK +WOLFSSL_HMAC_COPY_HASH WOLFSSL_HOSTNAME_VERIFY_ALT_NAME_ONLY WOLFSSL_I2D_ECDSA_SIG_ALLOC WOLFSSL_IAR_ARM_TIME diff --git a/configure.ac b/configure.ac index ca095d480..afe54ad7d 100644 --- a/configure.ac +++ b/configure.ac @@ -295,6 +295,25 @@ AC_ARG_ENABLE([hmac], [ ENABLED_HMAC=yes ] ) +# enable HMAC hash copying automatically for x86_64 and aarch64 (except Linux kernel module) +HMAC_COPY_DEFAULT=no +if test "$ENABLED_LINUXKM_DEFAULTS" = "no" +then + if test "$host_cpu" = "x86_64" || test "$host_cpu" = "aarch64" || test "$host_cpu" = "amd64" + then + HMAC_COPY_DEFAULT=yes + fi +fi +AC_ARG_ENABLE([hmac-copy], + [AS_HELP_STRING([--enable-hmac-copy],[Enables digest copying implementation for HMAC (default: disabled)])], + [ ENABLED_HMAC_COPY=$enableval ], + [ ENABLED_HMAC_COPY=$HMAC_COPY_DEFAULT ] + ) +if test "$ENABLED_HMAC_COPY" = "yes" +then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HMAC_COPY_HASH" +fi + AC_ARG_ENABLE([do178], [AS_HELP_STRING([--enable-do178],[Enable DO-178, Will NOT work w/o DO178 license (default: disabled)])], [ENABLED_DO178=$enableval], diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 689f03d24..e65fb0860 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -1193,7 +1193,7 @@ static int lng_index = 0; #ifndef NO_MAIN_DRIVER #ifndef MAIN_NO_ARGS -static const char* bench_Usage_msg1[][25] = { +static const char* bench_Usage_msg1[][27] = { /* 0 English */ { "-? 
Help, print this usage\n", " 0: English, 1: Japanese\n", @@ -1207,6 +1207,8 @@ static const char* bench_Usage_msg1[][25] = { " (if set via -aad_size) bytes.\n" ), "-dgst_full Full digest operation performed.\n", + "-mac_final MAC update and final operation timed.\n", + "-aead_set_key Set the key as part of the timing of AEAD ciphers.\n", "-rsa_sign Measure RSA sign/verify instead of encrypt/decrypt.\n", " -rsa-sz\n Measure RSA performance.\n", "-ffhdhe2048 Measure DH using FFDHE 2048-bit parameters.\n", @@ -1240,6 +1242,8 @@ static const char* bench_Usage_msg1[][25] = { "-aad_size TBD.\n", "-all_aad TBD.\n", "-dgst_full フルの digest 暗号操作を実施します。\n", + "-mac_final MAC update and final operation timed.\n", + "-aead_set_key Set the key as part of the timing of AEAD ciphers.\n", "-rsa_sign 暗号/復号化の代わりに RSA の署名/検証を測定します。\n", " -rsa-sz\n RSA の性能を測定します。\n", "-ffhdhe2048 Measure DH using FFDHE 2048-bit parameters.\n", @@ -2056,6 +2060,8 @@ static int numBlocks = NUM_BLOCKS; static word32 bench_size = BENCH_SIZE; static int base2 = 1; static int digest_stream = 1; +static int mac_stream = 1; +static int aead_set_key = 0; #ifdef HAVE_CHACHA static int encrypt_only = 0; #endif @@ -4505,10 +4511,12 @@ static void bench_aesgcm_internal(int useDeviceID, goto exit; } - ret = wc_AesGcmSetKey(enc[i], key, keySz); - if (ret != 0) { - printf("AesGcmSetKey failed, ret = %d\n", ret); - goto exit; + if (!aead_set_key) { + ret = wc_AesGcmSetKey(enc[i], key, keySz); + if (ret != 0) { + printf("AesGcmSetKey failed, ret = %d\n", ret); + goto exit; + } } } @@ -4522,6 +4530,14 @@ static void bench_aesgcm_internal(int useDeviceID, for (i = 0; i < BENCH_MAX_PENDING; i++) { if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(enc[i]), 0, ×, numBlocks, &pending)) { + if (aead_set_key) { + ret = wc_AesGcmSetKey(enc[i], key, keySz); + if (!bench_async_handle(&ret, + BENCH_ASYNC_GET_DEV(enc[i]), 0, + ×, &pending)) { + goto exit_aes_gcm; + } + } ret = wc_AesGcmEncrypt(enc[i], bench_cipher, bench_plain, bench_size, iv, ivSz, bench_tag, AES_AUTH_TAG_SZ, @@ -4560,10 +4576,12 @@ exit_aes_gcm: goto exit; } - ret = wc_AesGcmSetKey(dec[i], key, keySz); - if (ret != 0) { - printf("AesGcmSetKey failed, ret = %d\n", ret); - goto exit; + if (!aead_set_key) { + ret = wc_AesGcmSetKey(dec[i], key, keySz); + if (ret != 0) { + printf("AesGcmSetKey failed, ret = %d\n", ret); + goto exit; + } } } @@ -4576,6 +4594,14 @@ exit_aes_gcm: for (i = 0; i < BENCH_MAX_PENDING; i++) { if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(dec[i]), 0, ×, numBlocks, &pending)) { + if (aead_set_key) { + ret = wc_AesGcmSetKey(dec[i], key, keySz); + if (!bench_async_handle(&ret, + BENCH_ASYNC_GET_DEV(dec[i]), 0, + ×, &pending)) { + goto exit_aes_gcm_dec; + } + } ret = wc_AesGcmDecrypt(dec[i], bench_plain, bench_cipher, bench_size, iv, ivSz, bench_tag, AES_AUTH_TAG_SZ, @@ -8300,50 +8326,89 @@ static void bench_hmac(int useDeviceID, int type, int digestSz, } } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, - BENCH_ASYNC_GET_DEV(hmac[i]), 0, - ×, numBlocks, &pending)) { - ret = wc_HmacUpdate(hmac[i], bench_plain, bench_size); - if (!bench_async_handle(&ret, - BENCH_ASYNC_GET_DEV(hmac[i]), - 0, ×, &pending)) { - goto exit_hmac; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + if (mac_stream) { + bench_stats_start(&count, &start); do { - 
bench_async_poll(&pending);
+            for (times = 0; times < numBlocks || pending > 0; ) {
+                bench_async_poll(&pending);
-            for (i = 0; i < BENCH_MAX_PENDING; i++) {
-                if (bench_async_check(&ret,
-                                      BENCH_ASYNC_GET_DEV(hmac[i]), 0,
-                                      &times, numBlocks, &pending)) {
-                    ret = wc_HmacFinal(hmac[i], digest[i]);
-                    if (!bench_async_handle(&ret,
-                                            BENCH_ASYNC_GET_DEV(hmac[i]),
-                                            0, &times, &pending)) {
-                        goto exit_hmac;
+                /* while free pending slots in queue, submit ops */
+                for (i = 0; i < BENCH_MAX_PENDING; i++) {
+                    if (bench_async_check(&ret,
+                                          BENCH_ASYNC_GET_DEV(hmac[i]), 0,
+                                          &times, numBlocks, &pending)) {
+                        ret = wc_HmacUpdate(hmac[i], bench_plain, bench_size);
+                        if (!bench_async_handle(&ret,
+                                                BENCH_ASYNC_GET_DEV(hmac[i]),
+                                                0, &times, &pending)) {
+                            goto exit_hmac;
+                        }
                     }
-                }
-                RECORD_MULTI_VALUE_STATS();
-            } /* for i */
-        } while (pending > 0);
-    } while (bench_stats_check(start)
-#ifdef MULTI_VALUE_STATISTICS
-       || runs < minimum_runs
-#endif
-       );
+                } /* for i */
+            } /* for times */
+            count += times;
+
+            times = 0;
+            do {
+                bench_async_poll(&pending);
+
+                for (i = 0; i < BENCH_MAX_PENDING; i++) {
+                    if (bench_async_check(&ret,
+                                          BENCH_ASYNC_GET_DEV(hmac[i]), 0,
+                                          &times, numBlocks, &pending)) {
+                        ret = wc_HmacFinal(hmac[i], digest[i]);
+                        if (!bench_async_handle(&ret,
+                                                BENCH_ASYNC_GET_DEV(hmac[i]),
+                                                0, &times, &pending)) {
+                            goto exit_hmac;
+                        }
+                    }
+                    RECORD_MULTI_VALUE_STATS();
+                } /* for i */
+            } while (pending > 0);
+        } while (bench_stats_check(start)
+        #ifdef MULTI_VALUE_STATISTICS
+            || runs < minimum_runs
+        #endif
+        );
+    }
+    else {
+        bench_stats_start(&count, &start);
+        do {
+            for (times = 0; times < numBlocks || pending > 0; ) {
+                bench_async_poll(&pending);
+
+                /* while free pending slots in queue, submit ops */
+                for (i = 0; i < BENCH_MAX_PENDING; i++) {
+                    if (bench_async_check(&ret,
+                                          BENCH_ASYNC_GET_DEV(hmac[i]), 0,
+                                          &times, numBlocks, &pending)) {
+                        ret = wc_HmacUpdate(hmac[i], bench_plain, bench_size);
+                        if (!bench_async_handle(&ret,
+                                                BENCH_ASYNC_GET_DEV(hmac[i]),
+                                                0, &times, &pending)) {
+                            goto exit_hmac;
+                        }
+                    }
+                    if (bench_async_check(&ret,
+                                          BENCH_ASYNC_GET_DEV(hmac[i]), 0,
+                                          &times, numBlocks, &pending)) {
+                        ret = wc_HmacFinal(hmac[i], digest[i]);
+                        if (!bench_async_handle(&ret,
+                                                BENCH_ASYNC_GET_DEV(hmac[i]),
+                                                0, &times, &pending)) {
+                            goto exit_hmac;
+                        }
+                    }
+                } /* for i */
+            } /* for times */
+            count += times;
+        } while (bench_stats_check(start)
+        #ifdef MULTI_VALUE_STATISTICS
+            || runs < minimum_runs
+        #endif
+        );
+    }
 
 exit_hmac:
     bench_stats_sym_finish(label, useDeviceID, count, bench_size, start, ret);
@@ -15026,6 +15091,8 @@ static void Usage(void)
     e += 3;
 #endif
     printf("%s", bench_Usage_msg1[lng_index][e++]); /* option -dgst_full */
+    printf("%s", bench_Usage_msg1[lng_index][e++]); /* option -mac_final */
+    printf("%s", bench_Usage_msg1[lng_index][e++]); /* option -aead_set_key */
 #ifndef NO_RSA
     printf("%s", bench_Usage_msg1[lng_index][e++]); /* option -ras_sign */
 #ifdef WOLFSSL_KEY_GEN
@@ -15223,6 +15290,10 @@ int wolfcrypt_benchmark_main(int argc, char** argv)
 #endif
         else if (string_matches(argv[1], "-dgst_full"))
             digest_stream = 0;
+        else if (string_matches(argv[1], "-mac_final"))
+            mac_stream = 0;
+        else if (string_matches(argv[1], "-aead_set_key"))
+            aead_set_key = 1;
 #ifdef HAVE_CHACHA
         else if (string_matches(argv[1], "-enc_only"))
             encrypt_only = 1;
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index dbc7ef125..2cba92193 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -6633,6 +6633,25 @@ void GenerateM0(Gcm* gcm)
 
 #endif /* GCM_TABLE */
 
+#if defined(WOLFSSL_AESNI) && defined(USE_INTEL_SPEEDUP)
+    #define HAVE_INTEL_AVX1
+    #define HAVE_INTEL_AVX2
+#endif
+
+#if 
defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) && \ + defined(WC_C_DYNAMIC_FALLBACK) +void GCM_generate_m0_aesni(const unsigned char *h, unsigned char *m) + XASM_LINK("GCM_generate_m0_aesni"); +#ifdef HAVE_INTEL_AVX1 +void GCM_generate_m0_avx1(const unsigned char *h, unsigned char *m) + XASM_LINK("GCM_generate_m0_avx1"); +#endif +#ifdef HAVE_INTEL_AVX2 +void GCM_generate_m0_avx2(const unsigned char *h, unsigned char *m) + XASM_LINK("GCM_generate_m0_avx2"); +#endif +#endif /* WOLFSSL_AESNI && GCM_TABLE_4BIT && WC_C_DYNAMIC_FALLBACK */ + /* Software AES - GCM SetKey */ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) { @@ -6702,9 +6721,33 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) VECTOR_REGISTERS_POP; } if (ret == 0) { - #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) - GenerateM0(&aes->gcm); - #endif /* GCM_TABLE */ +#if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) +#if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) + if (aes->use_aesni) { + #if defined(WC_C_DYNAMIC_FALLBACK) + #ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_AVX2(intel_flags)) { + GCM_generate_m0_avx2(aes->gcm.H, (byte*)aes->gcm.M0); + } + else + #endif + #if defined(HAVE_INTEL_AVX1) + if (IS_INTEL_AVX1(intel_flags)) { + GCM_generate_m0_avx1(aes->gcm.H, (byte*)aes->gcm.M0); + } + else + #endif + { + GCM_generate_m0_aesni(aes->gcm.H, (byte*)aes->gcm.M0); + } + #endif + } + else +#endif + { + GenerateM0(&aes->gcm); + } +#endif /* GCM_TABLE || GCM_TABLE_4BIT */ } #endif /* FREESCALE_LTC_AES_GCM */ @@ -6727,11 +6770,6 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) #ifdef WOLFSSL_AESNI -#if defined(USE_INTEL_SPEEDUP) - #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 -#endif /* USE_INTEL_SPEEDUP */ - void AES_GCM_encrypt_aesni(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, word32 nbytes, diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index f6fb79bc5..b14620be0 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -56,6 +56,272 @@ #else .p2align 4 #endif /* __APPLE__ */ +L_GCM_generate_m0_aesni_rev8: +.quad 0x8090a0b0c0d0e0f, 0x1020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_aesni_mod2_128: +.quad 0x0, 0xe100000000000000 +#ifndef __APPLE__ +.text +.globl GCM_generate_m0_aesni +.type GCM_generate_m0_aesni,@function +.align 16 +GCM_generate_m0_aesni: +#else +.section __TEXT,__text +.globl _GCM_generate_m0_aesni +.p2align 4 +_GCM_generate_m0_aesni: +#endif /* __APPLE__ */ + movdqu L_GCM_generate_m0_aesni_rev8(%rip), %xmm9 + movdqu L_GCM_generate_m0_aesni_mod2_128(%rip), %xmm10 + pxor %xmm8, %xmm8 + movdqu (%rdi), %xmm0 + movdqu %xmm8, (%rsi) + movdqu %xmm0, %xmm8 + pshufb %xmm9, %xmm0 + movdqu %xmm0, %xmm5 + movdqu %xmm0, %xmm4 + psllq $63, %xmm5 + psrlq $0x01, %xmm4 + movdqu %xmm5, %xmm1 + pslldq $8, %xmm1 + psrldq $8, %xmm5 + pshufd $0xff, %xmm1, %xmm1 + por %xmm5, %xmm4 + psrad $31, %xmm1 + pand %xmm10, %xmm1 + pxor %xmm4, %xmm1 + movdqu %xmm1, %xmm5 + movdqu %xmm1, %xmm4 + psllq $63, %xmm5 + psrlq $0x01, %xmm4 + movdqu %xmm5, %xmm2 + pslldq $8, %xmm2 + psrldq $8, %xmm5 + pshufd $0xff, %xmm2, %xmm2 + por %xmm5, %xmm4 + psrad $31, %xmm2 + pand %xmm10, %xmm2 + pxor %xmm4, %xmm2 + movdqu %xmm2, %xmm5 + movdqu %xmm2, %xmm4 + psllq $63, %xmm5 + psrlq $0x01, %xmm4 + movdqu %xmm5, %xmm3 + pslldq $8, %xmm3 + psrldq $8, %xmm5 + pshufd 
$0xff, %xmm3, %xmm3 + por %xmm5, %xmm4 + psrad $31, %xmm3 + pand %xmm10, %xmm3 + pxor %xmm4, %xmm3 + pshufb %xmm9, %xmm3 + pshufb %xmm9, %xmm2 + movdqu %xmm3, %xmm8 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm0 + pxor %xmm2, %xmm8 + movdqu %xmm3, 16(%rsi) + movdqu %xmm2, 32(%rsi) + movdqu %xmm8, 48(%rsi) + movdqu %xmm1, 64(%rsi) + movdqu %xmm3, %xmm4 + movdqu %xmm2, %xmm5 + movdqu %xmm8, %xmm6 + pxor %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm6 + movdqu %xmm4, 80(%rsi) + movdqu %xmm5, 96(%rsi) + movdqu %xmm6, 112(%rsi) + movdqu %xmm0, 128(%rsi) + pxor %xmm0, %xmm1 + movdqu %xmm3, %xmm4 + movdqu %xmm2, %xmm6 + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + movdqu %xmm4, 144(%rsi) + movdqu %xmm6, 160(%rsi) + pxor %xmm3, %xmm6 + movdqu %xmm6, 176(%rsi) + movdqu %xmm1, 192(%rsi) + movdqu %xmm3, %xmm4 + movdqu %xmm2, %xmm5 + movdqu %xmm8, %xmm6 + pxor %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm6 + movdqu %xmm4, 208(%rsi) + movdqu %xmm5, 224(%rsi) + movdqu %xmm6, 240(%rsi) + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 + movdqu %xmm0, %xmm4 + movdqu %xmm1, %xmm5 + movdqu %xmm2, %xmm6 + movdqu %xmm3, %xmm7 + psllq $60, %xmm4 + psllq $60, %xmm5 + psllq $60, %xmm6 + psllq $60, %xmm7 + psrlq $4, %xmm0 + psrlq $4, %xmm1 + psrlq $4, %xmm2 + psrlq $4, %xmm3 + psrldq $8, %xmm4 + psrldq $8, %xmm5 + psrldq $8, %xmm6 + psrldq $8, %xmm7 + por %xmm4, %xmm0 + por %xmm5, %xmm1 + por %xmm6, %xmm2 + por %xmm7, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + movdqu %xmm0, 256(%rsi) + movdqu %xmm1, 272(%rsi) + movdqu %xmm2, 288(%rsi) + movdqu %xmm3, 304(%rsi) + movdqu 64(%rsi), %xmm0 + movdqu 80(%rsi), %xmm1 + movdqu 96(%rsi), %xmm2 + movdqu 112(%rsi), %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 + movdqu %xmm0, %xmm4 + movdqu %xmm1, %xmm5 + movdqu %xmm2, %xmm6 + movdqu %xmm3, %xmm7 + psllq $60, %xmm4 + psllq $60, %xmm5 + psllq $60, %xmm6 + psllq $60, %xmm7 + psrlq $4, %xmm0 + psrlq $4, %xmm1 + psrlq $4, %xmm2 + psrlq $4, %xmm3 + psrldq $8, %xmm4 + psrldq $8, %xmm5 + psrldq $8, %xmm6 + psrldq $8, %xmm7 + por %xmm4, %xmm0 + por %xmm5, %xmm1 + por %xmm6, %xmm2 + por %xmm7, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + movdqu %xmm0, 320(%rsi) + movdqu %xmm1, 336(%rsi) + movdqu %xmm2, 352(%rsi) + movdqu %xmm3, 368(%rsi) + movdqu 128(%rsi), %xmm0 + movdqu 144(%rsi), %xmm1 + movdqu 160(%rsi), %xmm2 + movdqu 176(%rsi), %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 + movdqu %xmm0, %xmm4 + movdqu %xmm1, %xmm5 + movdqu %xmm2, %xmm6 + movdqu %xmm3, %xmm7 + psllq $60, %xmm4 + psllq $60, %xmm5 + psllq $60, %xmm6 + psllq $60, %xmm7 + psrlq $4, %xmm0 + psrlq $4, %xmm1 + psrlq $4, %xmm2 + psrlq $4, %xmm3 + psrldq $8, %xmm4 + psrldq $8, %xmm5 + psrldq $8, %xmm6 + psrldq $8, %xmm7 + por %xmm4, %xmm0 + por %xmm5, %xmm1 + por %xmm6, %xmm2 + por %xmm7, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + movdqu %xmm0, 384(%rsi) + movdqu %xmm1, 400(%rsi) + movdqu %xmm2, 416(%rsi) + movdqu %xmm3, 432(%rsi) + movdqu 192(%rsi), %xmm0 + movdqu 208(%rsi), %xmm1 + movdqu 224(%rsi), %xmm2 + movdqu 240(%rsi), %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb 
%xmm9, %xmm2 + pshufb %xmm9, %xmm3 + movdqu %xmm0, %xmm4 + movdqu %xmm1, %xmm5 + movdqu %xmm2, %xmm6 + movdqu %xmm3, %xmm7 + psllq $60, %xmm4 + psllq $60, %xmm5 + psllq $60, %xmm6 + psllq $60, %xmm7 + psrlq $4, %xmm0 + psrlq $4, %xmm1 + psrlq $4, %xmm2 + psrlq $4, %xmm3 + psrldq $8, %xmm4 + psrldq $8, %xmm5 + psrldq $8, %xmm6 + psrldq $8, %xmm7 + por %xmm4, %xmm0 + por %xmm5, %xmm1 + por %xmm6, %xmm2 + por %xmm7, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + movdqu %xmm0, 448(%rsi) + movdqu %xmm1, 464(%rsi) + movdqu %xmm2, 480(%rsi) + movdqu %xmm3, 496(%rsi) + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_aesni,.-GCM_generate_m0_aesni +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ L_aes_gcm_one: .quad 0x0, 0x1 #ifndef __APPLE__ @@ -6221,6 +6487,238 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_done: #else .p2align 4 #endif /* __APPLE__ */ +L_GCM_generate_m0_avx1_rev8: +.quad 0x8090a0b0c0d0e0f, 0x1020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_avx1_mod2_128: +.quad 0x0, 0xe100000000000000 +#ifndef __APPLE__ +.text +.globl GCM_generate_m0_avx1 +.type GCM_generate_m0_avx1,@function +.align 16 +GCM_generate_m0_avx1: +#else +.section __TEXT,__text +.globl _GCM_generate_m0_avx1 +.p2align 4 +_GCM_generate_m0_avx1: +#endif /* __APPLE__ */ + vmovdqu L_GCM_generate_m0_avx1_rev8(%rip), %xmm9 + vmovdqu L_GCM_generate_m0_avx1_mod2_128(%rip), %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqu (%rdi), %xmm0 + vmovdqu %xmm8, (%rsi) + vmovdqu %xmm0, %xmm8 + vpshufb %xmm9, %xmm0, %xmm0 + vpsllq $63, %xmm0, %xmm5 + vpsrlq $0x01, %xmm0, %xmm4 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm1, %xmm1 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm1, %xmm1 + vpand %xmm10, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpsllq $63, %xmm1, %xmm5 + vpsrlq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm2 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm2, %xmm2 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm2, %xmm2 + vpand %xmm10, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpsllq $63, %xmm2, %xmm5 + vpsrlq $0x01, %xmm2, %xmm4 + vpslldq $8, %xmm5, %xmm3 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm3, %xmm3 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm3, %xmm3 + vpand %xmm10, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpshufb %xmm9, %xmm3, %xmm3 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm0, %xmm0 + vpxor %xmm2, %xmm3, %xmm8 + vmovdqu %xmm3, 16(%rsi) + vmovdqu %xmm2, 32(%rsi) + vmovdqu %xmm8, 48(%rsi) + vmovdqu %xmm1, 64(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 80(%rsi) + vmovdqu %xmm5, 96(%rsi) + vmovdqu %xmm6, 112(%rsi) + vmovdqu %xmm0, 128(%rsi) + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm0, %xmm3, %xmm4 + vpxor %xmm0, %xmm2, %xmm6 + vmovdqu %xmm4, 144(%rsi) + vmovdqu %xmm6, 160(%rsi) + vpxor %xmm6, %xmm3, %xmm6 + vmovdqu %xmm6, 176(%rsi) + vmovdqu %xmm1, 192(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 208(%rsi) + vmovdqu %xmm5, 224(%rsi) + vmovdqu %xmm6, 240(%rsi) + vmovdqu (%rsi), %xmm0 + vmovdqu 16(%rsi), %xmm1 + vmovdqu 32(%rsi), %xmm2 + vmovdqu 48(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + 
vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 256(%rsi) + vmovdqu %xmm1, 272(%rsi) + vmovdqu %xmm2, 288(%rsi) + vmovdqu %xmm3, 304(%rsi) + vmovdqu 64(%rsi), %xmm0 + vmovdqu 80(%rsi), %xmm1 + vmovdqu 96(%rsi), %xmm2 + vmovdqu 112(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 320(%rsi) + vmovdqu %xmm1, 336(%rsi) + vmovdqu %xmm2, 352(%rsi) + vmovdqu %xmm3, 368(%rsi) + vmovdqu 128(%rsi), %xmm0 + vmovdqu 144(%rsi), %xmm1 + vmovdqu 160(%rsi), %xmm2 + vmovdqu 176(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 384(%rsi) + vmovdqu %xmm1, 400(%rsi) + vmovdqu %xmm2, 416(%rsi) + vmovdqu %xmm3, 432(%rsi) + vmovdqu 192(%rsi), %xmm0 + vmovdqu 208(%rsi), %xmm1 + vmovdqu 224(%rsi), %xmm2 + vmovdqu 240(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 448(%rsi) + vmovdqu %xmm1, 464(%rsi) + vmovdqu %xmm2, 480(%rsi) + vmovdqu %xmm3, 496(%rsi) + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_avx1,.-GCM_generate_m0_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section 
__DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ L_avx1_aes_gcm_one: .quad 0x0, 0x1 #ifndef __APPLE__ @@ -11454,6 +11952,238 @@ L_AES_GCM_decrypt_final_avx1_cmp_tag_done: #else .p2align 4 #endif /* __APPLE__ */ +L_GCM_generate_m0_avx2_rev8: +.quad 0x8090a0b0c0d0e0f, 0x1020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_avx2_mod2_128: +.quad 0x0, 0xe100000000000000 +#ifndef __APPLE__ +.text +.globl GCM_generate_m0_avx2 +.type GCM_generate_m0_avx2,@function +.align 16 +GCM_generate_m0_avx2: +#else +.section __TEXT,__text +.globl _GCM_generate_m0_avx2 +.p2align 4 +_GCM_generate_m0_avx2: +#endif /* __APPLE__ */ + vmovdqu L_GCM_generate_m0_avx2_rev8(%rip), %xmm9 + vmovdqu L_GCM_generate_m0_avx2_mod2_128(%rip), %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqu (%rdi), %xmm0 + vmovdqu %xmm8, (%rsi) + vmovdqu %xmm0, %xmm8 + vpshufb %xmm9, %xmm0, %xmm0 + vpsllq $63, %xmm0, %xmm5 + vpsrlq $0x01, %xmm0, %xmm4 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm1, %xmm1 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm1, %xmm1 + vpand %xmm10, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpsllq $63, %xmm1, %xmm5 + vpsrlq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm2 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm2, %xmm2 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm2, %xmm2 + vpand %xmm10, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpsllq $63, %xmm2, %xmm5 + vpsrlq $0x01, %xmm2, %xmm4 + vpslldq $8, %xmm5, %xmm3 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm3, %xmm3 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm3, %xmm3 + vpand %xmm10, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpxor %xmm2, %xmm3, %xmm8 + vmovdqu %xmm3, 16(%rsi) + vmovdqu %xmm2, 32(%rsi) + vmovdqu %xmm8, 48(%rsi) + vmovdqu %xmm1, 64(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 80(%rsi) + vmovdqu %xmm5, 96(%rsi) + vmovdqu %xmm6, 112(%rsi) + vmovdqu %xmm0, 128(%rsi) + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm0, %xmm3, %xmm4 + vpxor %xmm0, %xmm2, %xmm6 + vmovdqu %xmm4, 144(%rsi) + vmovdqu %xmm6, 160(%rsi) + vpxor %xmm6, %xmm3, %xmm6 + vmovdqu %xmm6, 176(%rsi) + vmovdqu %xmm1, 192(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 208(%rsi) + vmovdqu %xmm5, 224(%rsi) + vmovdqu %xmm6, 240(%rsi) + vmovdqu (%rsi), %xmm0 + vmovdqu 16(%rsi), %xmm1 + vmovdqu 32(%rsi), %xmm2 + vmovdqu 48(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 256(%rsi) + vmovdqu %xmm1, 272(%rsi) + vmovdqu %xmm2, 288(%rsi) + vmovdqu %xmm3, 304(%rsi) + vmovdqu 64(%rsi), %xmm0 + vmovdqu 80(%rsi), 
%xmm1 + vmovdqu 96(%rsi), %xmm2 + vmovdqu 112(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 320(%rsi) + vmovdqu %xmm1, 336(%rsi) + vmovdqu %xmm2, 352(%rsi) + vmovdqu %xmm3, 368(%rsi) + vmovdqu 128(%rsi), %xmm0 + vmovdqu 144(%rsi), %xmm1 + vmovdqu 160(%rsi), %xmm2 + vmovdqu 176(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 384(%rsi) + vmovdqu %xmm1, 400(%rsi) + vmovdqu %xmm2, 416(%rsi) + vmovdqu %xmm3, 432(%rsi) + vmovdqu 192(%rsi), %xmm0 + vmovdqu 208(%rsi), %xmm1 + vmovdqu 224(%rsi), %xmm2 + vmovdqu 240(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 448(%rsi) + vmovdqu %xmm1, 464(%rsi) + vmovdqu %xmm2, 480(%rsi) + vmovdqu %xmm3, 496(%rsi) + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_avx2,.-GCM_generate_m0_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ L_avx2_aes_gcm_one: .quad 0x0, 0x1 #ifndef __APPLE__ diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index 4da29028c..2e4683cdd 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -40,6 +40,259 @@ IFNDEF _WIN64 _WIN64 = 1 ENDIF +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_aesni_rev8 QWORD 579005069656919567, 283686952306183 +ptr_L_GCM_generate_m0_aesni_rev8 QWORD L_GCM_generate_m0_aesni_rev8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_aesni_mod2_128 QWORD 0, 16212958658533785600 +ptr_L_GCM_generate_m0_aesni_mod2_128 QWORD L_GCM_generate_m0_aesni_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA 
+GCM_generate_m0_aesni PROC + sub rsp, 80 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], xmm7 + movdqu OWORD PTR [rsp+32], xmm8 + movdqu OWORD PTR [rsp+48], xmm9 + movdqu OWORD PTR [rsp+64], xmm10 + movdqu xmm9, OWORD PTR L_GCM_generate_m0_aesni_rev8 + movdqu xmm10, OWORD PTR L_GCM_generate_m0_aesni_mod2_128 + pxor xmm8, xmm8 + movdqu xmm0, OWORD PTR [rcx] + movdqu OWORD PTR [rdx], xmm8 + movdqu xmm8, xmm0 + pshufb xmm0, xmm9 + movdqu xmm5, xmm0 + movdqu xmm4, xmm0 + psllq xmm5, 63 + psrlq xmm4, 1 + movdqu xmm1, xmm5 + pslldq xmm1, 8 + psrldq xmm5, 8 + pshufd xmm1, xmm1, 255 + por xmm4, xmm5 + psrad xmm1, 31 + pand xmm1, xmm10 + pxor xmm1, xmm4 + movdqu xmm5, xmm1 + movdqu xmm4, xmm1 + psllq xmm5, 63 + psrlq xmm4, 1 + movdqu xmm2, xmm5 + pslldq xmm2, 8 + psrldq xmm5, 8 + pshufd xmm2, xmm2, 255 + por xmm4, xmm5 + psrad xmm2, 31 + pand xmm2, xmm10 + pxor xmm2, xmm4 + movdqu xmm5, xmm2 + movdqu xmm4, xmm2 + psllq xmm5, 63 + psrlq xmm4, 1 + movdqu xmm3, xmm5 + pslldq xmm3, 8 + psrldq xmm5, 8 + pshufd xmm3, xmm3, 255 + por xmm4, xmm5 + psrad xmm3, 31 + pand xmm3, xmm10 + pxor xmm3, xmm4 + pshufb xmm3, xmm9 + pshufb xmm2, xmm9 + movdqu xmm8, xmm3 + pshufb xmm1, xmm9 + pshufb xmm0, xmm9 + pxor xmm8, xmm2 + movdqu OWORD PTR [rdx+16], xmm3 + movdqu OWORD PTR [rdx+32], xmm2 + movdqu OWORD PTR [rdx+48], xmm8 + movdqu OWORD PTR [rdx+64], xmm1 + movdqu xmm4, xmm3 + movdqu xmm5, xmm2 + movdqu xmm6, xmm8 + pxor xmm4, xmm1 + pxor xmm5, xmm1 + pxor xmm6, xmm1 + movdqu OWORD PTR [rdx+80], xmm4 + movdqu OWORD PTR [rdx+96], xmm5 + movdqu OWORD PTR [rdx+112], xmm6 + movdqu OWORD PTR [rdx+128], xmm0 + pxor xmm1, xmm0 + movdqu xmm4, xmm3 + movdqu xmm6, xmm2 + pxor xmm4, xmm0 + pxor xmm6, xmm0 + movdqu OWORD PTR [rdx+144], xmm4 + movdqu OWORD PTR [rdx+160], xmm6 + pxor xmm6, xmm3 + movdqu OWORD PTR [rdx+176], xmm6 + movdqu OWORD PTR [rdx+192], xmm1 + movdqu xmm4, xmm3 + movdqu xmm5, xmm2 + movdqu xmm6, xmm8 + pxor xmm4, xmm1 + pxor xmm5, xmm1 + pxor xmm6, xmm1 + movdqu OWORD PTR [rdx+208], xmm4 + movdqu OWORD PTR [rdx+224], xmm5 + movdqu OWORD PTR [rdx+240], xmm6 + movdqu xmm0, OWORD PTR [rdx] + movdqu xmm1, OWORD PTR [rdx+16] + movdqu xmm2, OWORD PTR [rdx+32] + movdqu xmm3, OWORD PTR [rdx+48] + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 + movdqu xmm4, xmm0 + movdqu xmm5, xmm1 + movdqu xmm6, xmm2 + movdqu xmm7, xmm3 + psllq xmm4, 60 + psllq xmm5, 60 + psllq xmm6, 60 + psllq xmm7, 60 + psrlq xmm0, 4 + psrlq xmm1, 4 + psrlq xmm2, 4 + psrlq xmm3, 4 + psrldq xmm4, 8 + psrldq xmm5, 8 + psrldq xmm6, 8 + psrldq xmm7, 8 + por xmm0, xmm4 + por xmm1, xmm5 + por xmm2, xmm6 + por xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + movdqu OWORD PTR [rdx+256], xmm0 + movdqu OWORD PTR [rdx+272], xmm1 + movdqu OWORD PTR [rdx+288], xmm2 + movdqu OWORD PTR [rdx+304], xmm3 + movdqu xmm0, OWORD PTR [rdx+64] + movdqu xmm1, OWORD PTR [rdx+80] + movdqu xmm2, OWORD PTR [rdx+96] + movdqu xmm3, OWORD PTR [rdx+112] + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 + movdqu xmm4, xmm0 + movdqu xmm5, xmm1 + movdqu xmm6, xmm2 + movdqu xmm7, xmm3 + psllq xmm4, 60 + psllq xmm5, 60 + psllq xmm6, 60 + psllq xmm7, 60 + psrlq xmm0, 4 + psrlq xmm1, 4 + psrlq xmm2, 4 + psrlq xmm3, 4 + psrldq xmm4, 8 + psrldq xmm5, 8 + psrldq xmm6, 8 + psrldq xmm7, 8 + por xmm0, xmm4 + por xmm1, xmm5 + por xmm2, xmm6 + por xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb 
xmm3, xmm3, xmm9 + movdqu OWORD PTR [rdx+320], xmm0 + movdqu OWORD PTR [rdx+336], xmm1 + movdqu OWORD PTR [rdx+352], xmm2 + movdqu OWORD PTR [rdx+368], xmm3 + movdqu xmm0, OWORD PTR [rdx+128] + movdqu xmm1, OWORD PTR [rdx+144] + movdqu xmm2, OWORD PTR [rdx+160] + movdqu xmm3, OWORD PTR [rdx+176] + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 + movdqu xmm4, xmm0 + movdqu xmm5, xmm1 + movdqu xmm6, xmm2 + movdqu xmm7, xmm3 + psllq xmm4, 60 + psllq xmm5, 60 + psllq xmm6, 60 + psllq xmm7, 60 + psrlq xmm0, 4 + psrlq xmm1, 4 + psrlq xmm2, 4 + psrlq xmm3, 4 + psrldq xmm4, 8 + psrldq xmm5, 8 + psrldq xmm6, 8 + psrldq xmm7, 8 + por xmm0, xmm4 + por xmm1, xmm5 + por xmm2, xmm6 + por xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + movdqu OWORD PTR [rdx+384], xmm0 + movdqu OWORD PTR [rdx+400], xmm1 + movdqu OWORD PTR [rdx+416], xmm2 + movdqu OWORD PTR [rdx+432], xmm3 + movdqu xmm0, OWORD PTR [rdx+192] + movdqu xmm1, OWORD PTR [rdx+208] + movdqu xmm2, OWORD PTR [rdx+224] + movdqu xmm3, OWORD PTR [rdx+240] + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 + movdqu xmm4, xmm0 + movdqu xmm5, xmm1 + movdqu xmm6, xmm2 + movdqu xmm7, xmm3 + psllq xmm4, 60 + psllq xmm5, 60 + psllq xmm6, 60 + psllq xmm7, 60 + psrlq xmm0, 4 + psrlq xmm1, 4 + psrlq xmm2, 4 + psrlq xmm3, 4 + psrldq xmm4, 8 + psrldq xmm5, 8 + psrldq xmm6, 8 + psrldq xmm7, 8 + por xmm0, xmm4 + por xmm1, xmm5 + por xmm2, xmm6 + por xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + movdqu OWORD PTR [rdx+448], xmm0 + movdqu OWORD PTR [rdx+464], xmm1 + movdqu OWORD PTR [rdx+480], xmm2 + movdqu OWORD PTR [rdx+496], xmm3 + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm8, OWORD PTR [rsp+32] + movdqu xmm9, OWORD PTR [rsp+48] + movdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +GCM_generate_m0_aesni ENDP +_text ENDS _DATA SEGMENT ALIGN 16 L_aes_gcm_one QWORD 0, 1 @@ -6205,6 +6458,225 @@ _text ENDS IFDEF HAVE_INTEL_AVX1 _DATA SEGMENT ALIGN 16 +L_GCM_generate_m0_avx1_rev8 QWORD 579005069656919567, 283686952306183 +ptr_L_GCM_generate_m0_avx1_rev8 QWORD L_GCM_generate_m0_avx1_rev8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_avx1_mod2_128 QWORD 0, 16212958658533785600 +ptr_L_GCM_generate_m0_avx1_mod2_128 QWORD L_GCM_generate_m0_avx1_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA +GCM_generate_m0_avx1 PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_avx1_rev8 + vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_avx1_mod2_128 + vpxor xmm8, xmm8, xmm8 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu xmm8, xmm0 + vpshufb xmm0, xmm0, xmm9 + vpsllq xmm5, xmm0, 63 + vpsrlq xmm4, xmm0, 1 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm1, xmm1, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm1, xmm1, 31 + vpand xmm1, xmm1, xmm10 + vpxor xmm1, xmm1, xmm4 + vpsllq xmm5, xmm1, 63 + vpsrlq xmm4, xmm1, 1 + vpslldq xmm2, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm2, xmm2, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm2, xmm2, 31 + vpand xmm2, xmm2, xmm10 + vpxor xmm2, xmm2, xmm4 + vpsllq xmm5, xmm2, 63 + vpsrlq xmm4, xmm2, 1 + vpslldq xmm3, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm3, xmm3, 255 + vpor xmm4, xmm4, xmm5 + 
vpsrad xmm3, xmm3, 31 + vpand xmm3, xmm3, xmm10 + vpxor xmm3, xmm3, xmm4 + vpshufb xmm3, xmm3, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm0, xmm0, xmm9 + vpxor xmm8, xmm3, xmm2 + vmovdqu OWORD PTR [rdx+16], xmm3 + vmovdqu OWORD PTR [rdx+32], xmm2 + vmovdqu OWORD PTR [rdx+48], xmm8 + vmovdqu OWORD PTR [rdx+64], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+80], xmm4 + vmovdqu OWORD PTR [rdx+96], xmm5 + vmovdqu OWORD PTR [rdx+112], xmm6 + vmovdqu OWORD PTR [rdx+128], xmm0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm3, xmm0 + vpxor xmm6, xmm2, xmm0 + vmovdqu OWORD PTR [rdx+144], xmm4 + vmovdqu OWORD PTR [rdx+160], xmm6 + vpxor xmm6, xmm3, xmm6 + vmovdqu OWORD PTR [rdx+176], xmm6 + vmovdqu OWORD PTR [rdx+192], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+208], xmm4 + vmovdqu OWORD PTR [rdx+224], xmm5 + vmovdqu OWORD PTR [rdx+240], xmm6 + vmovdqu xmm0, OWORD PTR [rdx] + vmovdqu xmm1, OWORD PTR [rdx+16] + vmovdqu xmm2, OWORD PTR [rdx+32] + vmovdqu xmm3, OWORD PTR [rdx+48] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+256], xmm0 + vmovdqu OWORD PTR [rdx+272], xmm1 + vmovdqu OWORD PTR [rdx+288], xmm2 + vmovdqu OWORD PTR [rdx+304], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+64] + vmovdqu xmm1, OWORD PTR [rdx+80] + vmovdqu xmm2, OWORD PTR [rdx+96] + vmovdqu xmm3, OWORD PTR [rdx+112] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+320], xmm0 + vmovdqu OWORD PTR [rdx+336], xmm1 + vmovdqu OWORD PTR [rdx+352], xmm2 + vmovdqu OWORD PTR [rdx+368], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+128] + vmovdqu xmm1, OWORD PTR [rdx+144] + vmovdqu xmm2, OWORD PTR [rdx+160] + vmovdqu xmm3, OWORD PTR [rdx+176] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+384], xmm0 + vmovdqu OWORD PTR [rdx+400], 
xmm1 + vmovdqu OWORD PTR [rdx+416], xmm2 + vmovdqu OWORD PTR [rdx+432], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+192] + vmovdqu xmm1, OWORD PTR [rdx+208] + vmovdqu xmm2, OWORD PTR [rdx+224] + vmovdqu xmm3, OWORD PTR [rdx+240] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+448], xmm0 + vmovdqu OWORD PTR [rdx+464], xmm1 + vmovdqu OWORD PTR [rdx+480], xmm2 + vmovdqu OWORD PTR [rdx+496], xmm3 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +GCM_generate_m0_avx1 ENDP +_text ENDS +_DATA SEGMENT +ALIGN 16 L_avx1_aes_gcm_one QWORD 0, 1 ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one _DATA ENDS @@ -11436,6 +11908,225 @@ ENDIF IFDEF HAVE_INTEL_AVX2 _DATA SEGMENT ALIGN 16 +L_GCM_generate_m0_avx2_rev8 QWORD 579005069656919567, 283686952306183 +ptr_L_GCM_generate_m0_avx2_rev8 QWORD L_GCM_generate_m0_avx2_rev8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_avx2_mod2_128 QWORD 0, 16212958658533785600 +ptr_L_GCM_generate_m0_avx2_mod2_128 QWORD L_GCM_generate_m0_avx2_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA +GCM_generate_m0_avx2 PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_avx2_rev8 + vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_avx2_mod2_128 + vpxor xmm8, xmm8, xmm8 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu xmm8, xmm0 + vpshufb xmm0, xmm0, xmm9 + vpsllq xmm5, xmm0, 63 + vpsrlq xmm4, xmm0, 1 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm1, xmm1, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm1, xmm1, 31 + vpand xmm1, xmm1, xmm10 + vpxor xmm1, xmm1, xmm4 + vpsllq xmm5, xmm1, 63 + vpsrlq xmm4, xmm1, 1 + vpslldq xmm2, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm2, xmm2, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm2, xmm2, 31 + vpand xmm2, xmm2, xmm10 + vpxor xmm2, xmm2, xmm4 + vpsllq xmm5, xmm2, 63 + vpsrlq xmm4, xmm2, 1 + vpslldq xmm3, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm3, xmm3, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm3, xmm3, 31 + vpand xmm3, xmm3, xmm10 + vpxor xmm3, xmm3, xmm4 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpxor xmm8, xmm3, xmm2 + vmovdqu OWORD PTR [rdx+16], xmm3 + vmovdqu OWORD PTR [rdx+32], xmm2 + vmovdqu OWORD PTR [rdx+48], xmm8 + vmovdqu OWORD PTR [rdx+64], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+80], xmm4 + vmovdqu OWORD PTR [rdx+96], xmm5 + vmovdqu OWORD PTR [rdx+112], xmm6 + vmovdqu OWORD PTR [rdx+128], xmm0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm3, xmm0 + vpxor xmm6, xmm2, xmm0 + vmovdqu OWORD PTR [rdx+144], xmm4 + vmovdqu OWORD PTR [rdx+160], xmm6 + vpxor xmm6, xmm3, xmm6 + vmovdqu OWORD PTR [rdx+176], 
xmm6 + vmovdqu OWORD PTR [rdx+192], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+208], xmm4 + vmovdqu OWORD PTR [rdx+224], xmm5 + vmovdqu OWORD PTR [rdx+240], xmm6 + vmovdqu xmm0, OWORD PTR [rdx] + vmovdqu xmm1, OWORD PTR [rdx+16] + vmovdqu xmm2, OWORD PTR [rdx+32] + vmovdqu xmm3, OWORD PTR [rdx+48] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+256], xmm0 + vmovdqu OWORD PTR [rdx+272], xmm1 + vmovdqu OWORD PTR [rdx+288], xmm2 + vmovdqu OWORD PTR [rdx+304], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+64] + vmovdqu xmm1, OWORD PTR [rdx+80] + vmovdqu xmm2, OWORD PTR [rdx+96] + vmovdqu xmm3, OWORD PTR [rdx+112] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+320], xmm0 + vmovdqu OWORD PTR [rdx+336], xmm1 + vmovdqu OWORD PTR [rdx+352], xmm2 + vmovdqu OWORD PTR [rdx+368], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+128] + vmovdqu xmm1, OWORD PTR [rdx+144] + vmovdqu xmm2, OWORD PTR [rdx+160] + vmovdqu xmm3, OWORD PTR [rdx+176] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+384], xmm0 + vmovdqu OWORD PTR [rdx+400], xmm1 + vmovdqu OWORD PTR [rdx+416], xmm2 + vmovdqu OWORD PTR [rdx+432], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+192] + vmovdqu xmm1, OWORD PTR [rdx+208] + vmovdqu xmm2, OWORD PTR [rdx+224] + vmovdqu xmm3, OWORD PTR [rdx+240] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + 
vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+448], xmm0 + vmovdqu OWORD PTR [rdx+464], xmm1 + vmovdqu OWORD PTR [rdx+480], xmm2 + vmovdqu OWORD PTR [rdx+496], xmm3 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +GCM_generate_m0_avx2 ENDP +_text ENDS +_DATA SEGMENT +ALIGN 16 L_avx2_aes_gcm_one QWORD 0, 1 ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one _DATA ENDS diff --git a/wolfcrypt/src/hmac.c b/wolfcrypt/src/hmac.c index cc52010e2..9a7b82384 100644 --- a/wolfcrypt/src/hmac.c +++ b/wolfcrypt/src/hmac.c @@ -155,76 +155,72 @@ int wc_HmacSizeByType(int type) return ret; } -int _InitHmac(Hmac* hmac, int type, void* heap) +static int HmacKeyInitHash(wc_HmacHash* hash, int type, void* heap, int devId) { int ret = 0; -#ifdef WOLF_CRYPTO_CB - int devId = hmac->devId; -#else - int devId = INVALID_DEVID; -#endif + switch (type) { #ifndef NO_MD5 case WC_MD5: - ret = wc_InitMd5_ex(&hmac->hash.md5, heap, devId); + ret = wc_InitMd5_ex(&hash->md5, heap, devId); break; #endif /* !NO_MD5 */ #ifndef NO_SHA case WC_SHA: - ret = wc_InitSha_ex(&hmac->hash.sha, heap, devId); + ret = wc_InitSha_ex(&hash->sha, heap, devId); break; #endif /* !NO_SHA */ #ifdef WOLFSSL_SHA224 case WC_SHA224: - ret = wc_InitSha224_ex(&hmac->hash.sha224, heap, devId); + ret = wc_InitSha224_ex(&hash->sha224, heap, devId); break; #endif /* WOLFSSL_SHA224 */ #ifndef NO_SHA256 case WC_SHA256: - ret = wc_InitSha256_ex(&hmac->hash.sha256, heap, devId); + ret = wc_InitSha256_ex(&hash->sha256, heap, devId); break; #endif /* !NO_SHA256 */ #ifdef WOLFSSL_SHA384 case WC_SHA384: - ret = wc_InitSha384_ex(&hmac->hash.sha384, heap, devId); + ret = wc_InitSha384_ex(&hash->sha384, heap, devId); break; #endif /* WOLFSSL_SHA384 */ #ifdef WOLFSSL_SHA512 case WC_SHA512: - ret = wc_InitSha512_ex(&hmac->hash.sha512, heap, devId); + ret = wc_InitSha512_ex(&hash->sha512, heap, devId); break; #endif /* WOLFSSL_SHA512 */ #ifdef WOLFSSL_SHA3 #ifndef WOLFSSL_NOSHA3_224 case WC_SHA3_224: - ret = wc_InitSha3_224(&hmac->hash.sha3, heap, devId); + ret = wc_InitSha3_224(&hash->sha3, heap, devId); break; #endif #ifndef WOLFSSL_NOSHA3_256 case WC_SHA3_256: - ret = wc_InitSha3_256(&hmac->hash.sha3, heap, devId); + ret = wc_InitSha3_256(&hash->sha3, heap, devId); break; #endif #ifndef WOLFSSL_NOSHA3_384 case WC_SHA3_384: - ret = wc_InitSha3_384(&hmac->hash.sha3, heap, devId); + ret = wc_InitSha3_384(&hash->sha3, heap, devId); break; #endif #ifndef WOLFSSL_NOSHA3_512 case WC_SHA3_512: - ret = wc_InitSha3_512(&hmac->hash.sha3, heap, devId); + ret = wc_InitSha3_512(&hash->sha3, heap, devId); break; #endif #endif #ifdef WOLFSSL_SM3 case WC_SM3: - ret = wc_InitSm3(&hmac->hash.sm3, heap, devId); + ret = wc_InitSm3(&hash->sm3, heap, devId); break; #endif @@ -233,6 +229,22 @@ int _InitHmac(Hmac* hmac, int type, void* heap) break; } + return ret; +} + +int _InitHmac(Hmac* hmac, int type, void* heap) +{ + int ret; +#ifdef WOLF_CRYPTO_CB + int devId = hmac->devId; +#else + int devId = INVALID_DEVID; +#endif + + ret = HmacKeyInitHash(&hmac->hash, type, heap, devId); + if (ret != 0) + return ret; + /* default to NULL heap hint or test value */ #ifdef WOLFSSL_HEAP_TEST hmac->heap = (void*)WOLFSSL_HEAP_TEST; @@ -243,6 +255,158 @@ int _InitHmac(Hmac* hmac, int type, void* heap) return ret; } +#ifdef WOLFSSL_HMAC_COPY_HASH +static int HmacKeyCopyHash(byte macType, wc_HmacHash* src, wc_HmacHash* dst) +{ + int ret 
= 0; + + switch (macType) { + #ifndef NO_MD5 + case WC_MD5: + ret = wc_Md5Copy(&src->md5, &dst->md5); + break; + #endif /* !NO_MD5 */ + + #ifndef NO_SHA + case WC_SHA: + ret = wc_ShaCopy(&src->sha, &dst->sha); + break; + #endif /* !NO_SHA */ + + #ifdef WOLFSSL_SHA224 + case WC_SHA224: + ret = wc_Sha224Copy(&src->sha224, &dst->sha224); + break; + #endif /* WOLFSSL_SHA224 */ + #ifndef NO_SHA256 + case WC_SHA256: + ret = wc_Sha256Copy(&src->sha256, &dst->sha256); + break; + #endif /* !NO_SHA256 */ + + #ifdef WOLFSSL_SHA384 + case WC_SHA384: + ret = wc_Sha384Copy(&src->sha384, &dst->sha384); + break; + #endif /* WOLFSSL_SHA384 */ + #ifdef WOLFSSL_SHA512 + case WC_SHA512: + ret = wc_Sha512Copy(&src->sha512, &dst->sha512); + break; + #endif /* WOLFSSL_SHA512 */ + + #ifdef WOLFSSL_SHA3 + #ifndef WOLFSSL_NOSHA3_224 + case WC_SHA3_224: + ret = wc_Sha3_224_Copy(&src->sha3, &dst->sha3); + break; + #endif + #ifndef WOLFSSL_NOSHA3_256 + case WC_SHA3_256: + ret = wc_Sha3_256_Copy(&src->sha3, &dst->sha3); + break; + #endif + #ifndef WOLFSSL_NOSHA3_384 + case WC_SHA3_384: + ret = wc_Sha3_384_Copy(&src->sha3, &dst->sha3); + break; + #endif + #ifndef WOLFSSL_NOSHA3_512 + case WC_SHA3_512: + ret = wc_Sha3_512_Copy(&src->sha3, &dst->sha3); + break; + #endif + #endif /* WOLFSSL_SHA3 */ + + #ifdef WOLFSSL_SM3 + case WC_SM3: + ret = wc_Sm3Copy(&src->sm3, &dst->sm3); + break; + #endif + + default: + break; + } + + return ret; +} +#endif + +static int HmacKeyHashUpdate(byte macType, wc_HmacHash* hash, byte* pad) +{ + int ret = 0; + + switch (macType) { + #ifndef NO_MD5 + case WC_MD5: + ret = wc_Md5Update(&hash->md5, pad, WC_MD5_BLOCK_SIZE); + break; + #endif /* !NO_MD5 */ + + #ifndef NO_SHA + case WC_SHA: + ret = wc_ShaUpdate(&hash->sha, pad, WC_SHA_BLOCK_SIZE); + break; + #endif /* !NO_SHA */ + + #ifdef WOLFSSL_SHA224 + case WC_SHA224: + ret = wc_Sha224Update(&hash->sha224, pad, WC_SHA224_BLOCK_SIZE); + break; + #endif /* WOLFSSL_SHA224 */ + #ifndef NO_SHA256 + case WC_SHA256: + ret = wc_Sha256Update(&hash->sha256, pad, WC_SHA256_BLOCK_SIZE); + break; + #endif /* !NO_SHA256 */ + + #ifdef WOLFSSL_SHA384 + case WC_SHA384: + ret = wc_Sha384Update(&hash->sha384, pad, WC_SHA384_BLOCK_SIZE); + break; + #endif /* WOLFSSL_SHA384 */ + #ifdef WOLFSSL_SHA512 + case WC_SHA512: + ret = wc_Sha512Update(&hash->sha512, pad, WC_SHA512_BLOCK_SIZE); + break; + #endif /* WOLFSSL_SHA512 */ + + #ifdef WOLFSSL_SHA3 + #ifndef WOLFSSL_NOSHA3_224 + case WC_SHA3_224: + ret = wc_Sha3_224_Update(&hash->sha3, pad, WC_SHA3_224_BLOCK_SIZE); + break; + #endif + #ifndef WOLFSSL_NOSHA3_256 + case WC_SHA3_256: + ret = wc_Sha3_256_Update(&hash->sha3, pad, WC_SHA3_256_BLOCK_SIZE); + break; + #endif + #ifndef WOLFSSL_NOSHA3_384 + case WC_SHA3_384: + ret = wc_Sha3_384_Update(&hash->sha3, pad, WC_SHA3_384_BLOCK_SIZE); + break; + #endif + #ifndef WOLFSSL_NOSHA3_512 + case WC_SHA3_512: + ret = wc_Sha3_512_Update(&hash->sha3, pad, WC_SHA3_512_BLOCK_SIZE); + break; + #endif + #endif /* WOLFSSL_SHA3 */ + + #ifdef WOLFSSL_SM3 + case WC_SM3: + ret = wc_Sm3Update(&hash->sm3, pad, WC_SM3_BLOCK_SIZE); + break; + #endif + + default: + break; + } + + return ret; +} + int wc_HmacSetKey_ex(Hmac* hmac, int type, const byte* key, word32 length, int allowFlag) @@ -603,6 +767,29 @@ int wc_HmacSetKey_ex(Hmac* hmac, int type, const byte* key, word32 length, } } +#ifdef WOLFSSL_HMAC_COPY_HASH + if ( ret == 0) { + #ifdef WOLF_CRYPTO_CB + int devId = hmac->devId; + #else + int devId = INVALID_DEVID; + #endif + + ret = HmacKeyInitHash(&hmac->i_hash, hmac->macType, heap, 
devId); + if (ret != 0) + return ret; + ret = HmacKeyInitHash(&hmac->o_hash, hmac->macType, heap, devId); + if (ret != 0) + return ret; + ret = HmacKeyHashUpdate(hmac->macType, &hmac->i_hash, ip); + if (ret != 0) + return ret; + ret = HmacKeyHashUpdate(hmac->macType, &hmac->o_hash, op); + if (ret != 0) + return ret; + } +#endif + return ret; #endif /* WOLFSSL_MAXQ108X */ } @@ -618,96 +805,6 @@ int wc_HmacSetKey(Hmac* hmac, int type, const byte* key, word32 length) return wc_HmacSetKey_ex(hmac, type, key, length, allowFlag); } -static int HmacKeyInnerHash(Hmac* hmac) -{ - int ret = 0; - - switch (hmac->macType) { - #ifndef NO_MD5 - case WC_MD5: - ret = wc_Md5Update(&hmac->hash.md5, (byte*)hmac->ipad, - WC_MD5_BLOCK_SIZE); - break; - #endif /* !NO_MD5 */ - - #ifndef NO_SHA - case WC_SHA: - ret = wc_ShaUpdate(&hmac->hash.sha, (byte*)hmac->ipad, - WC_SHA_BLOCK_SIZE); - break; - #endif /* !NO_SHA */ - - #ifdef WOLFSSL_SHA224 - case WC_SHA224: - ret = wc_Sha224Update(&hmac->hash.sha224, (byte*)hmac->ipad, - WC_SHA224_BLOCK_SIZE); - break; - #endif /* WOLFSSL_SHA224 */ - #ifndef NO_SHA256 - case WC_SHA256: - ret = wc_Sha256Update(&hmac->hash.sha256, (byte*)hmac->ipad, - WC_SHA256_BLOCK_SIZE); - break; - #endif /* !NO_SHA256 */ - - #ifdef WOLFSSL_SHA384 - case WC_SHA384: - ret = wc_Sha384Update(&hmac->hash.sha384, (byte*)hmac->ipad, - WC_SHA384_BLOCK_SIZE); - break; - #endif /* WOLFSSL_SHA384 */ - #ifdef WOLFSSL_SHA512 - case WC_SHA512: - ret = wc_Sha512Update(&hmac->hash.sha512, (byte*)hmac->ipad, - WC_SHA512_BLOCK_SIZE); - break; - #endif /* WOLFSSL_SHA512 */ - - #ifdef WOLFSSL_SHA3 - #ifndef WOLFSSL_NOSHA3_224 - case WC_SHA3_224: - ret = wc_Sha3_224_Update(&hmac->hash.sha3, (byte*)hmac->ipad, - WC_SHA3_224_BLOCK_SIZE); - break; - #endif - #ifndef WOLFSSL_NOSHA3_256 - case WC_SHA3_256: - ret = wc_Sha3_256_Update(&hmac->hash.sha3, (byte*)hmac->ipad, - WC_SHA3_256_BLOCK_SIZE); - break; - #endif - #ifndef WOLFSSL_NOSHA3_384 - case WC_SHA3_384: - ret = wc_Sha3_384_Update(&hmac->hash.sha3, (byte*)hmac->ipad, - WC_SHA3_384_BLOCK_SIZE); - break; - #endif - #ifndef WOLFSSL_NOSHA3_512 - case WC_SHA3_512: - ret = wc_Sha3_512_Update(&hmac->hash.sha3, (byte*)hmac->ipad, - WC_SHA3_512_BLOCK_SIZE); - break; - #endif - #endif /* WOLFSSL_SHA3 */ - - #ifdef WOLFSSL_SM3 - case WC_SM3: - ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->ipad, - WC_SM3_BLOCK_SIZE); - break; - #endif - - default: - break; - } - - if (ret == 0) - hmac->innerHashKeyed = WC_HMAC_INNER_HASH_KEYED_SW; - - return ret; -} - - int wc_HmacUpdate(Hmac* hmac, const byte* msg, word32 length) { int ret = 0; @@ -739,9 +836,14 @@ int wc_HmacUpdate(Hmac* hmac, const byte* msg, word32 length) #endif /* WOLFSSL_ASYNC_CRYPT */ if (!hmac->innerHashKeyed) { - ret = HmacKeyInnerHash(hmac); +#ifndef WOLFSSL_HMAC_COPY_HASH + ret = HmacKeyHashUpdate(hmac->macType, &hmac->hash, (byte*)hmac->ipad); +#else + ret = HmacKeyCopyHash(hmac->macType, &hmac->i_hash, &hmac->hash); +#endif if (ret != 0) return ret; + hmac->innerHashKeyed = WC_HMAC_INNER_HASH_KEYED_SW; } switch (hmac->macType) { @@ -851,9 +953,14 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) #endif /* WOLFSSL_ASYNC_CRYPT */ if (!hmac->innerHashKeyed) { - ret = HmacKeyInnerHash(hmac); +#ifndef WOLFSSL_HMAC_COPY_HASH + ret = HmacKeyHashUpdate(hmac->macType, &hmac->hash, (byte*)hmac->ipad); +#else + ret = HmacKeyCopyHash(hmac->macType, &hmac->i_hash, &hmac->hash); +#endif if (ret != 0) return ret; + hmac->innerHashKeyed = WC_HMAC_INNER_HASH_KEYED_SW; } switch (hmac->macType) { @@ -862,8 +969,12 @@ int 
wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Md5Final(&hmac->hash.md5, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Md5Update(&hmac->hash.md5, (byte*)hmac->opad, WC_MD5_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_MD5, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Md5Update(&hmac->hash.md5, (byte*)hmac->innerHash, @@ -879,8 +990,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_ShaFinal(&hmac->hash.sha, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_ShaUpdate(&hmac->hash.sha, (byte*)hmac->opad, WC_SHA_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_ShaUpdate(&hmac->hash.sha, (byte*)hmac->innerHash, @@ -896,8 +1011,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha224Final(&hmac->hash.sha224, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha224Update(&hmac->hash.sha224, (byte*)hmac->opad, WC_SHA224_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA224, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha224Update(&hmac->hash.sha224, (byte*)hmac->innerHash, @@ -914,8 +1033,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha256Final(&hmac->hash.sha256, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha256Update(&hmac->hash.sha256, (byte*)hmac->opad, WC_SHA256_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA256, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha256Update(&hmac->hash.sha256, (byte*)hmac->innerHash, @@ -931,8 +1054,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha384Final(&hmac->hash.sha384, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha384Update(&hmac->hash.sha384, (byte*)hmac->opad, WC_SHA384_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA384, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha384Update(&hmac->hash.sha384, (byte*)hmac->innerHash, @@ -947,8 +1074,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha512Final(&hmac->hash.sha512, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha512Update(&hmac->hash.sha512, (byte*)hmac->opad, WC_SHA512_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA512, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha512Update(&hmac->hash.sha512, (byte*)hmac->innerHash, @@ -965,8 +1096,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha3_224_Final(&hmac->hash.sha3, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha3_224_Update(&hmac->hash.sha3, (byte*)hmac->opad, WC_SHA3_224_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA3_224, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha3_224_Update(&hmac->hash.sha3, (byte*)hmac->innerHash, @@ -981,8 +1116,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha3_256_Final(&hmac->hash.sha3, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha3_256_Update(&hmac->hash.sha3, (byte*)hmac->opad, WC_SHA3_256_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA3_256, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha3_256_Update(&hmac->hash.sha3, (byte*)hmac->innerHash, @@ -997,8 +1136,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha3_384_Final(&hmac->hash.sha3, (byte*)hmac->innerHash); if 
(ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha3_384_Update(&hmac->hash.sha3, (byte*)hmac->opad, WC_SHA3_384_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA3_384, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha3_384_Update(&hmac->hash.sha3, (byte*)hmac->innerHash, @@ -1013,8 +1156,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sha3_512_Final(&hmac->hash.sha3, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sha3_512_Update(&hmac->hash.sha3, (byte*)hmac->opad, WC_SHA3_512_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SHA3_512, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sha3_512_Update(&hmac->hash.sha3, (byte*)hmac->innerHash, @@ -1031,8 +1178,12 @@ int wc_HmacFinal(Hmac* hmac, byte* hash) ret = wc_Sm3Final(&hmac->hash.sm3, (byte*)hmac->innerHash); if (ret != 0) break; + #ifndef WOLFSSL_HMAC_COPY_HASH ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->opad, WC_SM3_BLOCK_SIZE); + #else + ret = HmacKeyCopyHash(WC_SM3, &hmac->o_hash, &hmac->hash); + #endif if (ret != 0) break; ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->innerHash, @@ -1163,34 +1314,58 @@ void wc_HmacFree(Hmac* hmac) #ifndef NO_MD5 case WC_MD5: wc_Md5Free(&hmac->hash.md5); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Md5Free(&hmac->i_hash.md5); + wc_Md5Free(&hmac->o_hash.md5); + #endif break; #endif /* !NO_MD5 */ #ifndef NO_SHA case WC_SHA: wc_ShaFree(&hmac->hash.sha); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_ShaFree(&hmac->i_hash.sha); + wc_ShaFree(&hmac->o_hash.sha); + #endif break; #endif /* !NO_SHA */ #ifdef WOLFSSL_SHA224 case WC_SHA224: wc_Sha224Free(&hmac->hash.sha224); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha224Free(&hmac->i_hash.sha224); + wc_Sha224Free(&hmac->o_hash.sha224); + #endif break; #endif /* WOLFSSL_SHA224 */ #ifndef NO_SHA256 case WC_SHA256: wc_Sha256Free(&hmac->hash.sha256); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha256Free(&hmac->i_hash.sha256); + wc_Sha256Free(&hmac->o_hash.sha256); + #endif break; #endif /* !NO_SHA256 */ #ifdef WOLFSSL_SHA384 case WC_SHA384: wc_Sha384Free(&hmac->hash.sha384); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha384Free(&hmac->i_hash.sha384); + wc_Sha384Free(&hmac->o_hash.sha384); + #endif break; #endif /* WOLFSSL_SHA384 */ #ifdef WOLFSSL_SHA512 case WC_SHA512: wc_Sha512Free(&hmac->hash.sha512); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha512Free(&hmac->i_hash.sha512); + wc_Sha512Free(&hmac->o_hash.sha512); + #endif break; #endif /* WOLFSSL_SHA512 */ @@ -1198,21 +1373,37 @@ void wc_HmacFree(Hmac* hmac) #ifndef WOLFSSL_NOSHA3_224 case WC_SHA3_224: wc_Sha3_224_Free(&hmac->hash.sha3); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha3_224_Free(&hmac->i_hash.sha3); + wc_Sha3_224_Free(&hmac->o_hash.sha3); + #endif break; #endif #ifndef WOLFSSL_NOSHA3_256 case WC_SHA3_256: wc_Sha3_256_Free(&hmac->hash.sha3); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha3_256_Free(&hmac->i_hash.sha3); + wc_Sha3_256_Free(&hmac->o_hash.sha3); + #endif break; #endif #ifndef WOLFSSL_NOSHA3_384 case WC_SHA3_384: wc_Sha3_384_Free(&hmac->hash.sha3); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha3_384_Free(&hmac->i_hash.sha3); + wc_Sha3_384_Free(&hmac->o_hash.sha3); + #endif break; #endif #ifndef WOLFSSL_NOSHA3_512 case WC_SHA3_512: wc_Sha3_512_Free(&hmac->hash.sha3); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sha3_512_Free(&hmac->i_hash.sha3); + wc_Sha3_512_Free(&hmac->o_hash.sha3); + #endif break; #endif #endif /* WOLFSSL_SHA3 */ @@ -1220,6 +1411,10 @@ void wc_HmacFree(Hmac* hmac) #ifdef WOLFSSL_SM3 case WC_SM3: 
wc_Sm3Free(&hmac->hash.sm3); + #ifdef WOLFSSL_HMAC_COPY_HASH + wc_Sm3Free(&hmac->i_hash.sm3); + wc_Sm3Free(&hmac->o_hash.sm3); + #endif break; #endif diff --git a/wolfssl/wolfcrypt/hmac.h b/wolfssl/wolfcrypt/hmac.h index a4d9bb546..96da94c6c 100644 --- a/wolfssl/wolfcrypt/hmac.h +++ b/wolfssl/wolfcrypt/hmac.h @@ -124,6 +124,10 @@ typedef wc_Hashes wc_HmacHash; /* Hmac digest */ struct Hmac { wc_HmacHash hash; +#ifdef WOLFSSL_HMAC_COPY_HASH + wc_HmacHash i_hash; + wc_HmacHash o_hash; +#endif word32 ipad[WC_HMAC_BLOCK_SIZE / sizeof(word32)]; /* same block size all*/ word32 opad[WC_HMAC_BLOCK_SIZE / sizeof(word32)]; word32 innerHash[WC_MAX_DIGEST_SIZE / sizeof(word32)];
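The core idea of the WOLFSSL_HMAC_COPY_HASH path: wc_HmacSetKey_ex absorbs ipad into i_hash and opad into o_hash once per key, and wc_HmacUpdate/wc_HmacFinal then restore the keyed state via the wc_*Copy routines instead of hashing the block-sized pad again, saving one compression-function call per pad for every MAC. A minimal standalone sketch of the same pattern over the public SHA-256 API (the helper names keyed_state_init/keyed_state_mac are illustrative only, not part of this patch):

    #include <wolfssl/options.h>            /* configure-generated settings */
    #include <wolfssl/wolfcrypt/sha256.h>

    /* Absorb the block-sized inner pad once; this midstate is the per-key
     * work that the patch caches in hmac->i_hash. */
    static int keyed_state_init(wc_Sha256* base, const byte* ipad)
    {
        int ret = wc_InitSha256(base);
        if (ret == 0)
            ret = wc_Sha256Update(base, ipad, WC_SHA256_BLOCK_SIZE);
        return ret;
    }

    /* Per message: clone the cached midstate instead of re-hashing ipad. */
    static int keyed_state_mac(wc_Sha256* base, const byte* msg,
                               word32 msgLen, byte* digest)
    {
        wc_Sha256 work;
        int ret = wc_Sha256Copy(base, &work);   /* restore keyed state */
        if (ret == 0)
            ret = wc_Sha256Update(&work, msg, msgLen);
        if (ret == 0)
            ret = wc_Sha256Final(&work, digest); /* inner hash only */
        wc_Sha256Free(&work);
        return ret;
    }

A full HMAC repeats the same clone step with the cached opad state (o_hash in the patch) over the inner digest before the final output.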
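Callers see no API change: whether WOLFSSL_HMAC_COPY_HASH is defined only affects what wc_HmacSetKey precomputes and what wc_HmacFree releases. A caller-side sketch using the unchanged public API (hmac_sha256 is a hypothetical helper, and a default build with SHA-256 enabled is assumed):

    #include <wolfssl/options.h>
    #include <wolfssl/wolfcrypt/hmac.h>

    /* Compute HMAC-SHA256 over one message; identical source with or
     * without the copy-hash build option. */
    int hmac_sha256(const byte* key, word32 keyLen,
                    const byte* msg, word32 msgLen,
                    byte digest[WC_SHA256_DIGEST_SIZE])
    {
        Hmac hmac;
        int ret = wc_HmacInit(&hmac, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_HmacSetKey(&hmac, WC_SHA256, key, keyLen);
        if (ret == 0)
            ret = wc_HmacUpdate(&hmac, msg, msgLen);
        if (ret == 0)
            ret = wc_HmacFinal(&hmac, digest);
        wc_HmacFree(&hmac);  /* also frees i_hash/o_hash when enabled */
        return ret;
    }

The benefit is most visible when many short MACs are computed under one key, since the two pad compressions are paid once at wc_HmacSetKey rather than on every wc_HmacUpdate/wc_HmacFinal cycle.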