From 643f472cfb45f54be9cdfb8fe838f00e6853ee49 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 14 May 2024 15:17:50 +1000 Subject: [PATCH] AES-XTS ASM x64: Add Intel x64 implementation of streaming Changed APIs from wc_AesXts*Start -> wc_AesXts*Init. Enabled ASM for x64 in aes.c. AesXtsDecryptStart_sw same as AesXtsEncryptStart_sw so changed them to AesXtsInit_sw. --- wolfcrypt/src/aes.c | 73 +- wolfcrypt/src/aes_xts_asm.S | 1339 +++++++++++++++++++++++++++++++- wolfcrypt/src/aes_xts_asm.asm | 1359 ++++++++++++++++++++++++++++++++- wolfssl/wolfcrypt/aes.h | 4 +- 4 files changed, 2724 insertions(+), 51 deletions(-) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index cce13b1a8..796683234 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -12530,9 +12530,9 @@ void AES_XTS_encrypt_aesni(const unsigned char *in, unsigned char *out, word32 s const unsigned char* key2, int nr) XASM_LINK("AES_XTS_encrypt_aesni"); #ifdef WOLFSSL_AESXTS_STREAM -void AES_XTS_encrypt_start_aesni(unsigned char* i, const unsigned char* tweak_key, +void AES_XTS_init_aesni(unsigned char* i, const unsigned char* tweak_key, int tweak_nr) - XASM_LINK("AES_XTS_encrypt_start_aesni"); + XASM_LINK("AES_XTS_init_aesni"); void AES_XTS_encrypt_update_aesni(const unsigned char *in, unsigned char *out, word32 sz, const unsigned char* key, unsigned char *i, int nr) XASM_LINK("AES_XTS_encrypt_update_aesni"); @@ -12544,9 +12544,9 @@ void AES_XTS_encrypt_avx1(const unsigned char *in, unsigned char *out, int nr) XASM_LINK("AES_XTS_encrypt_avx1"); #ifdef WOLFSSL_AESXTS_STREAM -void AES_XTS_encrypt_start_avx1(unsigned char* i, const unsigned char* tweak_key, +void AES_XTS_init_avx1(unsigned char* i, const unsigned char* tweak_key, int tweak_nr) - XASM_LINK("AES_XTS_encrypt_start_avx1"); + XASM_LINK("AES_XTS_init_avx1"); void AES_XTS_encrypt_update_avx1(const unsigned char *in, unsigned char *out, word32 sz, const unsigned char* key, unsigned char *i, int nr) XASM_LINK("AES_XTS_encrypt_update_avx1"); @@ -12559,9 +12559,6 @@ void AES_XTS_decrypt_aesni(const unsigned char *in, unsigned char *out, word32 s const unsigned char* key2, int nr) XASM_LINK("AES_XTS_decrypt_aesni"); #ifdef WOLFSSL_AESXTS_STREAM -void AES_XTS_decrypt_start_aesni(unsigned char* i, const unsigned char* tweak_key, - int tweak_nr) - XASM_LINK("AES_XTS_decrypt_start_aesni"); void AES_XTS_decrypt_update_aesni(const unsigned char *in, unsigned char *out, word32 sz, const unsigned char* key, unsigned char *i, int nr) XASM_LINK("AES_XTS_decrypt_update_aesni"); @@ -12573,9 +12570,6 @@ void AES_XTS_decrypt_avx1(const unsigned char *in, unsigned char *out, int nr) XASM_LINK("AES_XTS_decrypt_avx1"); #ifdef WOLFSSL_AESXTS_STREAM -void AES_XTS_decrypt_start_avx1(unsigned char* i, const unsigned char* tweak_key, - int tweak_nr) - XASM_LINK("AES_XTS_decrypt_start_avx1"); void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, word32 sz, const unsigned char* key, unsigned char *i, int nr) XASM_LINK("AES_XTS_decrypt_update_avx1"); @@ -12732,7 +12726,7 @@ static int AesXtsEncrypt_sw(XtsAes* xaes, byte* out, const byte* in, word32 sz, * * returns 0 on success */ -static int AesXtsEncryptStart_sw(XtsAes* xaes, byte* i) { +static int AesXtsInit_sw(XtsAes* xaes, byte* i) { return wc_AesEncryptDirect(&xaes->tweak, i, i); } @@ -12916,7 +12910,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESXTS_STREAM -int wc_AesXtsEncryptStart(XtsAes* xaes, byte* i, word32 iSz) +int wc_AesXtsEncryptInit(XtsAes* 
xaes, byte* i, word32 iSz) { int ret; @@ -12942,30 +12936,28 @@ int wc_AesXtsEncryptStart(XtsAes* xaes, byte* i, word32 iSz) } { -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { - AES_XTS_encrypt_start_avx1(i, - (const byte*)xaes->tweak.key, - (int)xaes->tweak.rounds); + AES_XTS_init_avx1(i, (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); ret = 0; } else #endif { - AES_XTS_encrypt_start_aesni(i, - (const byte*)xaes->tweak.key, - (int)xaes->tweak.rounds); + AES_XTS_init_aesni(i, (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); ret = 0; } RESTORE_VECTOR_REGISTERS(); } else -#endif /* 0 && defined(WOLFSSL_AESNI) */ +#endif /* WOLFSSL_AESNI */ { - ret = AesXtsEncryptStart_sw(xaes, i); + ret = AesXtsInit_sw(xaes, i); } } @@ -12989,7 +12981,7 @@ int wc_AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, { int ret; -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI Aes *aes; #endif @@ -12997,7 +12989,7 @@ int wc_AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, return BAD_FUNC_ARG; } -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI aes = &xaes->aes; #endif @@ -13007,7 +12999,7 @@ int wc_AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, } { -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -13030,7 +13022,7 @@ int wc_AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else -#endif /* 0 && defined(WOLFSSL_AESNI) */ +#endif /* WOLFSSL_AESNI */ { ret = AesXtsEncryptUpdate_sw(xaes, out, in, sz, i); } @@ -13171,11 +13163,6 @@ static int AesXtsDecrypt_sw(XtsAes* xaes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESXTS_STREAM -static int AesXtsDecryptStart_sw(XtsAes* xaes, byte* i) -{ - return wc_AesEncryptDirect(&xaes->tweak, i, i); -} - /* Block-streaming AES-XTS. * * Same process as encryption but use decrypt key. 
@@ -13402,7 +13389,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, * * returns 0 on success */ -int wc_AesXtsDecryptStart(XtsAes* xaes, byte* i, word32 iSz) +int wc_AesXtsDecryptInit(XtsAes* xaes, byte* i, word32 iSz) { int ret; Aes *aes; @@ -13427,30 +13414,28 @@ int wc_AesXtsDecryptStart(XtsAes* xaes, byte* i, word32 iSz) } { -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { - AES_XTS_decrypt_start_avx1(i, - (const byte*)xaes->tweak.key, - (int)xaes->tweak.rounds); + AES_XTS_init_avx1(i, (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); ret = 0; } else #endif { - AES_XTS_decrypt_start_aesni(i, - (const byte*)xaes->tweak.key, - (int)xaes->tweak.rounds); + AES_XTS_init_aesni(i, (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); ret = 0; } RESTORE_VECTOR_REGISTERS(); } else -#endif /* 0 && defined(WOLFSSL_AESNI) */ +#endif /* WOLFSSL_AESNI */ { - ret = AesXtsDecryptStart_sw(xaes, i); + ret = AesXtsInit_sw(xaes, i); } } @@ -13472,7 +13457,7 @@ int wc_AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, byte *i) { int ret; -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI Aes *aes; #endif @@ -13480,7 +13465,7 @@ int wc_AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, return BAD_FUNC_ARG; } -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI #ifdef WC_AES_XTS_SUPPORT_SIMULTANEOUS_ENC_AND_DEC_KEYS aes = &xaes->aes_decrypt; #else @@ -13494,7 +13479,7 @@ int wc_AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, } { -#if 0 && defined(WOLFSSL_AESNI) +#ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -13517,7 +13502,7 @@ int wc_AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else -#endif /* 0 && defined(WOLFSSL_AESNI) */ +#endif /* WOLFSSL_AESNI */ { ret = AesXtsDecryptUpdate_sw(xaes, out, in, sz, i); } diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S index fedead84f..f65c01525 100644 --- a/wolfcrypt/src/aes_xts_asm.S +++ b/wolfcrypt/src/aes_xts_asm.S @@ -1,6 +1,6 @@ /* aes_xts_asm.S */ /* - * Copyright (C) 2006-2023 wolfSSL Inc. + * Copyright (C) 2006-2024 wolfSSL Inc. * * This file is part of wolfSSL. 
* @@ -48,6 +48,59 @@ #ifdef WOLFSSL_AES_XTS #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ +.text +.globl AES_XTS_init_aesni +.type AES_XTS_init_aesni,@function +.align 16 +AES_XTS_init_aesni: +#else +.section __TEXT,__text +.globl _AES_XTS_init_aesni +.p2align 4 +_AES_XTS_init_aesni: +#endif /* __APPLE__ */ + movdqu (%rdi), %xmm0 + # aes_enc_block + pxor (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 32(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 48(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 64(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 80(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 96(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 112(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 128(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + movdqu 144(%rsi), %xmm2 + aesenc %xmm2, %xmm0 + cmpl $11, %edx + movdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_aesni_tweak_aes_enc_block_last + aesenc %xmm2, %xmm0 + movdqu 176(%rsi), %xmm3 + aesenc %xmm3, %xmm0 + cmpl $13, %edx + movdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_aesni_tweak_aes_enc_block_last + aesenc %xmm2, %xmm0 + movdqu 208(%rsi), %xmm3 + aesenc %xmm3, %xmm0 + movdqu 224(%rsi), %xmm2 +L_AES_XTS_init_aesni_tweak_aes_enc_block_last: + aesenclast %xmm2, %xmm0 + movdqu %xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_aesni,.-AES_XTS_init_aesni +#endif /* __APPLE__ */ +#ifndef __APPLE__ .data #else .section __DATA,__data @@ -378,6 +431,291 @@ L_AES_XTS_encrypt_aesni_done_enc: #endif /* __APPLE__ */ #ifndef __APPLE__ .text +.globl AES_XTS_encrypt_update_aesni +.type AES_XTS_encrypt_update_aesni,@function +.align 16 +AES_XTS_encrypt_update_aesni: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_aesni +.p2align 4 +_AES_XTS_encrypt_update_aesni: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $0x40, %rsp + movdqu L_aes_xts_gc_xts(%rip), %xmm12 + movdqu (%r8), %xmm0 + xorl %r12d, %r12d + cmpl $0x40, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_aesni_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_encrypt_update_aesni_enc_64: + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + movdqu (%rcx), %xmm8 + movdqu 16(%rcx), %xmm9 + movdqu 32(%rcx), %xmm10 + movdqu 48(%rcx), %xmm11 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm1 + psrad $31, %xmm4 + pslld $0x01, %xmm1 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm2 + psrad $31, %xmm4 + pslld $0x01, %xmm2 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm3 + psrad $31, %xmm4 + pslld $0x01, %xmm3 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm3 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + # aes_enc_block + movdqu (%r10), %xmm4 + pxor %xmm4, %xmm8 + pxor %xmm4, %xmm9 + pxor %xmm4, %xmm10 + pxor %xmm4, %xmm11 + movdqu 16(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 32(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 48(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 64(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 80(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 96(%r10), %xmm4 + aesenc %xmm4, %xmm8 + 
aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 112(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 128(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 144(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + cmpl $11, %r9d + movdqu 160(%r10), %xmm4 + jl L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 176(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + cmpl $13, %r9d + movdqu 192(%r10), %xmm4 + jl L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 208(%r10), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 224(%r10), %xmm4 +L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last: + aesenclast %xmm4, %xmm8 + aesenclast %xmm4, %xmm9 + aesenclast %xmm4, %xmm10 + aesenclast %xmm4, %xmm11 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + movdqu %xmm8, (%rdx) + movdqu %xmm9, 16(%rdx) + movdqu %xmm10, 32(%rdx) + movdqu %xmm11, 48(%rdx) + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm0 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $0x40, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_aesni_enc_64 +L_AES_XTS_encrypt_update_aesni_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_aesni_done_enc + subl %r12d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_aesni_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_update_aesni_enc_16: + leaq (%rdi,%r12,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm0, %xmm8 + # aes_enc_block + pxor (%r10), %xmm8 + movdqu 16(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 32(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 48(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 64(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 80(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 96(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 112(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 128(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 144(%r10), %xmm5 + aesenc %xmm5, %xmm8 + cmpl $11, %r9d + movdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_aesni_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 176(%r10), %xmm6 + aesenc %xmm6, %xmm8 + cmpl $13, %r9d + movdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_aesni_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 208(%r10), %xmm6 + aesenc %xmm6, %xmm8 + movdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_aesni_aes_enc_block_last: + aesenclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq (%rsi,%r12,1), %rcx + movdqu %xmm8, (%rcx) + movdqa %xmm0, %xmm4 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_aesni_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_aesni_done_enc +L_AES_XTS_encrypt_update_aesni_last_15: + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + movdqu (%rcx), %xmm8 + addq $16, %r12 + movdqu %xmm8, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_aesni_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb 
(%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_aesni_last_15_byte_loop + subq %rdx, %r12 + movdqu (%rsp), %xmm8 + subq $16, %r12 + pxor %xmm0, %xmm8 + # aes_enc_block + pxor (%r10), %xmm8 + movdqu 16(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 32(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 48(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 64(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 80(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 96(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 112(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 128(%r10), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 144(%r10), %xmm5 + aesenc %xmm5, %xmm8 + cmpl $11, %r9d + movdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 176(%r10), %xmm6 + aesenc %xmm6, %xmm8 + cmpl $13, %r9d + movdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 208(%r10), %xmm6 + aesenc %xmm6, %xmm8 + movdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last: + aesenclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq (%rsi,%r12,1), %rcx + movdqu %xmm8, (%rcx) +L_AES_XTS_encrypt_update_aesni_done_enc: + movdqu %xmm0, (%r8) + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_aesni,.-AES_XTS_encrypt_update_aesni +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text .globl AES_XTS_decrypt_aesni .type AES_XTS_decrypt_aesni,@function .align 16 @@ -752,8 +1090,401 @@ L_AES_XTS_decrypt_aesni_done_dec: #ifndef __APPLE__ .size AES_XTS_decrypt_aesni,.-AES_XTS_decrypt_aesni #endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_aesni +.type AES_XTS_decrypt_update_aesni,@function +.align 16 +AES_XTS_decrypt_update_aesni: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_update_aesni +.p2align 4 +_AES_XTS_decrypt_update_aesni: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $16, %rsp + movdqu L_aes_xts_gc_xts(%rip), %xmm12 + movdqu (%r8), %xmm0 + xorl %r12d, %r12d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_aesni_mul16_64 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_aesni_last_31_start +L_AES_XTS_decrypt_update_aesni_mul16_64: + cmpl $0x40, %r11d + jl L_AES_XTS_decrypt_update_aesni_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_decrypt_update_aesni_dec_64: + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + movdqu (%rcx), %xmm8 + movdqu 16(%rcx), %xmm9 + movdqu 32(%rcx), %xmm10 + movdqu 48(%rcx), %xmm11 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm1 + psrad $31, %xmm4 + pslld $0x01, %xmm1 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm2 + psrad $31, %xmm4 + pslld $0x01, %xmm2 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm3 + psrad $31, %xmm4 + pslld $0x01, %xmm3 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm3 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + # aes_dec_block + movdqu (%r10), %xmm4 + pxor %xmm4, %xmm8 + pxor %xmm4, %xmm9 + pxor %xmm4, %xmm10 + pxor %xmm4, %xmm11 + movdqu 16(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 32(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec 
%xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 48(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 64(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 80(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 96(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 112(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 128(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 144(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + cmpl $11, %r9d + movdqu 160(%r10), %xmm4 + jl L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 176(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + cmpl $13, %r9d + movdqu 192(%r10), %xmm4 + jl L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 208(%r10), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 224(%r10), %xmm4 +L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last: + aesdeclast %xmm4, %xmm8 + aesdeclast %xmm4, %xmm9 + aesdeclast %xmm4, %xmm10 + aesdeclast %xmm4, %xmm11 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + movdqu %xmm8, (%rdx) + movdqu %xmm9, 16(%rdx) + movdqu %xmm10, 32(%rdx) + movdqu %xmm11, 48(%rdx) + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm0 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $0x40, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_aesni_dec_64 +L_AES_XTS_decrypt_update_aesni_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_aesni_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_aesni_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_aesni_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_aesni_mul16: +L_AES_XTS_decrypt_update_aesni_dec_16: + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm0, %xmm8 + # aes_dec_block + pxor (%r10), %xmm8 + movdqu 16(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r10), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r9d + movdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r10), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r9d + movdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r10), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_aesni_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq 
(%rsi,%r12,1), %rcx + movdqu %xmm8, (%rcx) + movdqa %xmm0, %xmm4 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_aesni_dec_16 + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_aesni_done_dec +L_AES_XTS_decrypt_update_aesni_last_31_start: + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm7 + psrad $31, %xmm4 + pslld $0x01, %xmm7 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm7 + leaq (%rdi,%r12,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm7, %xmm8 + # aes_dec_block + pxor (%r10), %xmm8 + movdqu 16(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r10), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r9d + movdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r10), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r9d + movdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r10), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm7, %xmm8 + movdqu %xmm8, (%rsp) + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_aesni_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_aesni_last_31_byte_loop + subq %rdx, %r12 + movdqu (%rsp), %xmm8 + pxor %xmm0, %xmm8 + # aes_dec_block + pxor (%r10), %xmm8 + movdqu 16(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r10), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r10), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r9d + movdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r10), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r9d + movdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r10), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + movdqu %xmm8, (%rcx) +L_AES_XTS_decrypt_update_aesni_done_dec: + movdqu %xmm0, (%r8) + addq $16, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_aesni,.-AES_XTS_decrypt_update_aesni +#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX1 #ifndef __APPLE__ +.text +.globl AES_XTS_init_avx1 +.type AES_XTS_init_avx1,@function +.align 16 +AES_XTS_init_avx1: +#else +.section __TEXT,__text +.globl _AES_XTS_init_avx1 +.p2align 4 +_AES_XTS_init_avx1: +#endif /* __APPLE__ */ + movl %edx, %eax + vmovdqu (%rdi), %xmm0 + # aes_enc_block + vpxor (%rsi), %xmm0, %xmm0 + vmovdqu 16(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + 
vmovdqu 32(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 48(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 64(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 80(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 96(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 112(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 128(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 144(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + cmpl $11, %eax + vmovdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_avx1_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 176(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + cmpl $13, %eax + vmovdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_avx1_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 208(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + vmovdqu 224(%rsi), %xmm2 +L_AES_XTS_init_avx1_tweak_aes_enc_block_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_avx1,.-AES_XTS_init_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ .data #else .section __DATA,__data @@ -1066,7 +1797,6 @@ L_AES_XTS_encrypt_avx1_last_15_aes_enc_block_last: leaq (%rsi,%r13,1), %rcx vmovdqu %xmm8, (%rcx) L_AES_XTS_encrypt_avx1_done_enc: - vzeroupper addq $0x40, %rsp popq %r13 popq %r12 @@ -1076,6 +1806,282 @@ L_AES_XTS_encrypt_avx1_done_enc: #endif /* __APPLE__ */ #ifndef __APPLE__ .text +.globl AES_XTS_encrypt_update_avx1 +.type AES_XTS_encrypt_update_avx1,@function +.align 16 +AES_XTS_encrypt_update_avx1: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_avx1 +.p2align 4 +_AES_XTS_encrypt_update_avx1: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $0x40, %rsp + vmovdqu L_avx1_aes_xts_gc_xts(%rip), %xmm12 + vmovdqu (%r8), %xmm0 + xorl %r12d, %r12d + cmpl $0x40, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_avx1_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_encrypt_update_avx1_enc_64: + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %xmm8 + vmovdqu 16(%rcx), %xmm9 + vmovdqu 32(%rcx), %xmm10 + vmovdqu 48(%rcx), %xmm11 + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm1 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm4 + vpslld $0x01, %xmm1, %xmm2 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vpsrad $31, %xmm2, %xmm4 + vpslld $0x01, %xmm2, %xmm3 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + # aes_enc_block + vmovdqu (%r10), %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + vpxor %xmm4, %xmm9, %xmm9 + vpxor %xmm4, %xmm10, %xmm10 + vpxor %xmm4, %xmm11, %xmm11 + vmovdqu 16(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 32(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 48(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 64(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 80(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + 
vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 96(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 112(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 128(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 144(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm4 + jl L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 176(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm4 + jl L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 208(%r10), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 224(%r10), %xmm4 +L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last: + vaesenclast %xmm4, %xmm8, %xmm8 + vaesenclast %xmm4, %xmm9, %xmm9 + vaesenclast %xmm4, %xmm10, %xmm10 + vaesenclast %xmm4, %xmm11, %xmm11 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + vmovdqu %xmm8, (%rdx) + vmovdqu %xmm9, 16(%rdx) + vmovdqu %xmm10, 32(%rdx) + vmovdqu %xmm11, 48(%rdx) + vpsrad $31, %xmm3, %xmm4 + vpslld $0x01, %xmm3, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $0x40, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx1_enc_64 +L_AES_XTS_encrypt_update_avx1_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_avx1_done_enc + subl %r12d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_avx1_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_update_avx1_enc_16: + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_enc_block + vpxor (%r10), %xmm8, %xmm8 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx1_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx1_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx1_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, 
%xmm8 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm8, (%rcx) + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx1_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_avx1_done_enc +L_AES_XTS_encrypt_update_avx1_last_15: + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + vmovdqu (%rcx), %xmm8 + addq $16, %r12 + vmovdqu %xmm8, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_avx1_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_avx1_last_15_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm8 + subq $16, %r12 + vpxor %xmm0, %xmm8, %xmm8 + # aes_enc_block + vpxor (%r10), %xmm8, %xmm8 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm8, (%rcx) +L_AES_XTS_encrypt_update_avx1_done_enc: + vmovdqu %xmm0, (%r8) + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_avx1,.-AES_XTS_encrypt_update_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text .globl AES_XTS_decrypt_avx1 .type AES_XTS_decrypt_avx1,@function .align 16 @@ -1432,7 +2438,6 @@ L_AES_XTS_decrypt_avx1_last_31_2_aes_dec_block_last: leaq (%rsi,%r13,1), %rcx vmovdqu %xmm8, (%rcx) L_AES_XTS_decrypt_avx1_done_dec: - vzeroupper addq $16, %rsp popq %r13 popq %r12 @@ -1440,6 +2445,334 @@ L_AES_XTS_decrypt_avx1_done_dec: #ifndef __APPLE__ .size AES_XTS_decrypt_avx1,.-AES_XTS_decrypt_avx1 #endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_avx1 +.type AES_XTS_decrypt_update_avx1,@function +.align 16 +AES_XTS_decrypt_update_avx1: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_update_avx1 +.p2align 4 +_AES_XTS_decrypt_update_avx1: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $16, %rsp + vmovdqu L_avx1_aes_xts_gc_xts(%rip), %xmm12 + vmovdqu (%r8), %xmm0 + xorl %r12d, %r12d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx1_mul16_64 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx1_last_31_start +L_AES_XTS_decrypt_update_avx1_mul16_64: + cmpl $0x40, %r11d + jl L_AES_XTS_decrypt_update_avx1_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_decrypt_update_avx1_dec_64: + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + 
vmovdqu (%rcx), %xmm8 + vmovdqu 16(%rcx), %xmm9 + vmovdqu 32(%rcx), %xmm10 + vmovdqu 48(%rcx), %xmm11 + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm1 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm4 + vpslld $0x01, %xmm1, %xmm2 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vpsrad $31, %xmm2, %xmm4 + vpslld $0x01, %xmm2, %xmm3 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + # aes_dec_block + vmovdqu (%r10), %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + vpxor %xmm4, %xmm9, %xmm9 + vpxor %xmm4, %xmm10, %xmm10 + vpxor %xmm4, %xmm11, %xmm11 + vmovdqu 16(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 32(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 48(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 64(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 80(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 96(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 112(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 128(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 144(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm4 + jl L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 176(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm4 + jl L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 208(%r10), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 224(%r10), %xmm4 +L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last: + vaesdeclast %xmm4, %xmm8, %xmm8 + vaesdeclast %xmm4, %xmm9, %xmm9 + vaesdeclast %xmm4, %xmm10, %xmm10 + vaesdeclast %xmm4, %xmm11, %xmm11 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + vmovdqu %xmm8, (%rdx) + vmovdqu %xmm9, 16(%rdx) + vmovdqu %xmm10, 32(%rdx) + vmovdqu %xmm11, 48(%rdx) + vpsrad $31, %xmm3, %xmm4 + vpslld $0x01, %xmm3, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $0x40, %r12d + cmpl %r11d, 
%r12d + jl L_AES_XTS_decrypt_update_avx1_dec_64 +L_AES_XTS_decrypt_update_avx1_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx1_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx1_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx1_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx1_mul16: +L_AES_XTS_decrypt_update_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r10), %xmm8, %xmm8 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx1_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm8, (%rcx) + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_avx1_dec_16 + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_avx1_done_dec +L_AES_XTS_decrypt_update_avx1_last_31_start: + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm7 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r10), %xmm8, %xmm8 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsp) + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_avx1_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl 
%edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_avx1_last_31_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r10), %xmm8, %xmm8 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm8, (%rcx) +L_AES_XTS_decrypt_update_avx1_done_dec: + vmovdqu %xmm0, (%r8) + addq $16, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_avx1,.-AES_XTS_decrypt_update_avx1 +#endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX1 */ #endif /* WOLFSSL_X86_64_BUILD */ #endif /* WOLFSSL_AES_XTS */ diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm index 3185ec224..7194a06b8 100644 --- a/wolfcrypt/src/aes_xts_asm.asm +++ b/wolfcrypt/src/aes_xts_asm.asm @@ -40,6 +40,48 @@ IFNDEF _WIN64 _WIN64 = 1 ENDIF +_text SEGMENT READONLY PARA +AES_XTS_init_aesni PROC + movdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + pxor xmm0, [rdx] + movdqu xmm2, OWORD PTR [rdx+16] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+32] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+48] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+64] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+80] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+96] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+112] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+128] + aesenc xmm0, xmm2 + movdqu xmm2, OWORD PTR [rdx+144] + aesenc xmm0, xmm2 + cmp r8d, 11 + movdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_aesni_tweak_aes_enc_block_last + aesenc xmm0, xmm2 + movdqu xmm3, OWORD PTR [rdx+176] + aesenc xmm0, xmm3 + cmp r8d, 13 + movdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_aesni_tweak_aes_enc_block_last + aesenc xmm0, xmm2 + movdqu xmm3, OWORD PTR [rdx+208] + aesenc xmm0, xmm3 + movdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_aesni_tweak_aes_enc_block_last: + aesenclast xmm0, xmm2 + movdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_aesni ENDP +_text ENDS _DATA SEGMENT ALIGN 16 L_aes_xts_gc_xts DWORD 135,1,1,1 @@ -379,6 +421,302 @@ L_AES_XTS_encrypt_aesni_done_enc: AES_XTS_encrypt_aesni ENDP _text ENDS _text SEGMENT READONLY PARA +AES_XTS_encrypt_update_aesni PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 176 + movdqu OWORD PTR [rsp+64], xmm6 + movdqu OWORD PTR [rsp+80], xmm7 + movdqu OWORD PTR [rsp+96], xmm8 + movdqu OWORD PTR [rsp+112], xmm9 + movdqu OWORD PTR [rsp+128], xmm10 + movdqu OWORD 
PTR [rsp+144], xmm11 + movdqu OWORD PTR [rsp+160], xmm12 + movdqu xmm12, OWORD PTR L_aes_xts_gc_xts + movdqu xmm0, OWORD PTR [r8] + xor r12d, r12d + cmp eax, 64 + mov r11d, eax + jl L_AES_XTS_encrypt_update_aesni_done_64 + and r11d, 4294967232 +L_AES_XTS_encrypt_update_aesni_enc_64: + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + movdqu xmm8, OWORD PTR [rcx] + movdqu xmm9, OWORD PTR [rcx+16] + movdqu xmm10, OWORD PTR [rcx+32] + movdqu xmm11, OWORD PTR [rcx+48] + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + psrad xmm4, 31 + pslld xmm1, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm1, xmm4 + movdqa xmm4, xmm1 + movdqa xmm2, xmm1 + psrad xmm4, 31 + pslld xmm2, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm2, xmm4 + movdqa xmm4, xmm2 + movdqa xmm3, xmm2 + psrad xmm4, 31 + pslld xmm3, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm3, xmm4 + pxor xmm8, xmm0 + pxor xmm9, xmm1 + pxor xmm10, xmm2 + pxor xmm11, xmm3 + ; aes_enc_block + movdqu xmm4, OWORD PTR [r10] + pxor xmm8, xmm4 + pxor xmm9, xmm4 + pxor xmm10, xmm4 + pxor xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+16] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+32] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+48] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+64] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+80] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+96] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+112] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+128] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+144] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + cmp r9d, 11 + movdqu xmm4, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+176] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + cmp r9d, 13 + movdqu xmm4, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+208] + aesenc xmm8, xmm4 + aesenc xmm9, xmm4 + aesenc xmm10, xmm4 + aesenc xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_aesni_aes_enc_64_aes_enc_block_last: + aesenclast xmm8, xmm4 + aesenclast xmm9, xmm4 + aesenclast xmm10, xmm4 + aesenclast xmm11, xmm4 + pxor xmm8, xmm0 + pxor xmm9, xmm1 + pxor xmm10, xmm2 + pxor xmm11, xmm3 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 + movdqa xmm4, xmm3 + movdqa xmm0, xmm3 + psrad xmm4, 31 + pslld xmm0, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm0, xmm4 + add r12d, 64 + cmp r12d, r11d + jl L_AES_XTS_encrypt_update_aesni_enc_64 +L_AES_XTS_encrypt_update_aesni_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_encrypt_update_aesni_done_enc + sub r11d, r12d + cmp 
r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_update_aesni_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_update_aesni_enc_16: + lea rcx, QWORD PTR [rdi+r12] + movdqu xmm8, OWORD PTR [rcx] + pxor xmm8, xmm0 + ; aes_enc_block + pxor xmm8, [r10] + movdqu xmm5, OWORD PTR [r10+16] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+32] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+48] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+64] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+80] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+96] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+112] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+128] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+144] + aesenc xmm8, xmm5 + cmp r9d, 11 + movdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_aesni_aes_enc_block_last + aesenc xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+176] + aesenc xmm8, xmm6 + cmp r9d, 13 + movdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_aesni_aes_enc_block_last + aesenc xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+208] + aesenc xmm8, xmm6 + movdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_aesni_aes_enc_block_last: + aesenclast xmm8, xmm5 + pxor xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + movdqu OWORD PTR [rcx], xmm8 + movdqa xmm4, xmm0 + psrad xmm4, 31 + pslld xmm0, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm0, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_encrypt_update_aesni_enc_16 + cmp r12d, eax + je L_AES_XTS_encrypt_update_aesni_done_enc +L_AES_XTS_encrypt_update_aesni_last_15: + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + movdqu xmm8, OWORD PTR [rcx] + add r12, 16 + movdqu OWORD PTR [rsp], xmm8 + xor rdx, rdx +L_AES_XTS_encrypt_update_aesni_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_encrypt_update_aesni_last_15_byte_loop + sub r12, rdx + movdqu xmm8, OWORD PTR [rsp] + sub r12, 16 + pxor xmm8, xmm0 + ; aes_enc_block + pxor xmm8, [r10] + movdqu xmm5, OWORD PTR [r10+16] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+32] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+48] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+64] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+80] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+96] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+112] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+128] + aesenc xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+144] + aesenc xmm8, xmm5 + cmp r9d, 11 + movdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last + aesenc xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+176] + aesenc xmm8, xmm6 + cmp r9d, 13 + movdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last + aesenc xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+208] + aesenc xmm8, xmm6 + movdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_aesni_last_15_aes_enc_block_last: + aesenclast xmm8, xmm5 + pxor xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + movdqu OWORD PTR [rcx], xmm8 +L_AES_XTS_encrypt_update_aesni_done_enc: + movdqu OWORD PTR [r8], xmm0 + movdqu xmm6, OWORD PTR [rsp+64] + movdqu xmm7, OWORD PTR [rsp+80] + movdqu xmm8, OWORD PTR [rsp+96] + movdqu xmm9, OWORD PTR [rsp+112] + movdqu xmm10, OWORD PTR [rsp+128] + movdqu xmm11, OWORD PTR [rsp+144] + movdqu xmm12, OWORD PTR [rsp+160] + add rsp, 176 + pop r12 + pop rsi + pop rdi + ret 
+AES_XTS_encrypt_update_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA AES_XTS_decrypt_aesni PROC push rdi push rsi @@ -765,7 +1103,400 @@ L_AES_XTS_decrypt_aesni_done_dec: ret AES_XTS_decrypt_aesni ENDP _text ENDS +_text SEGMENT READONLY PARA +AES_XTS_decrypt_update_aesni PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 128 + movdqu OWORD PTR [rsp+16], xmm6 + movdqu OWORD PTR [rsp+32], xmm7 + movdqu OWORD PTR [rsp+48], xmm8 + movdqu OWORD PTR [rsp+64], xmm9 + movdqu OWORD PTR [rsp+80], xmm10 + movdqu OWORD PTR [rsp+96], xmm11 + movdqu OWORD PTR [rsp+112], xmm12 + movdqu xmm12, OWORD PTR L_aes_xts_gc_xts + movdqu xmm0, OWORD PTR [r8] + xor r12d, r12d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_aesni_mul16_64 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_aesni_last_31_start +L_AES_XTS_decrypt_update_aesni_mul16_64: + cmp r11d, 64 + jl L_AES_XTS_decrypt_update_aesni_done_64 + and r11d, 4294967232 +L_AES_XTS_decrypt_update_aesni_dec_64: + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + movdqu xmm8, OWORD PTR [rcx] + movdqu xmm9, OWORD PTR [rcx+16] + movdqu xmm10, OWORD PTR [rcx+32] + movdqu xmm11, OWORD PTR [rcx+48] + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + psrad xmm4, 31 + pslld xmm1, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm1, xmm4 + movdqa xmm4, xmm1 + movdqa xmm2, xmm1 + psrad xmm4, 31 + pslld xmm2, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm2, xmm4 + movdqa xmm4, xmm2 + movdqa xmm3, xmm2 + psrad xmm4, 31 + pslld xmm3, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm3, xmm4 + pxor xmm8, xmm0 + pxor xmm9, xmm1 + pxor xmm10, xmm2 + pxor xmm11, xmm3 + ; aes_dec_block + movdqu xmm4, OWORD PTR [r10] + pxor xmm8, xmm4 + pxor xmm9, xmm4 + pxor xmm10, xmm4 + pxor xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+16] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+32] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+48] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+64] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+80] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+96] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+112] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+128] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+144] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + cmp r9d, 11 + movdqu xmm4, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+176] + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + cmp r9d, 13 + movdqu xmm4, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last + aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+208] 
+ aesdec xmm8, xmm4 + aesdec xmm9, xmm4 + aesdec xmm10, xmm4 + aesdec xmm11, xmm4 + movdqu xmm4, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_aesni_aes_dec_64_aes_dec_block_last: + aesdeclast xmm8, xmm4 + aesdeclast xmm9, xmm4 + aesdeclast xmm10, xmm4 + aesdeclast xmm11, xmm4 + pxor xmm8, xmm0 + pxor xmm9, xmm1 + pxor xmm10, xmm2 + pxor xmm11, xmm3 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 + movdqa xmm4, xmm3 + movdqa xmm0, xmm3 + psrad xmm4, 31 + pslld xmm0, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm0, xmm4 + add r12d, 64 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_aesni_dec_64 +L_AES_XTS_decrypt_update_aesni_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_aesni_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_aesni_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_aesni_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_aesni_mul16: +L_AES_XTS_decrypt_update_aesni_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + movdqu xmm8, OWORD PTR [rcx] + pxor xmm8, xmm0 + ; aes_dec_block + pxor xmm8, [r10] + movdqu xmm5, OWORD PTR [r10+16] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+32] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+48] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+64] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+80] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+96] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+112] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+128] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+144] + aesdec xmm8, xmm5 + cmp r9d, 11 + movdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_aesni_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+176] + aesdec xmm8, xmm6 + cmp r9d, 13 + movdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_aesni_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+208] + aesdec xmm8, xmm6 + movdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_aesni_aes_dec_block_last: + aesdeclast xmm8, xmm5 + pxor xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + movdqu OWORD PTR [rcx], xmm8 + movdqa xmm4, xmm0 + psrad xmm4, 31 + pslld xmm0, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm0, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_aesni_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_aesni_done_dec +L_AES_XTS_decrypt_update_aesni_last_31_start: + movdqa xmm4, xmm0 + movdqa xmm7, xmm0 + psrad xmm4, 31 + pslld xmm7, 1 + pshufd xmm4, xmm4, 147 + pand xmm4, xmm12 + pxor xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r12] + movdqu xmm8, OWORD PTR [rcx] + pxor xmm8, xmm7 + ; aes_dec_block + pxor xmm8, [r10] + movdqu xmm5, OWORD PTR [r10+16] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+32] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+48] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+64] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+80] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+96] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+112] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+128] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+144] + aesdec xmm8, xmm5 + cmp r9d, 11 + movdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+176] + aesdec xmm8, xmm6 + cmp r9d, 13 + movdqu xmm5, OWORD PTR [r10+192] + jl 
L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+208] + aesdec xmm8, xmm6 + movdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_aesni_last_31_aes_dec_block_last: + aesdeclast xmm8, xmm5 + pxor xmm8, xmm7 + movdqu OWORD PTR [rsp], xmm8 + add r12, 16 + xor rdx, rdx +L_AES_XTS_decrypt_update_aesni_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_aesni_last_31_byte_loop + sub r12, rdx + movdqu xmm8, OWORD PTR [rsp] + pxor xmm8, xmm0 + ; aes_dec_block + pxor xmm8, [r10] + movdqu xmm5, OWORD PTR [r10+16] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+32] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+48] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+64] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+80] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+96] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+112] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+128] + aesdec xmm8, xmm5 + movdqu xmm5, OWORD PTR [r10+144] + aesdec xmm8, xmm5 + cmp r9d, 11 + movdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+176] + aesdec xmm8, xmm6 + cmp r9d, 13 + movdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last + aesdec xmm8, xmm5 + movdqu xmm6, OWORD PTR [r10+208] + aesdec xmm8, xmm6 + movdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_aesni_last_31_2_aes_dec_block_last: + aesdeclast xmm8, xmm5 + pxor xmm8, xmm0 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + movdqu OWORD PTR [rcx], xmm8 +L_AES_XTS_decrypt_update_aesni_done_dec: + movdqu OWORD PTR [r8], xmm0 + movdqu xmm6, OWORD PTR [rsp+16] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm8, OWORD PTR [rsp+48] + movdqu xmm9, OWORD PTR [rsp+64] + movdqu xmm10, OWORD PTR [rsp+80] + movdqu xmm11, OWORD PTR [rsp+96] + movdqu xmm12, OWORD PTR [rsp+112] + add rsp, 128 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_aesni ENDP +_text ENDS IFDEF HAVE_INTEL_AVX1 +_text SEGMENT READONLY PARA +AES_XTS_init_avx1 PROC + mov eax, r8d + vmovdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + vpxor xmm0, xmm0, [rdx] + vmovdqu xmm2, OWORD PTR [rdx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+144] + vaesenc xmm0, xmm0, xmm2 + cmp eax, 11 + vmovdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_avx1_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+176] + vaesenc xmm0, xmm0, xmm3 + cmp eax, 13 + vmovdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_avx1_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+208] + vaesenc xmm0, xmm0, xmm3 + vmovdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_avx1_tweak_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm2 + vmovdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_avx1 ENDP +_text ENDS _DATA SEGMENT ALIGN 16 L_avx1_aes_xts_gc_xts DWORD 135,1,1,1 @@ -1080,7 
+1811,6 @@ L_AES_XTS_encrypt_avx1_last_15_aes_enc_block_last: lea rcx, QWORD PTR [rsi+r13] vmovdqu OWORD PTR [rcx], xmm8 L_AES_XTS_encrypt_avx1_done_enc: - vzeroupper vmovdqu xmm6, OWORD PTR [rsp+64] vmovdqu xmm7, OWORD PTR [rsp+80] vmovdqu xmm8, OWORD PTR [rsp+96] @@ -1097,6 +1827,293 @@ L_AES_XTS_encrypt_avx1_done_enc: AES_XTS_encrypt_avx1 ENDP _text ENDS _text SEGMENT READONLY PARA +AES_XTS_encrypt_update_avx1 PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 176 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu xmm12, OWORD PTR L_avx1_aes_xts_gc_xts + vmovdqu xmm0, OWORD PTR [r8] + xor r12d, r12d + cmp eax, 64 + mov r11d, eax + jl L_AES_XTS_encrypt_update_avx1_done_64 + and r11d, 4294967232 +L_AES_XTS_encrypt_update_avx1_enc_64: + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + vmovdqu xmm9, OWORD PTR [rcx+16] + vmovdqu xmm10, OWORD PTR [rcx+32] + vmovdqu xmm11, OWORD PTR [rcx+48] + vpsrad xmm4, xmm0, 31 + vpslld xmm1, xmm0, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm1, xmm1, xmm4 + vpsrad xmm4, xmm1, 31 + vpslld xmm2, xmm1, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm2, xmm2, xmm4 + vpsrad xmm4, xmm2, 31 + vpslld xmm3, xmm2, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm3, xmm3, xmm4 + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + ; aes_enc_block + vmovdqu xmm4, OWORD PTR [r10] + vpxor xmm8, xmm8, xmm4 + vpxor xmm9, xmm9, xmm4 + vpxor xmm10, xmm10, xmm4 + vpxor xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+16] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+32] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+48] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+64] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+80] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+96] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+112] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+128] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+144] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + cmp r9d, 11 + vmovdqu xmm4, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+176] + vaesenc xmm8, xmm8, xmm4 
+ vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + cmp r9d, 13 + vmovdqu xmm4, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+208] + vaesenc xmm8, xmm8, xmm4 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_avx1_aes_enc_64_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm4 + vaesenclast xmm9, xmm9, xmm4 + vaesenclast xmm10, xmm10, xmm4 + vaesenclast xmm11, xmm11, xmm4 + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vpsrad xmm4, xmm3, 31 + vpslld xmm0, xmm3, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm0, xmm0, xmm4 + add r12d, 64 + cmp r12d, r11d + jl L_AES_XTS_encrypt_update_avx1_enc_64 +L_AES_XTS_encrypt_update_avx1_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_encrypt_update_avx1_done_enc + sub r11d, r12d + cmp r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_update_avx1_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_update_avx1_enc_16: + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + vpxor xmm8, xmm8, xmm0 + ; aes_enc_block + vpxor xmm8, xmm8, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesenc xmm8, xmm8, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_avx1_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesenc xmm8, xmm8, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_avx1_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_avx1_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + vpxor xmm8, xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm8 + vpsrad xmm4, xmm0, 31 + vpslld xmm0, xmm0, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm0, xmm0, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_encrypt_update_avx1_enc_16 + cmp r12d, eax + je L_AES_XTS_encrypt_update_avx1_done_enc +L_AES_XTS_encrypt_update_avx1_last_15: + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + add r12, 16 + vmovdqu OWORD PTR [rsp], xmm8 + xor rdx, rdx +L_AES_XTS_encrypt_update_avx1_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_encrypt_update_avx1_last_15_byte_loop + sub r12, rdx + vmovdqu xmm8, OWORD PTR [rsp] + sub r12, 16 + vpxor xmm8, xmm8, xmm0 + ; aes_enc_block + vpxor xmm8, xmm8, [r10] + vmovdqu xmm5, OWORD 
PTR [r10+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesenc xmm8, xmm8, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesenc xmm8, xmm8, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_encrypt_update_avx1_last_15_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + vpxor xmm8, xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm8 +L_AES_XTS_encrypt_update_avx1_done_enc: + vmovdqu OWORD PTR [r8], xmm0 + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + add rsp, 176 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_encrypt_update_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA AES_XTS_decrypt_avx1 PROC push rdi push rsi @@ -1457,7 +2474,6 @@ L_AES_XTS_decrypt_avx1_last_31_2_aes_dec_block_last: lea rcx, QWORD PTR [rsi+r13] vmovdqu OWORD PTR [rcx], xmm8 L_AES_XTS_decrypt_avx1_done_dec: - vzeroupper vmovdqu xmm6, OWORD PTR [rsp+16] vmovdqu xmm7, OWORD PTR [rsp+32] vmovdqu xmm8, OWORD PTR [rsp+48] @@ -1473,5 +2489,344 @@ L_AES_XTS_decrypt_avx1_done_dec: ret AES_XTS_decrypt_avx1 ENDP _text ENDS +_text SEGMENT READONLY PARA +AES_XTS_decrypt_update_avx1 PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 128 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm9 + vmovdqu OWORD PTR [rsp+80], xmm10 + vmovdqu OWORD PTR [rsp+96], xmm11 + vmovdqu OWORD PTR [rsp+112], xmm12 + vmovdqu xmm12, OWORD PTR L_avx1_aes_xts_gc_xts + vmovdqu xmm0, OWORD PTR [r8] + xor r12d, r12d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx1_mul16_64 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx1_last_31_start +L_AES_XTS_decrypt_update_avx1_mul16_64: + cmp r11d, 64 + jl L_AES_XTS_decrypt_update_avx1_done_64 + and r11d, 4294967232 +L_AES_XTS_decrypt_update_avx1_dec_64: + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + vmovdqu xmm9, OWORD PTR [rcx+16] + vmovdqu xmm10, OWORD PTR [rcx+32] + vmovdqu xmm11, OWORD PTR [rcx+48] + vpsrad xmm4, xmm0, 31 + vpslld xmm1, xmm0, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm1, xmm1, xmm4 + vpsrad xmm4, xmm1, 31 + vpslld xmm2, xmm1, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm2, xmm2, xmm4 + vpsrad xmm4, xmm2, 31 + vpslld xmm3, xmm2, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm3, xmm3, xmm4 + 
vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + ; aes_dec_block + vmovdqu xmm4, OWORD PTR [r10] + vpxor xmm8, xmm8, xmm4 + vpxor xmm9, xmm9, xmm4 + vpxor xmm10, xmm10, xmm4 + vpxor xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+16] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+32] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+48] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+64] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+80] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+96] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+112] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+128] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+144] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + cmp r9d, 11 + vmovdqu xmm4, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+176] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + cmp r9d, 13 + vmovdqu xmm4, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+208] + vaesdec xmm8, xmm8, xmm4 + vaesdec xmm9, xmm9, xmm4 + vaesdec xmm10, xmm10, xmm4 + vaesdec xmm11, xmm11, xmm4 + vmovdqu xmm4, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx1_aes_dec_64_aes_dec_block_last: + vaesdeclast xmm8, xmm8, xmm4 + vaesdeclast xmm9, xmm9, xmm4 + vaesdeclast xmm10, xmm10, xmm4 + vaesdeclast xmm11, xmm11, xmm4 + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vpsrad xmm4, xmm3, 31 + vpslld xmm0, xmm3, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm0, xmm0, xmm4 + add r12d, 64 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx1_dec_64 +L_AES_XTS_decrypt_update_avx1_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx1_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx1_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx1_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx1_mul16: +L_AES_XTS_decrypt_update_avx1_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + vpxor xmm8, xmm8, xmm0 + ; aes_dec_block + vpxor xmm8, xmm8, [r10] + vmovdqu xmm5, 
OWORD PTR [r10+16] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm8, xmm8, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx1_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm8, xmm8, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx1_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx1_aes_dec_block_last: + vaesdeclast xmm8, xmm8, xmm5 + vpxor xmm8, xmm8, xmm0 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm8 + vpsrad xmm4, xmm0, 31 + vpslld xmm0, xmm0, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm0, xmm0, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx1_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_avx1_done_dec +L_AES_XTS_decrypt_update_avx1_last_31_start: + vpsrad xmm4, xmm0, 31 + vpslld xmm7, xmm0, 1 + vpshufd xmm4, xmm4, 147 + vpand xmm4, xmm4, xmm12 + vpxor xmm7, xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm8, OWORD PTR [rcx] + vpxor xmm8, xmm8, xmm7 + ; aes_dec_block + vpxor xmm8, xmm8, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm8, xmm8, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm8, xmm8, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx1_last_31_aes_dec_block_last: + vaesdeclast xmm8, xmm8, xmm5 + vpxor xmm8, xmm8, xmm7 + vmovdqu OWORD PTR [rsp], xmm8 + add r12, 16 + xor rdx, rdx +L_AES_XTS_decrypt_update_avx1_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_avx1_last_31_byte_loop + sub r12, rdx + vmovdqu xmm8, OWORD PTR [rsp] + vpxor xmm8, xmm8, xmm0 + ; aes_dec_block + vpxor xmm8, xmm8, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm8, 
xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm8, xmm8, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm8, xmm8, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last + vaesdec xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx1_last_31_2_aes_dec_block_last: + vaesdeclast xmm8, xmm8, xmm5 + vpxor xmm8, xmm8, xmm0 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm8 +L_AES_XTS_decrypt_update_avx1_done_dec: + vmovdqu OWORD PTR [r8], xmm0 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm9, OWORD PTR [rsp+64] + vmovdqu xmm10, OWORD PTR [rsp+80] + vmovdqu xmm11, OWORD PTR [rsp+96] + vmovdqu xmm12, OWORD PTR [rsp+112] + add rsp, 128 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_avx1 ENDP +_text ENDS ENDIF END diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index df0636dfe..772e6e8eb 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -671,9 +671,9 @@ WOLFSSL_API int wc_AesXtsDecryptConsecutiveSectors(XtsAes* aes, #ifdef WOLFSSL_AESXTS_STREAM -WOLFSSL_API int wc_AesXtsEncryptStart(XtsAes* aes, byte* i, word32 iSz); +WOLFSSL_API int wc_AesXtsEncryptInit(XtsAes* aes, byte* i, word32 iSz); -WOLFSSL_API int wc_AesXtsDecryptStart(XtsAes* aes, byte* i, word32 iSz); +WOLFSSL_API int wc_AesXtsDecryptInit(XtsAes* aes, byte* i, word32 iSz); WOLFSSL_API int wc_AesXtsEncryptUpdate(XtsAes* aes, byte* out, const byte* in, word32 sz, byte *i);
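---

Usage note (not part of the diff): the renamed streaming entry points follow an Init/Update pattern in which the caller-supplied 16-byte buffer i carries the running tweak between calls, exactly as the new *_update_* routines write the advanced tweak back to the i pointer before returning. The sketch below is a minimal, hypothetical caller of that API; it assumes the pre-existing wolfCrypt helpers wc_AesXtsSetKey()/wc_AesXtsFree(), a key length of 32 or 64 bytes (double the AES key size, as usual for XTS), a total size of at least one AES block, and that every Update except the last is fed a multiple of AES_BLOCK_SIZE so ciphertext stealing only happens at the end. The two-piece split is purely illustrative.

    #include <wolfssl/wolfcrypt/aes.h>

    static int xts_stream_encrypt(byte* out, const byte* in, word32 sz,
                                  const byte* key, word32 keySz,
                                  const byte* tweak /* 16 bytes */)
    {
        XtsAes xaes;
        byte   i[AES_BLOCK_SIZE];
        word32 half;
        int    ret;

        ret = wc_AesXtsSetKey(&xaes, key, keySz, AES_ENCRYPTION, NULL,
                              INVALID_DEVID);
        if (ret != 0)
            return ret;

        /* Init encrypts the tweak in place with the tweak key; each Update
         * then consumes data and advances the tweak stored in i. */
        XMEMCPY(i, tweak, AES_BLOCK_SIZE);
        ret = wc_AesXtsEncryptInit(&xaes, i, (word32)sizeof(i));

        /* Split at a block-aligned point so only the final Update can end
         * in a partial block (illustrative chunking only). */
        half = (sz / 2) & ~(word32)(AES_BLOCK_SIZE - 1);
        if (ret == 0)
            ret = wc_AesXtsEncryptUpdate(&xaes, out, in, half, i);
        if (ret == 0)
            ret = wc_AesXtsEncryptUpdate(&xaes, out + half, in + half,
                                         sz - half, i);

        wc_AesXtsFree(&xaes);
        return ret;
    }

Decryption is symmetric: wc_AesXtsDecryptInit() prepares the same tweak buffer (both directions encrypt the tweak, which is why the separate decrypt-start assembly routines could be dropped in favour of the shared AES_XTS_init_* functions), and wc_AesXtsDecryptUpdate() consumes the ciphertext in the same chunked fashion.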