Merge pull request #1271 from SparkiDev/chacha20_sb

Improve performance of small number of blocks for chacha20
pull/1298/head
toddouska 2018-01-02 09:40:49 -08:00 committed by GitHub
commit f2375f3fee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1055 additions and 489 deletions

File diff suppressed because it is too large Load Diff

View File

@ -74,6 +74,9 @@
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif
static int cpuidFlagsSet = 0;
static int cpuidFlags = 0;
#endif
#ifdef BIG_ENDIAN_ORDER
@ -413,19 +416,379 @@ static INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS],
#ifdef USE_INTEL_CHACHA_SPEEDUP
#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
"addl "#r12", "#r11"\n\t" \
"addl "#r22", "#r21"\n\t" \
"xorl "#r11", "#r14"\n\t" \
"xorl "#r21", "#r24"\n\t" \
"roll $16, "#r14"\n\t" \
"roll $16, "#r24"\n\t" \
"addl "#r14", "#r13"\n\t" \
"addl "#r24", "#r23"\n\t" \
"xorl "#r13", "#r12"\n\t" \
"xorl "#r23", "#r22"\n\t" \
"roll $12, "#r12"\n\t" \
"roll $12, "#r22"\n\t" \
"addl "#r12", "#r11"\n\t" \
"addl "#r22", "#r21"\n\t" \
"xorl "#r11", "#r14"\n\t" \
"xorl "#r21", "#r24"\n\t" \
"roll $8, "#r14"\n\t" \
"roll $8, "#r24"\n\t" \
"addl "#r14", "#r13"\n\t" \
"addl "#r24", "#r23"\n\t" \
"xorl "#r13", "#r12"\n\t" \
"xorl "#r23", "#r22"\n\t" \
"roll $7, "#r12"\n\t" \
"roll $7, "#r22"\n\t" \
#define CHACHA_CRYPT_X64() \
"subq $40, %%rsp\n\t" \
"movq 32(%[input]), %%rax\n\t" \
"movq 40(%[input]), %%rdx\n\t" \
"movq %%rax, 8(%%rsp)\n\t" \
"movq %%rdx, 16(%%rsp)\n\t" \
"movl 0(%[input]), %%eax\n\t" \
"movl 4(%[input]), %%ebx\n\t" \
"movl 8(%[input]), %%ecx\n\t" \
"movl 12(%[input]), %%edx\n\t" \
"movl 16(%[input]), %%r8d\n\t" \
"movl 20(%[input]), %%r9d\n\t" \
"movl 24(%[input]), %%r10d\n\t" \
"movl 28(%[input]), %%r11d\n\t" \
"movl 48(%[input]), %%r12d\n\t" \
"movl 52(%[input]), %%r13d\n\t" \
"movl 56(%[input]), %%r14d\n\t" \
"movl 60(%[input]), %%r15d\n\t" \
"movb $10, (%%rsp)\n\t" \
"movq %%rsi, 32(%%rsp)\n\t" \
"movq %%rdi, 24(%%rsp)\n\t" \
"movl 8(%%rsp), %%esi\n\t" \
"movl 12(%%rsp), %%edi\n\t" \
"\n" \
"1:\n\t" \
QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \
%%ebx, %%r9d, %%edi, %%r13d) \
"movl %%esi, 8(%%rsp)\n\t" \
"movl %%edi, 12(%%rsp)\n\t" \
"movl 16(%%rsp), %%esi\n\t" \
"movl 20(%%rsp), %%edi\n\t" \
QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \
%%edx, %%r11d, %%edi, %%r15d) \
QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \
%%ebx, %%r10d, %%edi, %%r12d) \
"movl %%esi, 16(%%rsp)\n\t" \
"movl %%edi, 20(%%rsp)\n\t" \
"movl 8(%%rsp), %%esi\n\t" \
"movl 12(%%rsp), %%edi\n\t" \
QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \
%%edx, %%r8d, %%edi, %%r14d) \
"decb (%%rsp)\n\t" \
"jnz 1b\n\t" \
"movl %%esi, 8(%%rsp)\n\t" \
"movl %%edi, 12(%%rsp)\n\t" \
"movq 32(%%rsp), %%rsi\n\t" \
"movq 24(%%rsp), %%rdi\n\t" \
"addl 0(%[input]), %%eax\n\t" \
"addl 4(%[input]), %%ebx\n\t" \
"addl 8(%[input]), %%ecx\n\t" \
"addl 12(%[input]), %%edx\n\t" \
"addl 16(%[input]), %%r8d\n\t" \
"addl 20(%[input]), %%r9d\n\t" \
"addl 24(%[input]), %%r10d\n\t" \
"addl 28(%[input]), %%r11d\n\t" \
"addl 48(%[input]), %%r12d\n\t" \
"addl 52(%[input]), %%r13d\n\t" \
"addl 56(%[input]), %%r14d\n\t" \
"addl 60(%[input]), %%r15d\n\t" \
#define CHACHA_PARTIAL_CHUNK_X64() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_X64() \
"movl %%eax , 0(%[c])\n\t" \
"movl %%ebx , 4(%[c])\n\t" \
"movl %%ecx , 8(%[c])\n\t" \
"movl %%edx , 12(%[c])\n\t" \
"movl %%r8d , 16(%[c])\n\t" \
"movl %%r9d , 20(%[c])\n\t" \
"movl %%r10d, 24(%[c])\n\t" \
"movl %%r11d, 28(%[c])\n\t" \
"movl %%r12d, 48(%[c])\n\t" \
"movl %%r13d, 52(%[c])\n\t" \
"movl %%r14d, 56(%[c])\n\t" \
"movl %%r15d, 60(%[c])\n\t" \
"movl 8(%%rsp), %%eax\n\t" \
"movl 12(%%rsp), %%ebx\n\t" \
"movl 16(%%rsp), %%ecx\n\t" \
"movl 20(%%rsp), %%edx\n\t" \
"addl 32(%[input]), %%eax\n\t" \
"addl 36(%[input]), %%ebx\n\t" \
"addl 40(%[input]), %%ecx\n\t" \
"addl 44(%[input]), %%edx\n\t" \
"movl %%eax , 32(%[c])\n\t" \
"movl %%ebx , 36(%[c])\n\t" \
"movl %%ecx , 40(%[c])\n\t" \
"movl %%edx , 44(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[output], %%rax\n\t" \
"movq %[m], %%rbx\n\t" \
"movl %[bytes], %%r8d\n\t" \
"xorq %%rdx, %%rdx\n\t" \
"movl %%r8d, %%r9d\n\t" \
"andl $7, %%r9d\n\t" \
"jz 4f\n\t" \
"\n" \
"2:\n\t" \
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
"xorb (%%rbx,%%rdx,1), %%cl\n\t" \
"movb %%cl, (%%rax,%%rdx,1)\n\t" \
"incl %%edx\n\t" \
"cmpl %%r9d, %%edx\n\t" \
"jne 2b\n\t" \
"je 3f\n\t" \
"\n" \
"4:\n\t" \
"movq (%[c],%%rdx,1), %%rcx\n\t" \
"xorq (%%rbx,%%rdx,1), %%rcx\n\t" \
"movq %%rcx, (%%rax,%%rdx,1)\n\t" \
"addl $8, %%edx\n\t" \
"\n" \
"3:\n\t" \
"cmpl %%r8d, %%edx\n\t" \
"jne 4b\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (x), \
[output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
"r14", "r15", "memory" \
)
#define CHACHA_CHUNK_X64() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_X64() \
"movq %%rsi, 32(%%rsp)\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[m], %%rsi\n\t" \
"subq $40, %%rsp\n\t" \
"xorl 0(%%rsi), %%eax\n\t" \
"xorl 4(%%rsi), %%ebx\n\t" \
"xorl 8(%%rsi), %%ecx\n\t" \
"xorl 12(%%rsi), %%edx\n\t" \
"xorl 16(%%rsi), %%r8d\n\t" \
"xorl 20(%%rsi), %%r9d\n\t" \
"xorl 24(%%rsi), %%r10d\n\t" \
"xorl 28(%%rsi), %%r11d\n\t" \
"xorl 48(%%rsi), %%r12d\n\t" \
"xorl 52(%%rsi), %%r13d\n\t" \
"xorl 56(%%rsi), %%r14d\n\t" \
"xorl 60(%%rsi), %%r15d\n\t" \
"movq 32(%%rsp), %%rsi\n\t" \
"movl %%eax , 0(%[c])\n\t" \
"movl %%ebx , 4(%[c])\n\t" \
"movl %%ecx , 8(%[c])\n\t" \
"movl %%edx , 12(%[c])\n\t" \
"movl %%r8d , 16(%[c])\n\t" \
"movl %%r9d , 20(%[c])\n\t" \
"movl %%r10d, 24(%[c])\n\t" \
"movl %%r11d, 28(%[c])\n\t" \
"movl %%r12d, 48(%[c])\n\t" \
"movl %%r13d, 52(%[c])\n\t" \
"movl %%r14d, 56(%[c])\n\t" \
"movl %%r15d, 60(%[c])\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[m], %%r8\n\t" \
"subq $40, %%rsp\n\t" \
"movl 8(%%rsp), %%eax\n\t" \
"movl 12(%%rsp), %%ebx\n\t" \
"movl 16(%%rsp), %%ecx\n\t" \
"movl 20(%%rsp), %%edx\n\t" \
"addl 32(%[input]), %%eax\n\t" \
"addl 36(%[input]), %%ebx\n\t" \
"addl 40(%[input]), %%ecx\n\t" \
"addl 44(%[input]), %%edx\n\t" \
"xorl 32(%%r8), %%eax\n\t" \
"xorl 36(%%r8), %%ebx\n\t" \
"xorl 40(%%r8), %%ecx\n\t" \
"xorl 44(%%r8), %%edx\n\t" \
"movl %%eax , 32(%[c])\n\t" \
"movl %%ebx , 36(%[c])\n\t" \
"movl %%ecx , 40(%[c])\n\t" \
"movl %%edx , 44(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"addq $40, %%rsp\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
"r14", "r15", "memory" \
)
static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
{
word32 x[CHACHA_CHUNK_WORDS];
if (bytes == 0)
return;
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_X64();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_X64();
}
}
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
#define QUARTERROUND_2_AVX() \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl16], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $12, %%xmm1\n\t" \
"psrld $20, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl8], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $7, %%xmm1\n\t" \
"psrld $25, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"# Swap words for next round\n\t" \
"pshufd $0x39, %%xmm1, %%xmm1\n\t" \
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"pshufd $0x93, %%xmm3, %%xmm3\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl16], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $12, %%xmm1\n\t" \
"psrld $20, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl8], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $7, %%xmm1\n\t" \
"psrld $25, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"# Swap words back\n\t" \
"pshufd $0x93, %%xmm1, %%xmm1\n\t" \
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"pshufd $0x39, %%xmm3, %%xmm3\n\t" \
#define CHACHA_CRYPT_AVX() \
"movdqu 0(%[input]), %%xmm0\n\t" \
"movdqu 16(%[input]), %%xmm1\n\t" \
"movdqu 32(%[input]), %%xmm2\n\t" \
"movdqu 48(%[input]), %%xmm3\n\t" \
"movb $10, %%al\n\t" \
"\n" \
"1:\n\t" \
QUARTERROUND_2_AVX() \
"decb %%al\n\t" \
"jnz 1b\n\t" \
"movdqu 0(%[input]), %%xmm4\n\t" \
"movdqu 16(%[input]), %%xmm5\n\t" \
"movdqu 32(%[input]), %%xmm6\n\t" \
"movdqu 48(%[input]), %%xmm7\n\t" \
"paddd %%xmm4, %%xmm0\n\t" \
"paddd %%xmm5, %%xmm1\n\t" \
"paddd %%xmm6, %%xmm2\n\t" \
"paddd %%xmm7, %%xmm3\n\t" \
#define CHACHA_PARTIAL_CHUNK_AVX() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX() \
"movdqu %%xmm0, 0(%[c])\n\t" \
"movdqu %%xmm1, 16(%[c])\n\t" \
"movdqu %%xmm2, 32(%[c])\n\t" \
"movdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"movl %[bytes], %%r8d\n\t" \
"xorq %%rdx, %%rdx\n\t" \
"movl %%r8d, %%r9d\n\t" \
"andl $7, %%r9d\n\t" \
"jz 4f\n\t" \
"\n" \
"2:\n\t" \
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
"xorb (%[m],%%rdx,1), %%cl\n\t" \
"movb %%cl, (%[output],%%rdx,1)\n\t" \
"incl %%edx\n\t" \
"cmpl %%r9d, %%edx\n\t" \
"jne 2b\n\t" \
"je 3f\n\t" \
"\n" \
"4:\n\t" \
"movq (%[c],%%rdx,1), %%rcx\n\t" \
"xorq (%[m],%%rdx,1), %%rcx\n\t" \
"movq %%rcx, (%[output],%%rdx,1)\n\t" \
"addl $8, %%edx\n\t" \
"\n" \
"3:\n\t" \
"cmpl %%r8d, %%edx\n\t" \
"jne 4b\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (x), \
[output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "eax", "ecx", "edx", "r8", "r9", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
)
#define CHACHA_CHUNK_AVX() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX() \
"movdqu 0(%[m]), %%xmm4\n\t" \
"movdqu 16(%[m]), %%xmm5\n\t" \
"movdqu 32(%[m]), %%xmm6\n\t" \
"movdqu 48(%[m]), %%xmm7\n\t" \
"pxor %%xmm4, %%xmm0\n\t" \
"pxor %%xmm5, %%xmm1\n\t" \
"pxor %%xmm6, %%xmm2\n\t" \
"pxor %%xmm7, %%xmm3\n\t" \
"movdqu %%xmm0, 0(%[c])\n\t" \
"movdqu %%xmm1, 16(%[c])\n\t" \
"movdqu %%xmm2, 32(%[c])\n\t" \
"movdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "rax", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
)
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX1
static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
{
ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
byte* output;
word32 i;
word32 cnt = 0;
static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL };
static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL };
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
if (bytes == 0)
return;
@ -433,7 +796,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
__asm__ __volatile__ (
"movl %[bytes], %[cnt]\n\t"
"shrl $8, %[cnt]\n\t"
"jz L_end128\n\t"
"jz L_end128\n\t"
"vpshufd $0, (%[key]), %%xmm0\n\t"
"vpshufd $0, 4(%[key]), %%xmm1\n\t"
@ -646,7 +1009,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
"add 48(%[key]), %[cnt]\n\t"
"movl %[cnt], 48(%[key])\n\t"
"\n"
"L_end128:"
"L_end128:\n\t"
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
@ -658,23 +1021,15 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
"xmm12", "xmm13", "xmm14", "xmm15", "memory"
);
output = (byte*)x;
for (; bytes > 0;) {
wc_Chacha_wordtobyte(x, ctx->X);
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
if (bytes <= CHACHA_CHUNK_BYTES) {
for (i = 0; i < bytes; ++i) {
c[i] = m[i] ^ output[i];
}
return;
}
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
c[i] = m[i] ^ output[i];
}
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_AVX();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_AVX();
}
}
#endif /* HAVE_INTEL_AVX1 */
@ -684,16 +1039,16 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
{
ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
byte* output;
word32 i;
word32 cnt = 0;
static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL,
0x0000000500000004UL,0x0000000700000006UL };
static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL,
0x0000000800000008UL,0x0000000800000008UL };
static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
static const __m256i rotl8_256 =
{ 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
static const __m256i rotl16_256 =
{ 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
if (bytes == 0)
@ -702,7 +1057,7 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
__asm__ __volatile__ (
"movl %[bytes], %[cnt]\n\t"
"shrl $9, %[cnt]\n\t"
"jz L_end256\n\t"
"jz L_end256\n\t"
"vpbroadcastd (%[key]), %%ymm0\n\t"
"vpbroadcastd 4(%[key]), %%ymm1\n\t"
@ -931,35 +1286,27 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
"add 48(%[key]), %[cnt]\n\t"
"movl %[cnt], 48(%[key])\n\t"
"\n"
"L_end256:"
"L_end256:\n\t"
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
[add] "m" (add), [eight] "m" (eight),
[rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
[rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256)
: "ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15", "memory"
);
output = (byte*)x;
for (; bytes > 0;) {
wc_Chacha_wordtobyte(x, ctx->X);
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
if (bytes <= CHACHA_CHUNK_BYTES) {
for (i = 0; i < bytes; ++i) {
c[i] = m[i] ^ output[i];
}
return;
}
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
c[i] = m[i] ^ output[i];
}
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_AVX();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_AVX();
}
}
#endif /* HAVE_INTEL_AVX2 */
#endif /* USE_INTEL_CHACHA_SPEEDUP */
@ -1004,16 +1351,25 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
return BAD_FUNC_ARG;
#ifdef USE_INTEL_CHACHA_SPEEDUP
if (!cpuidFlagsSet) {
cpuidFlags = cpuid_get_flags();
cpuidFlagsSet = 1;
}
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(cpuid_get_flags())) {
if (IS_INTEL_AVX2(cpuidFlags)) {
chacha_encrypt_avx2(ctx, input, output, msglen);
return 0;
}
#endif
if (IS_INTEL_AVX1(cpuid_get_flags())) {
if (IS_INTEL_AVX1(cpuidFlags)) {
chacha_encrypt_avx(ctx, input, output, msglen);
return 0;
}
else {
chacha_encrypt_x64(ctx, input, output, msglen);
return 0;
}
#endif
wc_Chacha_encrypt_bytes(ctx, input, output, msglen);

View File

@ -144,10 +144,10 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rdx, %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
@ -211,10 +211,10 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rdx, %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"