Merge pull request #1317 from SparkiDev/chacha20_sb_avx2

Improve performance of chacha20-poly1305 on AVX and AVX2.
toddouska 2018-02-02 10:46:39 -08:00 committed by GitHub
commit c66ebb6748
6 changed files with 437 additions and 363 deletions
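
For orientation, the code paths touched here sit behind wolfSSL's one-shot AEAD entry point. Below is a minimal usage sketch; it assumes the public wc_ChaCha20Poly1305_Encrypt() function and the CHACHA20_POLY1305_AEAD_* size constants, neither of which is part of this diff.

#include <wolfssl/wolfcrypt/chacha20_poly1305.h>

/* Sketch only: encrypt-and-tag with the AEAD API whose inner ChaCha20 and
 * Poly1305 loops this pull request speeds up. Key/IV/AAD are caller-supplied. */
static int aead_seal(const byte key[CHACHA20_POLY1305_AEAD_KEYSIZE],
                     const byte iv[CHACHA20_POLY1305_AEAD_IV_SIZE],
                     const byte* aad, word32 aadSz,
                     const byte* msg, word32 msgSz,
                     byte* cipher,
                     byte tag[CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE])
{
    /* ChaCha20 encrypts the plaintext; Poly1305 MACs the AAD and ciphertext
     * and writes the 16-byte tag. */
    return wc_ChaCha20Poly1305_Encrypt(key, iv, aad, aadSz,
                                       msg, msgSz, cipher, tag);
}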


@ -2102,27 +2102,49 @@ void bench_poly1305()
Poly1305 enc;
byte mac[16];
double start;
int ret, i, count;
int ret = 0, i, count;
ret = wc_Poly1305SetKey(&enc, bench_key, 32);
if (ret != 0) {
printf("Poly1305SetKey failed, ret = %d\n", ret);
return;
}
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_Poly1305Update(&enc, bench_plain, BENCH_SIZE);
if (ret != 0) {
printf("Poly1305Update failed: %d\n", ret);
break;
}
if (digest_stream) {
ret = wc_Poly1305SetKey(&enc, bench_key, 32);
if (ret != 0) {
printf("Poly1305SetKey failed, ret = %d\n", ret);
return;
}
wc_Poly1305Final(&enc, mac);
count += i;
} while (bench_stats_sym_check(start));
bench_stats_sym_finish("POLY1305", 0, count, bench_size, start, ret);
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_Poly1305Update(&enc, bench_plain, BENCH_SIZE);
if (ret != 0) {
printf("Poly1305Update failed: %d\n", ret);
break;
}
}
wc_Poly1305Final(&enc, mac);
count += i;
} while (bench_stats_sym_check(start));
bench_stats_sym_finish("POLY1305", 0, count, bench_size, start, ret);
}
else {
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_Poly1305SetKey(&enc, bench_key, 32);
if (ret != 0) {
printf("Poly1305SetKey failed, ret = %d\n", ret);
return;
}
ret = wc_Poly1305Update(&enc, bench_plain, BENCH_SIZE);
if (ret != 0) {
printf("Poly1305Update failed: %d\n", ret);
break;
}
wc_Poly1305Final(&enc, mac);
}
count += i;
} while (bench_stats_sym_check(start));
bench_stats_sym_finish("POLY1305", 0, count, bench_size, start, ret);
}
}
#endif /* HAVE_POLY1305 */
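
The two benchmark branches above time the two usual ways of driving the Poly1305 API: with digest_stream set, the key is set once and data is streamed through many updates; otherwise every message is keyed, updated and finalised separately (a Poly1305 key must only ever authenticate one message). A minimal sketch of the call pattern being measured, assuming the same public wc_Poly1305* functions and with error handling trimmed:

/* Sketch only: one complete MAC. The digest_stream branch repeats only the
 * Update step per iteration; the else branch repeats this whole sequence. */
static int poly1305_mac(const byte key[32], const byte* data, word32 len,
                        byte mac[16])
{
    Poly1305 ctx;
    int ret = wc_Poly1305SetKey(&ctx, key, 32);
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, data, len);  /* may be called repeatedly */
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, mac);
    return ret;
}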


@ -75,6 +75,14 @@
#define HAVE_INTEL_AVX2
#endif
#if defined(_MSC_VER)
#define CHACHA20_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define CHACHA20_NOINLINE __attribute__((noinline))
#else
#define CHACHA20_NOINLINE
#endif
static int cpuidFlagsSet = 0;
static int cpuidFlags = 0;
#endif
@ -647,7 +655,9 @@ static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX1
#define QUARTERROUND_2_AVX() \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
@ -778,11 +788,8 @@ static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
)
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX1
static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m,
byte* c, word32 bytes)
{
ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
@ -1034,6 +1041,135 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#define QUARTERROUND_2_AVX2() \
"vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
"vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
"vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
"vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
"vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
"vpsrld $20, %%xmm1, %%xmm4\n\t" \
"vpslld $12, %%xmm1, %%xmm1\n\t" \
"vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
"vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
"vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
"vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
"vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
"vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
"vpsrld $25, %%xmm1, %%xmm4\n\t" \
"vpslld $7, %%xmm1, %%xmm1\n\t" \
"vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
"# Swap words for next round\n\t" \
"vpshufd $0x39, %%xmm1, %%xmm1\n\t" \
"vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"vpshufd $0x93, %%xmm3, %%xmm3\n\t" \
"vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
"vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
"vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
"vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
"vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
"vpsrld $20, %%xmm1, %%xmm4\n\t" \
"vpslld $12, %%xmm1, %%xmm1\n\t" \
"vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
"vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
"vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
"vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
"vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
"vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
"vpsrld $25, %%Xmm1, %%xmm4\n\t" \
"vpslld $7, %%xmm1, %%xmm1\n\t" \
"vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
"# Swap words back\n\t" \
"vpshufd $0x93, %%xmm1, %%xmm1\n\t" \
"vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"vpshufd $0x39, %%xmm3, %%xmm3\n\t" \
#define CHACHA_CRYPT_AVX2() \
"vmovdqu 0(%[input]), %%xmm8\n\t" \
"vmovdqu 16(%[input]), %%xmm9\n\t" \
"vmovdqu 32(%[input]), %%xmm10\n\t" \
"vmovdqu 48(%[input]), %%xmm11\n\t" \
"vmovdqu %%xmm8, %%xmm0\n\t" \
"vmovdqu %%xmm9, %%xmm1\n\t" \
"vmovdqu %%xmm10, %%xmm2\n\t" \
"vmovdqu %%xmm11, %%xmm3\n\t" \
"movb $10, %%al\n\t" \
"\n" \
"1:\n\t" \
QUARTERROUND_2_AVX2() \
"decb %%al\n\t" \
"jnz 1b\n\t" \
"vpaddd %%xmm8, %%xmm0, %%xmm0\n\t" \
"vpaddd %%xmm9, %%xmm1, %%xmm1\n\t" \
"vpaddd %%xmm10, %%xmm2, %%xmm2\n\t" \
"vpaddd %%xmm11, %%xmm3, %%xmm3\n\t" \
#define CHACHA_PARTIAL_CHUNK_AVX2() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX2() \
"vmovdqu %%xmm0, 0(%[c])\n\t" \
"vmovdqu %%xmm1, 16(%[c])\n\t" \
"vmovdqu %%xmm2, 32(%[c])\n\t" \
"vmovdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"movl %[bytes], %%r8d\n\t" \
"xorq %%rdx, %%rdx\n\t" \
"movl %%r8d, %%r9d\n\t" \
"andl $7, %%r9d\n\t" \
"jz 4f\n\t" \
"\n" \
"2:\n\t" \
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
"xorb (%[m],%%rdx,1), %%cl\n\t" \
"movb %%cl, (%[output],%%rdx,1)\n\t" \
"incl %%edx\n\t" \
"cmpl %%r9d, %%edx\n\t" \
"jne 2b\n\t" \
"je 3f\n\t" \
"\n" \
"4:\n\t" \
"movq (%[c],%%rdx,1), %%rcx\n\t" \
"xorq (%[m],%%rdx,1), %%rcx\n\t" \
"movq %%rcx, (%[output],%%rdx,1)\n\t" \
"addl $8, %%edx\n\t" \
"\n" \
"3:\n\t" \
"cmpl %%r8d, %%edx\n\t" \
"jne 4b\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (x), \
[output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "eax", "ecx", "edx", "r8", "r9", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
"xmm8", "xmm9", "xmm10", "xmm11" \
)
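
After generating a key-stream block into the aligned buffer, the tail of CHACHA_PARTIAL_CHUNK_AVX2 XORs it with the message: the bytes % 8 remainder is handled one byte at a time (label 2), the rest eight bytes at a time (label 4). A rough, hypothetical C equivalent of that loop structure (not part of the patch; XMEMCPY stands in for the unaligned 64-bit loads the assembly issues):

static void chacha_xor_partial(byte* out, const byte* m, const byte* ks,
                               word32 bytes)
{
    word32 i = 0;

    /* bytes % 8 leading bytes, one at a time (label 2 in the asm) */
    for (; i < (bytes & 7); i++)
        out[i] = (byte)(ks[i] ^ m[i]);

    /* remaining bytes, eight at a time (label 4 in the asm) */
    for (; i < bytes; i += 8) {
        word64 k, p;
        XMEMCPY(&k, ks + i, sizeof(k));
        XMEMCPY(&p, m + i, sizeof(p));
        k ^= p;
        XMEMCPY(out + i, &k, sizeof(k));
    }
}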
#define CHACHA_CHUNK_AVX2() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX2() \
"vmovdqu 0(%[m]), %%xmm4\n\t" \
"vmovdqu 16(%[m]), %%xmm5\n\t" \
"vmovdqu 32(%[m]), %%xmm6\n\t" \
"vmovdqu 48(%[m]), %%xmm7\n\t" \
"vpxor %%xmm4, %%xmm0, %%xmm0\n\t" \
"vpxor %%xmm5, %%xmm1, %%xmm1\n\t" \
"vpxor %%xmm6, %%xmm2, %%xmm2\n\t" \
"vpxor %%xmm7, %%xmm3, %%xmm3\n\t" \
"vmovdqu %%xmm0, 0(%[c])\n\t" \
"vmovdqu %%xmm1, 16(%[c])\n\t" \
"vmovdqu %%xmm2, 32(%[c])\n\t" \
"vmovdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "rax", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
"xmm8", "xmm9", "xmm10", "xmm11" \
)
static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
{
@ -1298,14 +1434,20 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
"ymm12", "ymm13", "ymm14", "ymm15", "memory"
);
/* AVX code optimised for multiples of 256 bytes. */
if (bytes == 256) {
chacha_encrypt_avx(ctx, m, c, bytes);
bytes -= 256;
}
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_AVX();
CHACHA_CHUNK_AVX2();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_AVX();
CHACHA_PARTIAL_CHUNK_AVX2();
}
}
#endif /* HAVE_INTEL_AVX2 */
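
For reference, the QUARTERROUND_2_AVX and QUARTERROUND_2_AVX2 macros vectorise two rounds of the standard ChaCha quarter round from RFC 7539: each 32-bit lane of xmm0-xmm3 carries one column of the state, and the vpshufd word rotations between the two halves line the diagonals up for the second round. A scalar sketch of a single quarter round, for comparison only (not part of the patch):

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* Scalar ChaCha quarter round (RFC 7539, section 2.1). */
static void chacha_quarter_round(word32* a, word32* b, word32* c, word32* d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);  /* vpshufb with rotl16     */
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);  /* vpsrld $20 / vpslld $12 */
    *a += *b; *d ^= *a; *d = ROTL32(*d,  8);  /* vpshufb with rotl8      */
    *c += *d; *b ^= *c; *b = ROTL32(*b,  7);  /* vpsrld $25 / vpslld $7  */
}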


@ -178,84 +178,55 @@ static int calculateAuthTag(
Poly1305 poly1305Ctx;
byte padding[CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1];
word32 paddingLen;
byte little64[8];
byte little64[16];
XMEMSET(padding, 0, sizeof(padding));
/* Initialize Poly1305 */
err = wc_Poly1305SetKey(&poly1305Ctx, inAuthKey,
CHACHA20_POLY1305_AEAD_KEYSIZE);
if (err)
{
return err;
}
/* Create the authTag by MAC'ing the following items: */
/* -- AAD */
if (inAAD && inAADLen)
{
err = wc_Poly1305Update(&poly1305Ctx, inAAD, inAADLen);
/* -- padding1: pad the AAD to 16 bytes */
paddingLen = -(int)inAADLen & (CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
paddingLen = -(int)inAADLen &
(CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
if (paddingLen)
{
err += wc_Poly1305Update(&poly1305Ctx, padding, paddingLen);
}
if (err)
{
return err;
}
}
/* -- Ciphertext */
err = wc_Poly1305Update(&poly1305Ctx, inCiphertext, inCiphertextLen);
if (err)
{
return err;
}
/* -- padding2: pad the ciphertext to 16 bytes */
paddingLen = -(int)inCiphertextLen &
(CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
if (paddingLen)
{
err = wc_Poly1305Update(&poly1305Ctx, padding, paddingLen);
if (err)
{
return err;
}
}
/* -- AAD length as a 64-bit little endian integer */
word32ToLittle64(inAADLen, little64);
err = wc_Poly1305Update(&poly1305Ctx, little64, sizeof(little64));
if (err)
{
return err;
}
/* -- Ciphertext length as a 64-bit little endian integer */
word32ToLittle64(inCiphertextLen, little64);
word32ToLittle64(inCiphertextLen, little64 + 8);
err = wc_Poly1305Update(&poly1305Ctx, little64, sizeof(little64));
if (err)
{
return err;
}
/* Finalize the auth tag */
err = wc_Poly1305Final(&poly1305Ctx, outAuthTag);
return err;
@ -264,12 +235,16 @@ static int calculateAuthTag(
static void word32ToLittle64(const word32 inLittle32, byte outLittle64[8])
{
XMEMSET(outLittle64, 0, 8);
#ifndef WOLFSSL_X86_64_BUILD
XMEMSET(outLittle64 + 4, 0, 4);
outLittle64[0] = (byte)(inLittle32 & 0x000000FF);
outLittle64[1] = (byte)((inLittle32 & 0x0000FF00) >> 8);
outLittle64[2] = (byte)((inLittle32 & 0x00FF0000) >> 16);
outLittle64[3] = (byte)((inLittle32 & 0xFF000000) >> 24);
#else
*(word64*)outLittle64 = inLittle32;
#endif
}
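
A note on the padding arithmetic in calculateAuthTag above: -(int)len & 15 is the branch-free way of computing how many zero bytes are needed to reach the next 16-byte boundary, identical to (16 - len % 16) % 16. A small illustrative helper (hypothetical, not in the patch):

/* Zero padding needed to align 'len' up to the 16-byte MAC alignment.
 * len = 13 gives 3, len = 16 gives 0. */
static word32 poly1305_pad_len(word32 len)
{
    return (word32)(-(int)len) &
           (CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
}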


@ -118,6 +118,11 @@ static word32 cpu_flags_set = 0;
#ifdef USE_INTEL_SPEEDUP
#ifdef HAVE_INTEL_AVX1
/* Process one block (16 bytes) of data.
*
* ctx Poly1305 context.
* m One block of message data.
*/
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
{
__asm__ __volatile__ (
@ -152,12 +157,12 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
"# r[0] * h[2] +> t2\n\t"
"addq 64(%[ctx],%%r10,8), %%r13\n\t"
"addq 352(%[ctx],%%r10,8), %%r13\n\t"
"movq %%rdx, %%r14\n\t"
"addq %%r8, %%r12\n\t"
"adcq %%rax, %%r13\n\t"
"# r[1] * h[2] +> t3\n\t"
"adcq 120(%[ctx],%%r10,8), %%r14\n\t"
"adcq 408(%[ctx],%%r10,8), %%r14\n\t"
"# r * h in r14, r13, r12, r11 \n\t"
"# h = (r * h) mod 2^130 - 5\n\t"
"movq %%r13, %%r10\n\t"
@ -185,6 +190,12 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
);
}
/* Process multiple blocks (n * 16 bytes) of data.
*
* ctx Poly1305 context.
* m Blocks of message data.
* bytes The number of bytes to process.
*/
POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
const unsigned char* m, size_t bytes)
{
@ -219,12 +230,12 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
"# r[0] * h[2] +> t2\n\t"
"addq 72(%[ctx],%%r10,8), %%r13\n\t"
"addq 360(%[ctx],%%r10,8), %%r13\n\t"
"movq %%rdx, %%r14\n\t"
"addq %%r8, %%r12\n\t"
"adcq %%rax, %%r13\n\t"
"# r[1] * h[2] +> t3\n\t"
"adcq 128(%[ctx],%%r10,8), %%r14\n\t"
"adcq 416(%[ctx],%%r10,8), %%r14\n\t"
"# r * h in r14, r13, r12, r11 \n\t"
"# h = (r * h) mod 2^130 - 5\n\t"
"movq %%r13, %%r10\n\t"
@ -257,6 +268,12 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
);
}
/* Set the key to use when processing data.
* Initialize the context.
*
* ctx Poly1305 context.
* key The key data (16 bytes).
*/
static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
{
int i;
@ -265,8 +282,8 @@ static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL;
for (i=0; i<7; i++) {
ctx->hh[i + 0] = ctx->r[0] * i;
ctx->hh[i + 7] = ctx->r[1] * i;
ctx->hm[i + 0] = ctx->r[0] * i;
ctx->hm[i + 7] = ctx->r[1] * i;
}
/* h (accumulator) = 0 */
@ -282,6 +299,12 @@ static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
ctx->finished = 1;
}
/* Calculate the final result - authentication data.
* Zeros out the private data in the context.
*
* ctx Poly1305 context.
* mac Buffer to hold 16 bytes.
*/
static void poly1305_final_avx(Poly1305* ctx, byte* mac)
{
word64 h0, h1, h2;
@ -357,22 +380,15 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
/* Load H into five 256-bit registers.
*
* h is the memory location of the data - 26 bits in 32.
* h is the memory location of the data - 26 of 32 bits.
* h0-h4 the 4 H values with 26 bits stored in 64 for multiply.
* z is zero.
*/
#define LOAD_H(h, h0, h1, h2, h3, h4, z) \
"vmovdqu ("#h"), "#h1"\n\t" \
"vmovdqu 32("#h"), "#h3"\n\t" \
"vmovdqu 64("#h"), "#h4"\n\t" \
"vpermq $0xd8, "#h1", "#h1"\n\t" \
"vpermq $0xd8, "#h3", "#h3"\n\t" \
"vpermq $0xd8, "#h4", "#h4"\n\t" \
"vpunpckldq "#z", "#h1", "#h0"\n\t" \
"vpunpckhdq "#z", "#h1", "#h1"\n\t" \
"vpunpckldq "#z", "#h3", "#h2"\n\t" \
"vpunpckhdq "#z", "#h3", "#h3"\n\t" \
"vpunpckldq "#z", "#h4", "#h4"\n\t"
#define LOAD_H(h, h0, h1, h2, h3, h4) \
"vmovdqu ("#h"), "#h0"\n\t" \
"vmovdqu 32("#h"), "#h1"\n\t" \
"vmovdqu 64("#h"), "#h2"\n\t" \
"vmovdqu 96("#h"), "#h3"\n\t" \
"vmovdqu 128("#h"), "#h4"\n\t"
/* Store H, five 256-bit registers, packed.
*
@ -381,35 +397,23 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
* x4 is the xmm register of h4.
*/
#define STORE_H(h, h0, h1, h2, h3, h4, x4) \
"vpshufd $0x08, "#h0", "#h0"\n\t" \
"vpshufd $0x08, "#h1", "#h1"\n\t" \
"vpshufd $0x08, "#h2", "#h2"\n\t" \
"vpshufd $0x08, "#h3", "#h3"\n\t" \
"vpshufd $0x08, "#h4", "#h4"\n\t" \
"vpermq $0x08, "#h0", "#h0"\n\t" \
"vpermq $0x08, "#h1", "#h1"\n\t" \
"vpermq $0x08, "#h2", "#h2"\n\t" \
"vpermq $0x08, "#h3", "#h3"\n\t" \
"vpermq $0x08, "#h4", "#h4"\n\t" \
"vperm2i128 $0x20, "#h1", "#h0", "#h0"\n\t" \
"vperm2i128 $0x20, "#h3", "#h2", "#h2"\n\t" \
"vmovdqu "#h0", ("#h")\n\t" \
"vmovdqu "#h2", 32("#h")\n\t" \
"vmovdqu "#x4", 64("#h")\n\t"
"vmovdqu "#h0", ("#h")\n\t" \
"vmovdqu "#h1", 32("#h")\n\t" \
"vmovdqu "#h2", 64("#h")\n\t" \
"vmovdqu "#h3", 96("#h")\n\t" \
"vmovdqu "#h4", 128("#h")\n\t"
/* Load four powers of r into position to be multiplied by the 4 H values.
*
* rp0-rp3 are the register holding pointers to the values of the powers of r.
* r0-r4 holds the loaded values with 26 bits store in 64 for multiply.
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
* t0-t3 are temporary registers.
*/
#define LOAD_Rx4(rp0, rp1, rp2, rp3, \
r0, r1, r2, r3, r4, \
#define LOAD_Rx4(r0, r1, r2, r3, r4, \
t0, t1, t2, t3) \
"vmovdqu ("#rp0"), "#r0"\n\t" \
"vmovdqu ("#rp1"), "#r1"\n\t" \
"vmovdqu ("#rp2"), "#r2"\n\t" \
"vmovdqu ("#rp3"), "#r3"\n\t" \
"vmovdqu 224(%[ctx]), "#r3"\n\t" \
"vmovdqu 256(%[ctx]), "#r2"\n\t" \
"vmovdqu 288(%[ctx]), "#r1"\n\t" \
"vmovdqu 320(%[ctx]), "#r0"\n\t" \
"vpermq $0xd8, "#r0", "#r0"\n\t" \
"vpermq $0xd8, "#r1", "#r1"\n\t" \
"vpermq $0xd8, "#r2", "#r2"\n\t" \
@ -427,18 +431,18 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
/* Load the r^4 value into position to be multiplied by all 4 H values.
*
* r4 holds r^4 as five 26 bits each in 32.
* r0-r4 holds the loaded values with 26 bits store in 64 for multiply.
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
* t0-t1 are temporary registers.
*/
#define LOAD_R4(r4, r40, r41, r42, r43, r44, \
t0, t1) \
"vmovdqu "#r4", "#t0"\n\t" \
"vpsrlq $32, "#t0", "#t1"\n\t" \
"vpermq $0x0, "#t0", "#r40"\n\t" \
"vpermq $0x0, "#t1", "#r41"\n\t" \
"vpsrlq $32, "#t0", "#t1"\n\t" \
"vpermq $0x55, "#t0", "#r42"\n\t" \
"vpermq $0x55, "#t1", "#r43"\n\t" \
"vpermq $0xaa, "#t0", "#r44"\n\t"
"vpermq $0xaa, "#t0", "#r44"\n\t" \
"vpermq $0x0, "#t1", "#r41"\n\t" \
"vpermq $0x55, "#t1", "#r43"\n\t"
/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in
* multiply.
@ -464,21 +468,21 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
*/
#define FINALIZE_H(h0, h1, h2, h3, h4, \
t0, t1, t2, t3, t4) \
"vpermq $0xf5, "#h0", "#t0"\n\t" \
"vpermq $0xf5, "#h1", "#t1"\n\t" \
"vpermq $0xf5, "#h2", "#t2"\n\t" \
"vpermq $0xf5, "#h3", "#t3"\n\t" \
"vpermq $0xf5, "#h4", "#t4"\n\t" \
"vpsrldq $8, "#h0", "#t0"\n\t" \
"vpsrldq $8, "#h1", "#t1"\n\t" \
"vpsrldq $8, "#h2", "#t2"\n\t" \
"vpsrldq $8, "#h3", "#t3"\n\t" \
"vpsrldq $8, "#h4", "#t4"\n\t" \
"vpaddq "#h0", "#t0", "#h0"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
"vpaddq "#h2", "#t2", "#h2"\n\t" \
"vpaddq "#h3", "#t3", "#h3"\n\t" \
"vpaddq "#h4", "#t4", "#h4"\n\t" \
"vpermq $0xaa, "#h0", "#t0"\n\t" \
"vpermq $0xaa, "#h1", "#t1"\n\t" \
"vpermq $0xaa, "#h2", "#t2"\n\t" \
"vpermq $0xaa, "#h3", "#t3"\n\t" \
"vpermq $0xaa, "#h4", "#t4"\n\t" \
"vpermq $0x02, "#h0", "#t0"\n\t" \
"vpermq $0x02, "#h1", "#t1"\n\t" \
"vpermq $0x02, "#h2", "#t2"\n\t" \
"vpermq $0x02, "#h3", "#t3"\n\t" \
"vpermq $0x02, "#h4", "#t4"\n\t" \
"vpaddq "#h0", "#t0", "#h0"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
"vpaddq "#h2", "#t2", "#h2"\n\t" \
@ -562,7 +566,7 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
*
* m the address of the message to load.
* m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel.
* hi is the high bits of the 4 m (1<< 128 if not final block).
* hi is the high bits of the 4 m (1 << 128 as not final block).
* z is zero.
*/
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z) \
@ -591,7 +595,7 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
* r0-r4 contain the 4 powers of r.
* s1-s4 contain r1-r4 times 5.
* t0-t4 and v0-v3 are temporary registers.
* hi is the high bits of the 4 m (1<< 128 if not final block).
* hi is the high bits of the 4 m (1 << 128 as not final block).
* z is zero.
*/
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4, \
@ -665,41 +669,6 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
"vpaddq "#t4", "#v2", "#t4"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t"
/* Reduce, in place, the 64 bits of data to 26 bits.
*
* h0-h4 contain the 4 H values to reduce.
* t0-t2 are temporaries.
* mask contains the 26-bit mask for each 64 bit value in the 256 bit register.
*/
#define REDUCE_IN(h0, h1, h2, h3, h4, \
t0, t1, t2, mask) \
"vpsrlq $26, "#h0", "#t0"\n\t" \
"vpsrlq $26, "#h3", "#t1"\n\t" \
"vpand "#mask", "#h0", "#h0"\n\t" \
"vpand "#mask", "#h3", "#h3"\n\t" \
"vpaddq "#h1", "#t0", "#h1"\n\t" \
"vpaddq "#h4", "#t1", "#h4"\n\t" \
\
"vpsrlq $26, "#h1", "#t0"\n\t" \
"vpsrlq $26, "#h4", "#t1"\n\t" \
"vpand "#mask", "#h1", "#h1"\n\t" \
"vpand "#mask", "#h4", "#h4"\n\t" \
"vpaddq "#h2", "#t0", "#h2"\n\t" \
"vpslld $2, "#t1", "#t2"\n\t" \
"vpaddd "#t2", "#t1", "#t2"\n\t" \
"vpaddq "#h0", "#t2", "#h0"\n\t" \
\
"vpsrlq $26, "#h2", "#t0"\n\t" \
"vpsrlq $26, "#h0", "#t1"\n\t" \
"vpand "#mask", "#h2", "#h2"\n\t" \
"vpand "#mask", "#h0", "#h0"\n\t" \
"vpaddq "#h3", "#t0", "#h3"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
\
"vpsrlq $26, "#h3", "#t0"\n\t" \
"vpand "#mask", "#h3", "#h3"\n\t" \
"vpaddq "#h4", "#t0", "#h4"\n\t"
/* Reduce the 64 bits of data to 26 bits.
*
* h0-h4 contain the reduced H values.
@ -724,9 +693,9 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
"vpaddq "#m2", "#t0", "#m2"\n\t" \
"vpslld $2, "#t1", "#t2"\n\t" \
"vpaddd "#t2", "#t1", "#t2"\n\t" \
"vpaddq "#m0", "#t2", "#m0"\n\t" \
\
"vpsrlq $26, "#m2", "#t0"\n\t" \
"vpaddq "#m0", "#t2", "#m0"\n\t" \
"vpsrlq $26, "#m0", "#t1"\n\t" \
"vpand "#mask", "#m2", "#h2"\n\t" \
"vpand "#mask", "#m0", "#h0"\n\t" \
@ -735,9 +704,15 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
\
"vpsrlq $26, "#m3", "#t0"\n\t" \
"vpand "#mask", "#m3", "#h3"\n\t" \
"vpaddq "#h4", "#t0", "#h4"\n\t"
"vpaddq "#h4", "#t0", "#h4"\n\t" \
/* Process multiple blocks (n * 16 bytes) of data.
*
* ctx Poly1305 context.
* m Blocks of message data.
* bytes The number of bytes to process.
*/
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
const unsigned char* m, size_t bytes)
{
@ -750,41 +725,42 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
register word32 t4 asm("r12") = 0;
static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
0x0000000003ffffff, 0x0000000003ffffff };
static const word64 hibit[4] = { 0x1000000, 0x1000000,
0x1000000, 0x1000000 };
__asm__ __volatile__ (
"vpxor %%ymm15, %%ymm15, %%ymm15\n\t"
"cmpb $0x0, %[started]\n\t"
"jne L_begin\n\t"
"cmpb $1, %[started]\n\t"
"je L_begin\n\t"
"cmpb $1, %[fin]\n\t"
"je L_begin\n\t"
"# Load the message data\n\t"
LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
"vmovdqu %[mask], %%ymm14\n\t"
"# Reduce, in place, the message data\n\t"
REDUCE_IN(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
"addq $64, %[m]\n\t"
"subq $64, %[bytes]\n\t"
"jz L_store\n\t"
"jmp L_load_r4\n\t"
"\n"
"L_begin:\n\t"
"# Load the H values.\n\t"
LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%ymm15)
"movq 336(%[ctx]), %%r8\n\t"
LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
"# Check if there is a power of r to load - otherwise use r^4.\n\t"
"cmpq $0x0, %%r8\n\t"
"cmpb $0, %[fin]\n\t"
"je L_load_r4\n\t"
"\n\t"
"movq 344(%[ctx]), %%r9\n\t"
"movq 352(%[ctx]), %%r10\n\t"
"movq 360(%[ctx]), %%r11\n\t"
"# Load the 4 powers of r.\n\t"
LOAD_Rx4(%%r8, %%r9, %%r10, %%r11, \
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
"# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm13)
"jmp L_mul_5\n\t"
"\n"
"L_load_r4:\n\t"
"# Load r^4 into all four positions.\n\t"
LOAD_R4(304(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm13, %%ymm14)
"\n"
"L_mul_5:\n\t"
@ -795,11 +771,11 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
"vmovdqa %%ymm11, 32(%[s])\n\t"
"vmovdqa %%ymm12, 64(%[s])\n\t"
"vmovdqa %%ymm13, 96(%[s])\n\t"
"vmovdqa %%ymm5, (%[r4])\n\t"
"vmovdqa %%ymm6, 32(%[r4])\n\t"
"vmovdqa %%ymm7, 64(%[r4])\n\t"
"vmovdqa %%ymm8, 96(%[r4])\n\t"
"vmovdqa %%ymm9, 128(%[r4])\n\t"
"vmovdqa %%ymm5 , (%[r4])\n\t"
"vmovdqa %%ymm6 , 32(%[r4])\n\t"
"vmovdqa %%ymm7 , 64(%[r4])\n\t"
"vmovdqa %%ymm8 , 96(%[r4])\n\t"
"vmovdqa %%ymm9 , 128(%[r4])\n\t"
"vmovdqu %[mask], %%ymm14\n\t"
"\n"
"# If not finished then loop over data\n\t"
@ -846,7 +822,7 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
: [ctx] "r" (ctx), [h] "r" (ctx->hh),
[r4] "r" (r4), [s] "r" (s),
[fin] "m" (ctx->finished), [started] "m" (ctx->started),
[mask] "m" (mask), [hibit] "m" (ctx->hibit)
[mask] "m" (mask), [hibit] "m" (hibit)
: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
"memory"
@ -854,9 +830,9 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
if (ctx->finished)
{
word64 h0, h1, h2, g0, g1, g2, c;
word64 h0, h1, h2, c;
/* Convert to 64 bit form. */
/* Convert to 64-bit form. */
h0 = (((word64)(t1 & 0x3FFFF)) << 26) + t0;
h1 = (((word64)(t3 & 0x3FF)) << 34) +
(((word64) t2 ) << 8) + (t1 >> 18);
@ -871,31 +847,17 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c;
/* compute h + -p */
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
g2 = h2 + c - ((word64)1 << 42);
/* select h if h < p, or h + -p if h >= p */
c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
g0 &= c;
g1 &= c;
g2 &= c;
c = ~c;
h0 = (h0 & c) | g0;
h1 = (h1 & c) | g1;
h2 = (h2 & c) | g2;
/* Store for return */
ctx->h[0] = h0;
ctx->h[1] = h1;
ctx->h[2] = h2;
/* Convert from 42/44/44 to 2/64/64 bits used and store result. */
ctx->h[0] = h0 | (h1 << 44);
ctx->h[1] = (h1 >> 20) | (h2 << 24);
ctx->h[2] = h2 >> 40;
}
ctx->started = 1;
}
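
The store at the end of poly1305_blocks_avx2 repacks the accumulator from the 44/44/42-bit limb form into three 64-bit words. As a quick check of the shifts used there (h0 and h1 hold 44 bits, h2 holds 42):

    h = h0 + h1*2^44 + h2*2^88
    ctx->h[0] = h0 | (h1 << 44)          /* bits   0..63:  all of h0, low 20 bits of h1     */
    ctx->h[1] = (h1 >> 20) | (h2 << 24)  /* bits  64..127: top 24 bits of h1, low 40 of h2  */
    ctx->h[2] = h2 >> 40                 /* bits 128..129: top 2 bits of h2                 */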
/* Multiply two 130-bit numbers in 64-bit registers and reduce.
* 44 + 44 + 42 = 130 bits
*
* r0-r2 are the first operand and the result.
* a0-a2 are the second operand.
@ -913,10 +875,22 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
r1 += c
#define SQR_64(r0, r1, r2) \
s2 = r2 * (5 << 2); \
MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d); \
MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d); \
MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d); \
\
c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \
ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \
ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
r1 += c
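
SQR_64 is the squaring counterpart of MUL_64 and leans on the symmetry of the cross terms. Writing r = r0 + r1*2^44 + r2*2^88 and reducing modulo p = 2^130 - 5 (so 2^130 ≡ 5, and anything at 2^132 or above folds down with a factor 4*5 = 20, which is exactly s2 = r2 * (5 << 2)):

    r^2 = r0^2 + 2*r0*r1*2^44 + (2*r0*r2 + r1^2)*2^88 + 2*r1*r2*2^132 + r2^2*2^176
    2^132 ≡ 20 (mod p),   2^176 ≡ 20*2^44 (mod p)

    d0 = r0^2    + 2*r1*s2      (coefficient of 2^0)
    d1 = 2*r0*r1 + r2*s2        (coefficient of 2^44)
    d2 = 2*r0*r2 + r1^2         (coefficient of 2^88)

which matches the three MUL/ADD pairs in the macro above.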
/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
*
* r0-r2 contains the 130-bit number in 64-bit registers.
* r is the address of where to store the 26 bits in 32 result.
* r is the address of where to store the 26 of 32 bits result.
*/
#define CONV_64_TO_32(r0, r1, r2, r) \
r[0] = (word32)( r0 ) & 0x3ffffff; \
@ -925,8 +899,11 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \
r[4] = (word32)( r2 >> 16 )
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
*
* ctx Poly1305 context.
*/
static void poly1305_calc_powers(Poly1305* ctx)
{
word64 r0, r1, r2, t0, t1, c;
word64 r20, r21, r22;
@ -935,46 +912,18 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
word64 s1, s2;
word128 d0, d1, d2, d;
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
t0 = ((word64*)key)[0];
t1 = ((word64*)key)[1];
r0 = ( t0 ) & 0xffc0fffffff;
r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
r2 = ((t1 >> 24) ) & 0x00ffffffc0f;
__asm__ __volatile__ (
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
"vmovdqu %%ymm0, (%[h])\n\t"
"vmovdqu %%ymm0, 32(%[h])\n\t"
"vmovdqu %%ymm0, 64(%[h])\n\t"
"vmovdqu %%ymm0, (%[r0])\n\t"
"vmovdqu %%ymm0, (%[r1])\n\t"
"vmovdqu %%ymm0, (%[r2])\n\t"
"vmovdqu %%ymm0, (%[r3])\n\t"
"vmovdqu %%ymm0, (%[r4])\n\t"
:
: [h] "r" (ctx->hh), [r0] "r" (ctx->r0), [r1] "r" (ctx->r1),
[r2] "r" (ctx->r2), [r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
: "memory", "ymm0"
);
/* h = 0 */
ctx->h[0] = 0;
ctx->h[1] = 0;
ctx->h[2] = 0;
/* save pad for later */
ctx->pad[0] = ((word64*)key)[2];
ctx->pad[1] = ((word64*)key)[3];
/* Set 1 for r^0 */
ctx->r0[0] = 1;
t0 = ctx->r[0];
t1 = ctx->r[1];
r0 = ( t0 ) & 0xfffffffffff;
r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
r2 = ((t1 >> 24) ) & 0x00fffffffff;
/* Store r^1 */
CONV_64_TO_32(r0, r1, r2, ctx->r1);
/* Calc and store r^2 */
r20 = r0; r21 = r1; r22 = r2;
MUL_64(r20, r21, r22, r0, r1, r2);
SQR_64(r20, r21, r22);
CONV_64_TO_32(r20, r21, r22, ctx->r2);
/* Calc and store r^3 */
@ -984,133 +933,83 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
/* Calc and store r^4 */
r40 = r20; r41 = r21; r42 = r22;
MUL_64(r40, r41, r42, r20, r21, r22);
SQR_64(r40, r41, r42);
CONV_64_TO_32(r40, r41, r42, ctx->r4);
/* NULL means use [r^4, r^4, r^4, r^4] */
ctx->rp[0] = ctx->rp[1] = ctx->rp[2] = ctx->rp[3] = NULL;
}
/* Message high bits set unless last partial block. */
ctx->hibit[0] = ctx->hibit[1] = ctx->hibit[2] = ctx->hibit[3] = 0x1000000;
/* Set the key to use when processing data.
* Initialize the context.
* Calls AVX set key function as final function calls AVX code.
*
* ctx Poly1305 context.
* key The key data (16 bytes).
*/
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
{
poly1305_setkey_avx(ctx, key);
__asm__ __volatile__ (
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
"vmovdqu %%ymm0, (%[hh])\n\t"
"vmovdqu %%ymm0, 32(%[hh])\n\t"
"vmovdqu %%ymm0, 64(%[hh])\n\t"
"vmovdqu %%ymm0, 96(%[hh])\n\t"
"vmovdqu %%ymm0, 128(%[hh])\n\t"
:
: [hh] "r" (ctx->hh)
: "memory", "ymm0"
);
ctx->leftover = 0;
ctx->finished = 0;
ctx->started = 0;
}
/* Calculate the final result - authentication data.
* Zeros out the private data in the context.
* Calls AVX final function to quickly process last blocks.
*
* ctx Poly1305 context.
* mac Buffer to hold 16 bytes - authentication data.
*/
static void poly1305_final_avx2(Poly1305* ctx, byte* mac)
{
word64 h0, h1, h2, t0, t1, c;
/* process the remaining block */
if (ctx->leftover) {
size_t i = ctx->leftover;
if (i & 15)
ctx->buffer[i++] = 1;
for (; i < POLY1305_BLOCK_SIZE * 4; i++)
ctx->buffer[i] = 0;
ctx->hibit[3] = 0;
if (ctx->leftover < 48)
ctx->hibit[2] = 0;
if (ctx->leftover < 32)
ctx->hibit[1] = 0;
if (ctx->leftover < 16)
ctx->hibit[0] = 0;
if (ctx->started) {
if (ctx->leftover <= 16) {
ctx->rp[0] = ctx->r4;
ctx->rp[1] = ctx->r4;
ctx->rp[2] = ctx->r3;
ctx->rp[3] = ctx->r2;
}
else if (ctx->leftover <= 32) {
ctx->rp[0] = ctx->r4;
ctx->rp[1] = ctx->r4;
ctx->rp[2] = ctx->r4;
ctx->rp[3] = ctx->r3;
}
}
int i, j;
int l = (int)ctx->leftover;
ctx->finished = 1;
if (ctx->started)
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
}
if (ctx->started) {
if (ctx->leftover == 0 || ctx->leftover > 48) {
ctx->rp[0] = ctx->r4;
ctx->rp[1] = ctx->r3;
ctx->rp[2] = ctx->r2;
ctx->rp[3] = ctx->r1;
}
else if (ctx->leftover > 32) {
ctx->rp[0] = ctx->r3;
ctx->rp[1] = ctx->r2;
ctx->rp[2] = ctx->r1;
ctx->rp[3] = ctx->r0;
}
else if (ctx->leftover > 16) {
ctx->rp[0] = ctx->r2;
ctx->rp[1] = ctx->r1;
ctx->rp[2] = ctx->r0;
ctx->rp[3] = ctx->r0;
}
else {
ctx->rp[0] = ctx->r1;
ctx->rp[1] = ctx->r0;
ctx->rp[2] = ctx->r0;
ctx->rp[3] = ctx->r0;
}
ctx->finished = 1;
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
}
h0 = ctx->h[0];
h1 = ctx->h[1];
h2 = ctx->h[2];
i = l & ~(POLY1305_BLOCK_SIZE - 1);
if (i > 0)
poly1305_blocks_avx(ctx, ctx->buffer, i);
ctx->leftover -= i;
for (j = 0; i < l; i++, j++)
ctx->buffer[j] = ctx->buffer[i];
/* h = (h + pad) */
t0 = ctx->pad[0];
t1 = ctx->pad[1];
h0 += (( t0 ) & 0xfffffffffff) ;
c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c;
h2 &= 0x3ffffffffff;
/* mac = h % (2^128) */
h0 = ((h0 ) | (h1 << 44));
h1 = ((h1 >> 20) | (h2 << 24));
((word64*)mac)[0] = h0;
((word64*)mac)[1] = h1;
poly1305_final_avx(ctx, mac);
/* zero out the state */
__asm__ __volatile__ (
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
"vmovdqu %%ymm0, (%[h])\n\t"
"vmovdqu %%ymm0, 32(%[h])\n\t"
"vmovdqu %%ymm0, 64(%[h])\n\t"
"vmovdqu %%ymm0, (%[r1])\n\t"
"vmovdqu %%ymm0, (%[r2])\n\t"
"vmovdqu %%ymm0, (%[r3])\n\t"
"vmovdqu %%ymm0, (%[r4])\n\t"
"vmovdqu %%ymm0, (%[hh])\n\t"
"vmovdqu %%ymm0, 32(%[hh])\n\t"
"vmovdqu %%ymm0, 64(%[hh])\n\t"
"vmovdqu %%ymm0, 96(%[hh])\n\t"
"vmovdqu %%ymm0, 128(%[hh])\n\t"
"vmovdqu %%ymm0, (%[r1])\n\t"
"vmovdqu %%ymm0, (%[r2])\n\t"
"vmovdqu %%ymm0, (%[r3])\n\t"
"vmovdqu %%ymm0, (%[r4])\n\t"
:
: [h] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
: [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
[r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
: "memory", "ymm0"
);
ctx->h[0] = 0;
ctx->h[1] = 0;
ctx->h[2] = 0;
ctx->r[0] = 0;
ctx->r[1] = 0;
ctx->r[2] = 0;
ctx->pad[0] = 0;
ctx->pad[1] = 0;
ctx->leftover = 0;
ctx->finished = 0;
ctx->started = 0;
}
@ -1298,7 +1197,7 @@ static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
static void poly1305_block(Poly1305* ctx, const unsigned char *m)
{
#ifdef USE_INTEL_SPEEDUP
/* AVX2 does 4 blocks at a time - this func not used. */
/* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
poly1305_block_avx(ctx, m);
#else
poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
@ -1595,23 +1494,30 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
if (IS_INTEL_AVX2(intel_flags)) {
/* handle leftover */
if (ctx->leftover) {
size_t want = (4 * POLY1305_BLOCK_SIZE - ctx->leftover);
size_t want = sizeof(ctx->buffer) - ctx->leftover;
if (want > bytes)
want = bytes;
for (i = 0; i < want; i++)
ctx->buffer[ctx->leftover + i] = m[i];
bytes -= (word32)want;
m += want;
ctx->leftover += want;
if (ctx->leftover < 4 * POLY1305_BLOCK_SIZE)
if (ctx->leftover < sizeof(ctx->buffer))
return 0;
poly1305_blocks_avx2(ctx, ctx->buffer, 4 * POLY1305_BLOCK_SIZE);
if (!ctx->started)
poly1305_calc_powers(ctx);
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
/* process full blocks */
if (bytes >= 4 * POLY1305_BLOCK_SIZE) {
size_t want = (bytes & ~(4 * POLY1305_BLOCK_SIZE - 1));
if (bytes >= sizeof(ctx->buffer)) {
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
if (!ctx->started)
poly1305_calc_powers(ctx);
poly1305_blocks_avx2(ctx, m, want);
m += want;
bytes -= (word32)want;
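
The buffering above means wc_Poly1305Update only hands data to the AVX2 code in whole 64-byte groups (four blocks), computing the powers of r lazily the first time a full group is available; anything smaller waits in ctx->buffer. A small, hypothetical self-check built on the public API, relying on the property that chunking must not change the tag:

/* Hypothetical check, not in the patch: feeding the message in odd-sized
 * chunks must give the same tag as a single update, however the 64-byte
 * internal buffer fills up. */
static int poly1305_chunking_check(const byte key[32],
                                   const byte* msg, word32 msgSz)
{
    Poly1305 a, b;
    byte tagA[16], tagB[16];
    word32 off;
    int ret;

    if ((ret = wc_Poly1305SetKey(&a, key, 32)) != 0) return ret;
    if ((ret = wc_Poly1305Update(&a, msg, msgSz)) != 0) return ret;
    if ((ret = wc_Poly1305Final(&a, tagA)) != 0) return ret;

    if ((ret = wc_Poly1305SetKey(&b, key, 32)) != 0) return ret;
    for (off = 0; off < msgSz; off += 13) {
        word32 len = (msgSz - off) < 13 ? (msgSz - off) : 13;
        if ((ret = wc_Poly1305Update(&b, msg + off, len)) != 0) return ret;
    }
    if ((ret = wc_Poly1305Final(&b, tagB)) != 0) return ret;

    return (XMEMCMP(tagA, tagB, sizeof(tagA)) == 0) ? 0 : -1;
}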


@ -3662,6 +3662,30 @@ int poly1305_test(void)
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};
static const byte msg6[] =
{
0xd3,0x1a,0x8d,0x34,0x64,0x8e,0x60,0xdb,
0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2,
0xa4,0xad,0xed,0x51,0x29,0x6e,0x08,0xfe,
0xa9,0xe2,0xb5,0xa7,0x36,0xee,0x62,0xd6,
0x3d,0xbe,0xa4,0x5e,0x8c,0xa9,0x67,0x12,
0x82,0xfa,0xfb,0x69,0xda,0x92,0x72,0x8b,
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
0x1a,0x71,0xde,0x0a,0x9e,0x06,0x0b,0x29,
0xa9,0xe2,0xb5,0xa7,0x36,0xee,0x62,0xd6,
0x3d,0xbe,0xa4,0x5e,0x8c,0xa9,0x67,0x12,
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
0x05,0xd6,0xa5,0xb6,0x7e,0xcd,0x3b,0x36,
0x92,0xdd,0xbd,0x7f,0x2d,0x77,0x8b,0x8c,
0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2,
0x98,0x03,0xae,0xe3,0x28,0x09,0x1b,0x58,
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
0x55,0x85,0x80,0x8b,0x48,0x31,0xd7,0xbc,
0x3f,0xf4,0xde,0xf0,0x8e,0x4b,0x7a,0x9d,
0xe5,0x76,0xd2,0x65,0x86,0xce,0xc6,0x4b,
0x61,0x16
};
byte additional[] =
{
0x50,0x51,0x52,0x53,0xc0,0xc1,0xc2,0xc3,
@ -3704,6 +3728,12 @@ int poly1305_test(void)
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};
static const byte correct6[] =
{
0xea,0x11,0x5c,0x4f,0xd0,0xc0,0x10,0xae,
0xf7,0xdf,0xda,0x77,0xa2,0xe9,0xaf,0xca
};
static const byte key[] = {
0x85,0xd6,0xbe,0x78,0x57,0x55,0x6d,0x33,
0x7f,0x44,0x52,0xfe,0x42,0xd5,0x06,0xa8,
@ -3732,42 +3762,43 @@ int poly1305_test(void)
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5};
const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5, msg6};
word32 szm[] = {0, sizeof(msg1), sizeof(msg2),
sizeof(msg3), sizeof(msg5)};
const byte* keys[] = {key, key, key2, key2, key5};
const byte* tests[] = {correct0, correct1, correct2, correct3, correct5};
sizeof(msg3), sizeof(msg5), sizeof(msg6)};
const byte* keys[] = {key, key, key2, key2, key5, key};
const byte* tests[] = {correct0, correct1, correct2, correct3, correct5,
correct6};
for (i = 0; i < 5; i++) {
for (i = 0; i < 6; i++) {
ret = wc_Poly1305SetKey(&enc, keys[i], 32);
if (ret != 0)
return -3600 + i;
return -3600 - i;
ret = wc_Poly1305Update(&enc, msgs[i], szm[i]);
if (ret != 0)
return -3605 + i;
return -3610 - i;
ret = wc_Poly1305Final(&enc, tag);
if (ret != 0)
return -3610 + i;
return -3620 - i;
if (XMEMCMP(tag, tests[i], sizeof(tag)))
return -3615 + i;
return -3630 - i;
}
/* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */
XMEMSET(tag, 0, sizeof(tag));
ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4));
if (ret != 0)
return -3614;
return -3650;
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
(byte*)msg4, sizeof(msg4), tag, sizeof(tag));
if (ret != 0)
return -3615;
return -3651;
if (XMEMCMP(tag, correct4, sizeof(tag)))
return -3616;
return -3652;
/* Check fail of TLS MAC function if altering additional data */
XMEMSET(tag, 0, sizeof(tag));
@ -3775,10 +3806,10 @@ int poly1305_test(void)
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
(byte*)msg4, sizeof(msg4), tag, sizeof(tag));
if (ret != 0)
return -3617;
return -3653;
if (XMEMCMP(tag, correct4, sizeof(tag)) == 0)
return -3618;
return -3654;
return 0;


@ -68,16 +68,14 @@ typedef struct Poly1305 {
word64 r[3];
word64 h[3];
word64 pad[2];
word64 hh[14];
word32 r0[8];
word64 hh[20];
word32 r1[8];
word32 r2[8];
word32 r3[8];
word32 r4[8];
word32* rp[4];
word64 hibit[4];
word64 hm[16];
unsigned char buffer[8*POLY1305_BLOCK_SIZE];
size_t leftover;
unsigned char buffer[4*POLY1305_BLOCK_SIZE];
unsigned char finished;
unsigned char started;
#else