mirror of https://github.com/wolfSSL/wolfssl.git
Merge pull request #1317 from SparkiDev/chacha20_sb_avx2
Improve performance of chacha20-poly1305 on AVX and AVX2. (pull/1351/head)
commit
c66ebb6748
|
@ -2102,8 +2102,9 @@ void bench_poly1305()
|
|||
Poly1305 enc;
|
||||
byte mac[16];
|
||||
double start;
|
||||
int ret, i, count;
|
||||
int ret = 0, i, count;
|
||||
|
||||
if (digest_stream) {
|
||||
ret = wc_Poly1305SetKey(&enc, bench_key, 32);
|
||||
if (ret != 0) {
|
||||
printf("Poly1305SetKey failed, ret = %d\n", ret);
|
||||
|
@ -2124,6 +2125,27 @@ void bench_poly1305()
|
|||
} while (bench_stats_sym_check(start));
|
||||
bench_stats_sym_finish("POLY1305", 0, count, bench_size, start, ret);
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (i = 0; i < numBlocks; i++) {
|
||||
ret = wc_Poly1305SetKey(&enc, bench_key, 32);
|
||||
if (ret != 0) {
|
||||
printf("Poly1305SetKey failed, ret = %d\n", ret);
|
||||
return;
|
||||
}
|
||||
ret = wc_Poly1305Update(&enc, bench_plain, BENCH_SIZE);
|
||||
if (ret != 0) {
|
||||
printf("Poly1305Update failed: %d\n", ret);
|
||||
break;
|
||||
}
|
||||
wc_Poly1305Final(&enc, mac);
|
||||
}
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
bench_stats_sym_finish("POLY1305", 0, count, bench_size, start, ret);
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_POLY1305 */
|
||||
|
||||
|
||||
|
|
|
@ -75,6 +75,14 @@
|
|||
#define HAVE_INTEL_AVX2
|
||||
#endif
|
||||
|
||||
/* CHACHA20_NOINLINE: prefix for a function definition that asks the compiler
 * not to inline that function. It expands to __declspec(noinline) when
 * _MSC_VER is defined, to __attribute__((noinline)) when __GNUC__ is defined,
 * and to nothing for any other compiler. */
#if defined(_MSC_VER)
|
||||
#define CHACHA20_NOINLINE __declspec(noinline)
|
||||
#elif defined(__GNUC__)
|
||||
#define CHACHA20_NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define CHACHA20_NOINLINE
|
||||
#endif
|
||||
|
||||
static int cpuidFlagsSet = 0;
|
||||
static int cpuidFlags = 0;
|
||||
#endif
|
||||
|
@ -647,7 +655,9 @@ static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
|
|||
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
|
||||
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
|
||||
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
|
||||
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
|
||||
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
#define QUARTERROUND_2_AVX() \
|
||||
"paddd %%xmm1, %%xmm0\n\t" \
|
||||
"pxor %%xmm0, %%xmm3\n\t" \
|
||||
|
@ -778,11 +788,8 @@ static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
|
|||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
|
||||
)
|
||||
|
||||
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
|
||||
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
|
||||
word32 bytes)
|
||||
CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m,
|
||||
byte* c, word32 bytes)
|
||||
{
|
||||
ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
|
@ -1034,6 +1041,135 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
|
|||
#endif /* HAVE_INTEL_AVX1 */
|
||||
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
/* Assembly text for two consecutive quarter-round passes over one block,
 * written with VEX-encoded (three-operand) instructions on xmm registers.
 *
 * Register use:
 *   xmm0-xmm3  the four 16-byte rows being mixed (updated in place).
 *   xmm4       scratch for the shift-based rotations.
 *
 * Rotations by 16 and 8 bits are done with vpshufb and the %[rotl16] and
 * %[rotl8] operands; rotations by 12 and 7 bits are done with a
 * vpsrld/vpslld pair combined by vpxor. Between the two passes the words of
 * xmm1-xmm3 are permuted with vpshufd, and the permutation is undone at the
 * end ("Swap words back").
 *
 * The macro expands to adjacent string literals only, so it must be used
 * inside an asm statement that defines the named operands rotl8 and rotl16.
 *
 * Every register is spelled in lowercase (%%xmmN) for consistency with the
 * rest of the file.
 */
#define QUARTERROUND_2_AVX2()                              \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t"                \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t"                 \
        "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t"            \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t"                \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"                 \
        "vpsrld $20, %%xmm1, %%xmm4\n\t"                   \
        "vpslld $12, %%xmm1, %%xmm1\n\t"                   \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"                 \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t"                \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t"                 \
        "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t"             \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t"                \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"                 \
        "vpsrld $25, %%xmm1, %%xmm4\n\t"                   \
        "vpslld $7, %%xmm1, %%xmm1\n\t"                    \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"                 \
        "# Swap words for next round\n\t"                  \
        "vpshufd $0x39, %%xmm1, %%xmm1\n\t"                \
        "vpshufd $0x4e, %%xmm2, %%xmm2\n\t"                \
        "vpshufd $0x93, %%xmm3, %%xmm3\n\t"                \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t"                \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t"                 \
        "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t"            \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t"                \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"                 \
        "vpsrld $20, %%xmm1, %%xmm4\n\t"                   \
        "vpslld $12, %%xmm1, %%xmm1\n\t"                   \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"                 \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t"                \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t"                 \
        "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t"             \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t"                \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"                 \
        "vpsrld $25, %%xmm1, %%xmm4\n\t"                   \
        "vpslld $7, %%xmm1, %%xmm1\n\t"                    \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"                 \
        "# Swap words back\n\t"                            \
        "vpshufd $0x93, %%xmm1, %%xmm1\n\t"                \
        "vpshufd $0x4e, %%xmm2, %%xmm2\n\t"                \
        "vpshufd $0x39, %%xmm3, %%xmm3\n\t"
|
||||
|
||||
/* Assembly text that transforms the 64 bytes at %[input] into one output
 * block held in registers.
 *
 * Steps, as written below:
 *   - load the four 16-byte rows at %[input] into xmm8-xmm11 and copy them
 *     to xmm0-xmm3;
 *   - run QUARTERROUND_2_AVX2 ten times, using %al as the loop counter and
 *     local label 1;
 *   - add the saved rows (xmm8-xmm11) back into xmm0-xmm3.
 *
 * The result is left in xmm0-xmm3; nothing is written to memory here.
 * Besides the registers touched by QUARTERROUND_2_AVX2, this modifies %al
 * and xmm8-xmm11, so the enclosing asm statement has to list them as
 * clobbers and must provide the operands input, rotl8 and rotl16.
 */
#define CHACHA_CRYPT_AVX2() \
|
||||
"vmovdqu 0(%[input]), %%xmm8\n\t" \
|
||||
"vmovdqu 16(%[input]), %%xmm9\n\t" \
|
||||
"vmovdqu 32(%[input]), %%xmm10\n\t" \
|
||||
"vmovdqu 48(%[input]), %%xmm11\n\t" \
|
||||
"vmovdqu %%xmm8, %%xmm0\n\t" \
|
||||
"vmovdqu %%xmm9, %%xmm1\n\t" \
|
||||
"vmovdqu %%xmm10, %%xmm2\n\t" \
|
||||
"vmovdqu %%xmm11, %%xmm3\n\t" \
|
||||
"movb $10, %%al\n\t" \
|
||||
"\n" \
|
||||
"1:\n\t" \
|
||||
QUARTERROUND_2_AVX2() \
|
||||
"decb %%al\n\t" \
|
||||
"jnz 1b\n\t" \
|
||||
"vpaddd %%xmm8, %%xmm0, %%xmm0\n\t" \
|
||||
"vpaddd %%xmm9, %%xmm1, %%xmm1\n\t" \
|
||||
"vpaddd %%xmm10, %%xmm2, %%xmm2\n\t" \
|
||||
"vpaddd %%xmm11, %%xmm3, %%xmm3\n\t" \
|
||||
|
||||
/* Complete asm statement that processes a final, partial chunk.
 *
 * It reads these names from the scope where the macro is expanded:
 * ctx, x, c, m, bytes, rotl8 and rotl16.
 *
 * Steps, as written below:
 *   - CHACHA_CRYPT_AVX2 produces one block in xmm0-xmm3;
 *   - the 64 bytes are stored to %[c], which is bound to x here, so x must
 *     provide at least 64 writable bytes (presumably a scratch buffer -
 *     TODO confirm at the declaration of x);
 *   - the 32-bit word at byte offset 48 of ctx->X is incremented
 *     (presumably the block counter - confirm);
 *   - `bytes` bytes of m are XORed with the stored block and written to
 *     %[output] (bound to c): first (bytes & 7) bytes one at a time at
 *     label 2, then the remainder 8 bytes at a time at label 4.
 *
 * The loops exit only when the index equals `bytes`, so `bytes` is expected
 * to be non-zero (the visible caller checks bytes > 0 before expanding this).
 * NOTE(review): an upper bound of 64 appears to be assumed as well, since
 * only 64 bytes are stored to %[c] - confirm against the caller.
 */
#define CHACHA_PARTIAL_CHUNK_AVX2() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_AVX2() \
|
||||
"vmovdqu %%xmm0, 0(%[c])\n\t" \
|
||||
"vmovdqu %%xmm1, 16(%[c])\n\t" \
|
||||
"vmovdqu %%xmm2, 32(%[c])\n\t" \
|
||||
"vmovdqu %%xmm3, 48(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
"movl %[bytes], %%r8d\n\t" \
|
||||
"xorq %%rdx, %%rdx\n\t" \
|
||||
"movl %%r8d, %%r9d\n\t" \
|
||||
"andl $7, %%r9d\n\t" \
|
||||
"jz 4f\n\t" \
|
||||
"\n" \
|
||||
"2:\n\t" \
|
||||
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
|
||||
"xorb (%[m],%%rdx,1), %%cl\n\t" \
|
||||
"movb %%cl, (%[output],%%rdx,1)\n\t" \
|
||||
"incl %%edx\n\t" \
|
||||
"cmpl %%r9d, %%edx\n\t" \
|
||||
"jne 2b\n\t" \
|
||||
"je 3f\n\t" \
|
||||
"\n" \
|
||||
"4:\n\t" \
|
||||
"movq (%[c],%%rdx,1), %%rcx\n\t" \
|
||||
"xorq (%[m],%%rdx,1), %%rcx\n\t" \
|
||||
"movq %%rcx, (%[output],%%rdx,1)\n\t" \
|
||||
"addl $8, %%edx\n\t" \
|
||||
"\n" \
|
||||
"3:\n\t" \
|
||||
"cmpl %%r8d, %%edx\n\t" \
|
||||
"jne 4b\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (x), \
|
||||
[output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
|
||||
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
|
||||
: "eax", "ecx", "edx", "r8", "r9", "memory", \
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
|
||||
"xmm8", "xmm9", "xmm10", "xmm11" \
|
||||
)
|
||||
|
||||
|
||||
/* Complete asm statement that processes one full 64-byte chunk.
 *
 * It reads these names from the scope where the macro is expanded:
 * ctx, c, m, rotl8 and rotl16.
 *
 * Steps, as written below:
 *   - CHACHA_CRYPT_AVX2 produces one block in xmm0-xmm3;
 *   - 64 bytes are loaded from m into xmm4-xmm7 and XORed into xmm0-xmm3;
 *   - the 64 result bytes are stored to c;
 *   - the 32-bit word at byte offset 48 of ctx->X is incremented
 *     (presumably the block counter - confirm).
 *
 * The macro does not advance m or c; the visible caller does that itself
 * after each expansion.
 */
#define CHACHA_CHUNK_AVX2() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_AVX2() \
|
||||
"vmovdqu 0(%[m]), %%xmm4\n\t" \
|
||||
"vmovdqu 16(%[m]), %%xmm5\n\t" \
|
||||
"vmovdqu 32(%[m]), %%xmm6\n\t" \
|
||||
"vmovdqu 48(%[m]), %%xmm7\n\t" \
|
||||
"vpxor %%xmm4, %%xmm0, %%xmm0\n\t" \
|
||||
"vpxor %%xmm5, %%xmm1, %%xmm1\n\t" \
|
||||
"vpxor %%xmm6, %%xmm2, %%xmm2\n\t" \
|
||||
"vpxor %%xmm7, %%xmm3, %%xmm3\n\t" \
|
||||
"vmovdqu %%xmm0, 0(%[c])\n\t" \
|
||||
"vmovdqu %%xmm1, 16(%[c])\n\t" \
|
||||
"vmovdqu %%xmm2, 32(%[c])\n\t" \
|
||||
"vmovdqu %%xmm3, 48(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
|
||||
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
|
||||
: "rax", "memory", \
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
|
||||
"xmm8", "xmm9", "xmm10", "xmm11" \
|
||||
)
|
||||
|
||||
|
||||
static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
|
||||
word32 bytes)
|
||||
{
|
||||
|
@ -1298,14 +1434,20 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
|
|||
"ymm12", "ymm13", "ymm14", "ymm15", "memory"
|
||||
);
|
||||
|
||||
/* AVX code optimised for multiples of 256 bytes. */
|
||||
if (bytes == 256) {
|
||||
chacha_encrypt_avx(ctx, m, c, bytes);
|
||||
bytes -= 256;
|
||||
}
|
||||
|
||||
for (; bytes >= CHACHA_CHUNK_BYTES;) {
|
||||
CHACHA_CHUNK_AVX();
|
||||
CHACHA_CHUNK_AVX2();
|
||||
bytes -= CHACHA_CHUNK_BYTES;
|
||||
c += CHACHA_CHUNK_BYTES;
|
||||
m += CHACHA_CHUNK_BYTES;
|
||||
}
|
||||
if (bytes > 0) {
|
||||
CHACHA_PARTIAL_CHUNK_AVX();
|
||||
CHACHA_PARTIAL_CHUNK_AVX2();
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
|
|
|
@ -178,84 +178,55 @@ static int calculateAuthTag(
|
|||
Poly1305 poly1305Ctx;
|
||||
byte padding[CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1];
|
||||
word32 paddingLen;
|
||||
byte little64[8];
|
||||
byte little64[16];
|
||||
|
||||
XMEMSET(padding, 0, sizeof(padding));
|
||||
|
||||
/* Initialize Poly1305 */
|
||||
|
||||
err = wc_Poly1305SetKey(&poly1305Ctx, inAuthKey,
|
||||
CHACHA20_POLY1305_AEAD_KEYSIZE);
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Create the authTag by MAC'ing the following items: */
|
||||
|
||||
/* -- AAD */
|
||||
|
||||
if (inAAD && inAADLen)
|
||||
{
|
||||
err = wc_Poly1305Update(&poly1305Ctx, inAAD, inAADLen);
|
||||
|
||||
/* -- padding1: pad the AAD to 16 bytes */
|
||||
|
||||
paddingLen = -(int)inAADLen & (CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
|
||||
paddingLen = -(int)inAADLen &
|
||||
(CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
|
||||
if (paddingLen)
|
||||
{
|
||||
err += wc_Poly1305Update(&poly1305Ctx, padding, paddingLen);
|
||||
}
|
||||
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* -- Ciphertext */
|
||||
|
||||
err = wc_Poly1305Update(&poly1305Ctx, inCiphertext, inCiphertextLen);
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
|
||||
/* -- padding2: pad the ciphertext to 16 bytes */
|
||||
|
||||
paddingLen = -(int)inCiphertextLen &
|
||||
(CHACHA20_POLY1305_MAC_PADDING_ALIGNMENT - 1);
|
||||
if (paddingLen)
|
||||
{
|
||||
err = wc_Poly1305Update(&poly1305Ctx, padding, paddingLen);
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* -- AAD length as a 64-bit little endian integer */
|
||||
|
||||
word32ToLittle64(inAADLen, little64);
|
||||
|
||||
err = wc_Poly1305Update(&poly1305Ctx, little64, sizeof(little64));
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
|
||||
/* -- Ciphertext length as a 64-bit little endian integer */
|
||||
|
||||
word32ToLittle64(inCiphertextLen, little64);
|
||||
|
||||
word32ToLittle64(inCiphertextLen, little64 + 8);
|
||||
err = wc_Poly1305Update(&poly1305Ctx, little64, sizeof(little64));
|
||||
if (err)
|
||||
{
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Finalize the auth tag */
|
||||
|
||||
err = wc_Poly1305Final(&poly1305Ctx, outAuthTag);
|
||||
|
||||
return err;
|
||||
|
@ -264,12 +235,16 @@ static int calculateAuthTag(
|
|||
|
||||
static void word32ToLittle64(const word32 inLittle32, byte outLittle64[8])
|
||||
{
|
||||
XMEMSET(outLittle64, 0, 8);
|
||||
#ifndef WOLFSSL_X86_64_BUILD
|
||||
XMEMSET(outLittle64 + 4, 0, 4);
|
||||
|
||||
outLittle64[0] = (byte)(inLittle32 & 0x000000FF);
|
||||
outLittle64[1] = (byte)((inLittle32 & 0x0000FF00) >> 8);
|
||||
outLittle64[2] = (byte)((inLittle32 & 0x00FF0000) >> 16);
|
||||
outLittle64[3] = (byte)((inLittle32 & 0xFF000000) >> 24);
|
||||
#else
|
||||
*(word64*)outLittle64 = inLittle32;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -118,6 +118,11 @@ static word32 cpu_flags_set = 0;
|
|||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
/* Process one block (16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m One block of message data.
|
||||
*/
|
||||
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
|
@ -152,12 +157,12 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
|
|||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"# r[0] * h[2] +> t2\n\t"
|
||||
"addq 64(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"addq 352(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"movq %%rdx, %%r14\n\t"
|
||||
"addq %%r8, %%r12\n\t"
|
||||
"adcq %%rax, %%r13\n\t"
|
||||
"# r[1] * h[2] +> t3\n\t"
|
||||
"adcq 120(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"adcq 408(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"# r * h in r14, r13, r12, r11 \n\t"
|
||||
"# h = (r * h) mod 2^130 - 5\n\t"
|
||||
"movq %%r13, %%r10\n\t"
|
||||
|
@ -185,6 +190,12 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
|
|||
);
|
||||
}
|
||||
|
||||
/* Process multiple blocks (n * 16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m Blocks of message data.
|
||||
* bytes The number of bytes to process.
|
||||
*/
|
||||
POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
|
||||
const unsigned char* m, size_t bytes)
|
||||
{
|
||||
|
@ -219,12 +230,12 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
|
|||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"# r[0] * h[2] +> t2\n\t"
|
||||
"addq 72(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"addq 360(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"movq %%rdx, %%r14\n\t"
|
||||
"addq %%r8, %%r12\n\t"
|
||||
"adcq %%rax, %%r13\n\t"
|
||||
"# r[1] * h[2] +> t3\n\t"
|
||||
"adcq 128(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"adcq 416(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"# r * h in r14, r13, r12, r11 \n\t"
|
||||
"# h = (r * h) mod 2^130 - 5\n\t"
|
||||
"movq %%r13, %%r10\n\t"
|
||||
|
@ -257,6 +268,12 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
|
|||
);
|
||||
}
|
||||
|
||||
/* Set the key to use when processing data.
|
||||
* Initialize the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* key The key data (16 bytes).
|
||||
*/
|
||||
static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
|
||||
{
|
||||
int i;
|
||||
|
@ -265,8 +282,8 @@ static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
|
|||
ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL;
|
||||
|
||||
for (i=0; i<7; i++) {
|
||||
ctx->hh[i + 0] = ctx->r[0] * i;
|
||||
ctx->hh[i + 7] = ctx->r[1] * i;
|
||||
ctx->hm[i + 0] = ctx->r[0] * i;
|
||||
ctx->hm[i + 7] = ctx->r[1] * i;
|
||||
}
|
||||
|
||||
/* h (accumulator) = 0 */
|
||||
|
@ -282,6 +299,12 @@ static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
|
|||
ctx->finished = 1;
|
||||
}
|
||||
|
||||
/* Calculate the final result - authentication data.
|
||||
* Zeros out the private data in the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* mac Buffer to hold 16 bytes.
|
||||
*/
|
||||
static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
||||
{
|
||||
word64 h0, h1, h2;
|
||||
|
@ -357,22 +380,15 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
|
||||
/* Load H into five 256-bit registers.
|
||||
*
|
||||
* h is the memory location of the data - 26 bits in 32.
|
||||
* h is the memory location of the data - 26 of 32 bits.
|
||||
* h0-h4 the 4 H values with 26 bits stored in 64 for multiply.
|
||||
* z is zero.
|
||||
*/
|
||||
#define LOAD_H(h, h0, h1, h2, h3, h4, z) \
|
||||
"vmovdqu ("#h"), "#h1"\n\t" \
|
||||
"vmovdqu 32("#h"), "#h3"\n\t" \
|
||||
"vmovdqu 64("#h"), "#h4"\n\t" \
|
||||
"vpermq $0xd8, "#h1", "#h1"\n\t" \
|
||||
"vpermq $0xd8, "#h3", "#h3"\n\t" \
|
||||
"vpermq $0xd8, "#h4", "#h4"\n\t" \
|
||||
"vpunpckldq "#z", "#h1", "#h0"\n\t" \
|
||||
"vpunpckhdq "#z", "#h1", "#h1"\n\t" \
|
||||
"vpunpckldq "#z", "#h3", "#h2"\n\t" \
|
||||
"vpunpckhdq "#z", "#h3", "#h3"\n\t" \
|
||||
"vpunpckldq "#z", "#h4", "#h4"\n\t"
|
||||
#define LOAD_H(h, h0, h1, h2, h3, h4) \
|
||||
"vmovdqu ("#h"), "#h0"\n\t" \
|
||||
"vmovdqu 32("#h"), "#h1"\n\t" \
|
||||
"vmovdqu 64("#h"), "#h2"\n\t" \
|
||||
"vmovdqu 96("#h"), "#h3"\n\t" \
|
||||
"vmovdqu 128("#h"), "#h4"\n\t"
|
||||
|
||||
/* Store H, five 256-bit registers, packed.
|
||||
*
|
||||
|
@ -381,35 +397,23 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
* x4 is the xmm register of h4.
|
||||
*/
|
||||
#define STORE_H(h, h0, h1, h2, h3, h4, x4) \
|
||||
"vpshufd $0x08, "#h0", "#h0"\n\t" \
|
||||
"vpshufd $0x08, "#h1", "#h1"\n\t" \
|
||||
"vpshufd $0x08, "#h2", "#h2"\n\t" \
|
||||
"vpshufd $0x08, "#h3", "#h3"\n\t" \
|
||||
"vpshufd $0x08, "#h4", "#h4"\n\t" \
|
||||
"vpermq $0x08, "#h0", "#h0"\n\t" \
|
||||
"vpermq $0x08, "#h1", "#h1"\n\t" \
|
||||
"vpermq $0x08, "#h2", "#h2"\n\t" \
|
||||
"vpermq $0x08, "#h3", "#h3"\n\t" \
|
||||
"vpermq $0x08, "#h4", "#h4"\n\t" \
|
||||
"vperm2i128 $0x20, "#h1", "#h0", "#h0"\n\t" \
|
||||
"vperm2i128 $0x20, "#h3", "#h2", "#h2"\n\t" \
|
||||
"vmovdqu "#h0", ("#h")\n\t" \
|
||||
"vmovdqu "#h2", 32("#h")\n\t" \
|
||||
"vmovdqu "#x4", 64("#h")\n\t"
|
||||
"vmovdqu "#h1", 32("#h")\n\t" \
|
||||
"vmovdqu "#h2", 64("#h")\n\t" \
|
||||
"vmovdqu "#h3", 96("#h")\n\t" \
|
||||
"vmovdqu "#h4", 128("#h")\n\t"
|
||||
|
||||
/* Load four powers of r into position to be multiplied by the 4 H values.
|
||||
*
|
||||
* rp0-rp3 are the register holding pointers to the values of the powers of r.
|
||||
* r0-r4 holds the loaded values with 26 bits store in 64 for multiply.
|
||||
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
|
||||
* t0-t3 are temporary registers.
|
||||
*/
|
||||
#define LOAD_Rx4(rp0, rp1, rp2, rp3, \
|
||||
r0, r1, r2, r3, r4, \
|
||||
#define LOAD_Rx4(r0, r1, r2, r3, r4, \
|
||||
t0, t1, t2, t3) \
|
||||
"vmovdqu ("#rp0"), "#r0"\n\t" \
|
||||
"vmovdqu ("#rp1"), "#r1"\n\t" \
|
||||
"vmovdqu ("#rp2"), "#r2"\n\t" \
|
||||
"vmovdqu ("#rp3"), "#r3"\n\t" \
|
||||
"vmovdqu 224(%[ctx]), "#r3"\n\t" \
|
||||
"vmovdqu 256(%[ctx]), "#r2"\n\t" \
|
||||
"vmovdqu 288(%[ctx]), "#r1"\n\t" \
|
||||
"vmovdqu 320(%[ctx]), "#r0"\n\t" \
|
||||
"vpermq $0xd8, "#r0", "#r0"\n\t" \
|
||||
"vpermq $0xd8, "#r1", "#r1"\n\t" \
|
||||
"vpermq $0xd8, "#r2", "#r2"\n\t" \
|
||||
|
@ -427,18 +431,18 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
/* Load the r^4 value into position to be multiplied by all 4 H values.
|
||||
*
|
||||
* r4 holds r^4 as five 26 bits each in 32.
|
||||
* r0-r4 holds the loaded values with 26 bits store in 64 for multiply.
|
||||
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
|
||||
* t0-t1 are temporary registers.
|
||||
*/
|
||||
#define LOAD_R4(r4, r40, r41, r42, r43, r44, \
|
||||
t0, t1) \
|
||||
"vmovdqu "#r4", "#t0"\n\t" \
|
||||
"vpsrlq $32, "#t0", "#t1"\n\t" \
|
||||
"vpermq $0x0, "#t0", "#r40"\n\t" \
|
||||
"vpermq $0x0, "#t1", "#r41"\n\t" \
|
||||
"vpsrlq $32, "#t0", "#t1"\n\t" \
|
||||
"vpermq $0x55, "#t0", "#r42"\n\t" \
|
||||
"vpermq $0x55, "#t1", "#r43"\n\t" \
|
||||
"vpermq $0xaa, "#t0", "#r44"\n\t"
|
||||
"vpermq $0xaa, "#t0", "#r44"\n\t" \
|
||||
"vpermq $0x0, "#t1", "#r41"\n\t" \
|
||||
"vpermq $0x55, "#t1", "#r43"\n\t"
|
||||
|
||||
/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in
|
||||
* multiply.
|
||||
|
@ -464,21 +468,21 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
*/
|
||||
#define FINALIZE_H(h0, h1, h2, h3, h4, \
|
||||
t0, t1, t2, t3, t4) \
|
||||
"vpermq $0xf5, "#h0", "#t0"\n\t" \
|
||||
"vpermq $0xf5, "#h1", "#t1"\n\t" \
|
||||
"vpermq $0xf5, "#h2", "#t2"\n\t" \
|
||||
"vpermq $0xf5, "#h3", "#t3"\n\t" \
|
||||
"vpermq $0xf5, "#h4", "#t4"\n\t" \
|
||||
"vpsrldq $8, "#h0", "#t0"\n\t" \
|
||||
"vpsrldq $8, "#h1", "#t1"\n\t" \
|
||||
"vpsrldq $8, "#h2", "#t2"\n\t" \
|
||||
"vpsrldq $8, "#h3", "#t3"\n\t" \
|
||||
"vpsrldq $8, "#h4", "#t4"\n\t" \
|
||||
"vpaddq "#h0", "#t0", "#h0"\n\t" \
|
||||
"vpaddq "#h1", "#t1", "#h1"\n\t" \
|
||||
"vpaddq "#h2", "#t2", "#h2"\n\t" \
|
||||
"vpaddq "#h3", "#t3", "#h3"\n\t" \
|
||||
"vpaddq "#h4", "#t4", "#h4"\n\t" \
|
||||
"vpermq $0xaa, "#h0", "#t0"\n\t" \
|
||||
"vpermq $0xaa, "#h1", "#t1"\n\t" \
|
||||
"vpermq $0xaa, "#h2", "#t2"\n\t" \
|
||||
"vpermq $0xaa, "#h3", "#t3"\n\t" \
|
||||
"vpermq $0xaa, "#h4", "#t4"\n\t" \
|
||||
"vpermq $0x02, "#h0", "#t0"\n\t" \
|
||||
"vpermq $0x02, "#h1", "#t1"\n\t" \
|
||||
"vpermq $0x02, "#h2", "#t2"\n\t" \
|
||||
"vpermq $0x02, "#h3", "#t3"\n\t" \
|
||||
"vpermq $0x02, "#h4", "#t4"\n\t" \
|
||||
"vpaddq "#h0", "#t0", "#h0"\n\t" \
|
||||
"vpaddq "#h1", "#t1", "#h1"\n\t" \
|
||||
"vpaddq "#h2", "#t2", "#h2"\n\t" \
|
||||
|
@ -562,7 +566,7 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
*
|
||||
* m the address of the message to load.
|
||||
* m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel.
|
||||
* hi is the high bits of the 4 m (1<< 128 if not final block).
|
||||
* hi is the high bits of the 4 m (1 << 128 as not final block).
|
||||
* z is zero.
|
||||
*/
|
||||
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z) \
|
||||
|
@ -591,7 +595,7 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
* r0-r4 contain the 4 powers of r.
|
||||
* s1-s4 contain r1-r4 times 5.
|
||||
* t0-t4 and v0-v3 are temporary registers.
|
||||
* hi is the high bits of the 4 m (1<< 128 if not final block).
|
||||
* hi is the high bits of the 4 m (1 << 128 as not final block).
|
||||
* z is zero.
|
||||
*/
|
||||
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4, \
|
||||
|
@ -665,41 +669,6 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
"vpaddq "#t4", "#v2", "#t4"\n\t" \
|
||||
"vpaddq "#t4", "#v3", "#t4"\n\t"
|
||||
|
||||
/* Reduce, in place, the 64 bits of data to 26 bits.
|
||||
*
|
||||
* h0-h4 contain the 4 H values to reduce.
|
||||
* t0-t2 are temporaries.
|
||||
* mask contains the 26-bit mask for each 64 bit value in the 256 bit register.
|
||||
*/
|
||||
#define REDUCE_IN(h0, h1, h2, h3, h4, \
|
||||
t0, t1, t2, mask) \
|
||||
"vpsrlq $26, "#h0", "#t0"\n\t" \
|
||||
"vpsrlq $26, "#h3", "#t1"\n\t" \
|
||||
"vpand "#mask", "#h0", "#h0"\n\t" \
|
||||
"vpand "#mask", "#h3", "#h3"\n\t" \
|
||||
"vpaddq "#h1", "#t0", "#h1"\n\t" \
|
||||
"vpaddq "#h4", "#t1", "#h4"\n\t" \
|
||||
\
|
||||
"vpsrlq $26, "#h1", "#t0"\n\t" \
|
||||
"vpsrlq $26, "#h4", "#t1"\n\t" \
|
||||
"vpand "#mask", "#h1", "#h1"\n\t" \
|
||||
"vpand "#mask", "#h4", "#h4"\n\t" \
|
||||
"vpaddq "#h2", "#t0", "#h2"\n\t" \
|
||||
"vpslld $2, "#t1", "#t2"\n\t" \
|
||||
"vpaddd "#t2", "#t1", "#t2"\n\t" \
|
||||
"vpaddq "#h0", "#t2", "#h0"\n\t" \
|
||||
\
|
||||
"vpsrlq $26, "#h2", "#t0"\n\t" \
|
||||
"vpsrlq $26, "#h0", "#t1"\n\t" \
|
||||
"vpand "#mask", "#h2", "#h2"\n\t" \
|
||||
"vpand "#mask", "#h0", "#h0"\n\t" \
|
||||
"vpaddq "#h3", "#t0", "#h3"\n\t" \
|
||||
"vpaddq "#h1", "#t1", "#h1"\n\t" \
|
||||
\
|
||||
"vpsrlq $26, "#h3", "#t0"\n\t" \
|
||||
"vpand "#mask", "#h3", "#h3"\n\t" \
|
||||
"vpaddq "#h4", "#t0", "#h4"\n\t"
|
||||
|
||||
/* Reduce the 64 bits of data to 26 bits.
|
||||
*
|
||||
* h0-h4 contain the reduced H values.
|
||||
|
@ -724,9 +693,9 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
"vpaddq "#m2", "#t0", "#m2"\n\t" \
|
||||
"vpslld $2, "#t1", "#t2"\n\t" \
|
||||
"vpaddd "#t2", "#t1", "#t2"\n\t" \
|
||||
"vpaddq "#m0", "#t2", "#m0"\n\t" \
|
||||
\
|
||||
"vpsrlq $26, "#m2", "#t0"\n\t" \
|
||||
"vpaddq "#m0", "#t2", "#m0"\n\t" \
|
||||
"vpsrlq $26, "#m0", "#t1"\n\t" \
|
||||
"vpand "#mask", "#m2", "#h2"\n\t" \
|
||||
"vpand "#mask", "#m0", "#h0"\n\t" \
|
||||
|
@ -735,9 +704,15 @@ static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
|||
\
|
||||
"vpsrlq $26, "#m3", "#t0"\n\t" \
|
||||
"vpand "#mask", "#m3", "#h3"\n\t" \
|
||||
"vpaddq "#h4", "#t0", "#h4"\n\t"
|
||||
"vpaddq "#h4", "#t0", "#h4"\n\t" \
|
||||
|
||||
|
||||
/* Process multiple blocks (n * 16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m Blocks of message data.
|
||||
* bytes The number of bytes to process.
|
||||
*/
|
||||
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
||||
const unsigned char* m, size_t bytes)
|
||||
{
|
||||
|
@ -750,41 +725,42 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
register word32 t4 asm("r12") = 0;
|
||||
static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
|
||||
0x0000000003ffffff, 0x0000000003ffffff };
|
||||
static const word64 hibit[4] = { 0x1000000, 0x1000000,
|
||||
0x1000000, 0x1000000 };
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"vpxor %%ymm15, %%ymm15, %%ymm15\n\t"
|
||||
"cmpb $0x0, %[started]\n\t"
|
||||
"jne L_begin\n\t"
|
||||
"cmpb $1, %[started]\n\t"
|
||||
"je L_begin\n\t"
|
||||
"cmpb $1, %[fin]\n\t"
|
||||
"je L_begin\n\t"
|
||||
"# Load the message data\n\t"
|
||||
LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
|
||||
"vmovdqu %[mask], %%ymm14\n\t"
|
||||
"# Reduce, in place, the message data\n\t"
|
||||
REDUCE_IN(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
|
||||
REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
|
||||
%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
|
||||
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
|
||||
"addq $64, %[m]\n\t"
|
||||
"subq $64, %[bytes]\n\t"
|
||||
"jz L_store\n\t"
|
||||
"jmp L_load_r4\n\t"
|
||||
"\n"
|
||||
"L_begin:\n\t"
|
||||
"# Load the H values.\n\t"
|
||||
LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%ymm15)
|
||||
"movq 336(%[ctx]), %%r8\n\t"
|
||||
LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
|
||||
"# Check if there is a power of r to load - otherwise use r^4.\n\t"
|
||||
"cmpq $0x0, %%r8\n\t"
|
||||
"cmpb $0, %[fin]\n\t"
|
||||
"je L_load_r4\n\t"
|
||||
"\n\t"
|
||||
"movq 344(%[ctx]), %%r9\n\t"
|
||||
"movq 352(%[ctx]), %%r10\n\t"
|
||||
"movq 360(%[ctx]), %%r11\n\t"
|
||||
"# Load the 4 powers of r.\n\t"
|
||||
LOAD_Rx4(%%r8, %%r9, %%r10, %%r11, \
|
||||
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
|
||||
"# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
|
||||
LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
|
||||
%%ymm10, %%ymm11, %%ymm12, %%ymm13)
|
||||
"jmp L_mul_5\n\t"
|
||||
"\n"
|
||||
"L_load_r4:\n\t"
|
||||
"# Load r^4 into all four positions.\n\t"
|
||||
LOAD_R4(304(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
|
||||
LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
|
||||
%%ymm13, %%ymm14)
|
||||
"\n"
|
||||
"L_mul_5:\n\t"
|
||||
|
@ -846,7 +822,7 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
: [ctx] "r" (ctx), [h] "r" (ctx->hh),
|
||||
[r4] "r" (r4), [s] "r" (s),
|
||||
[fin] "m" (ctx->finished), [started] "m" (ctx->started),
|
||||
[mask] "m" (mask), [hibit] "m" (ctx->hibit)
|
||||
[mask] "m" (mask), [hibit] "m" (hibit)
|
||||
: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
|
||||
"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
|
||||
"memory"
|
||||
|
@ -854,9 +830,9 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
|
||||
if (ctx->finished)
|
||||
{
|
||||
word64 h0, h1, h2, g0, g1, g2, c;
|
||||
word64 h0, h1, h2, c;
|
||||
|
||||
/* Convert to 64 bit form. */
|
||||
/* Convert to 64-bit form. */
|
||||
h0 = (((word64)(t1 & 0x3FFFF)) << 26) + t0;
|
||||
h1 = (((word64)(t3 & 0x3FF)) << 34) +
|
||||
(((word64) t2 ) << 8) + (t1 >> 18);
|
||||
|
@ -871,31 +847,17 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
|
||||
h1 += c;
|
||||
|
||||
/* compute h + -p */
|
||||
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
|
||||
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
|
||||
g2 = h2 + c - ((word64)1 << 42);
|
||||
|
||||
/* select h if h < p, or h + -p if h >= p */
|
||||
c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
|
||||
g0 &= c;
|
||||
g1 &= c;
|
||||
g2 &= c;
|
||||
c = ~c;
|
||||
h0 = (h0 & c) | g0;
|
||||
h1 = (h1 & c) | g1;
|
||||
h2 = (h2 & c) | g2;
|
||||
|
||||
/* Store for return */
|
||||
ctx->h[0] = h0;
|
||||
ctx->h[1] = h1;
|
||||
ctx->h[2] = h2;
|
||||
/* Convert from 42/44/44 to 2/64/64 bits used and store result. */
|
||||
ctx->h[0] = h0 | (h1 << 44);
|
||||
ctx->h[1] = (h1 >> 20) | (h2 << 24);
|
||||
ctx->h[2] = h2 >> 40;
|
||||
}
|
||||
|
||||
ctx->started = 1;
|
||||
}
|
||||
|
||||
/* Multiply two 130-bit numbers in 64-bit registers and reduce.
|
||||
* 44 + 44 + 42 = 130 bits
|
||||
*
|
||||
* r0-r2 are the first operand and the result.
|
||||
* a0-a2 are the second operand.
|
||||
|
@ -913,10 +875,22 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
|
||||
r1 += c
|
||||
|
||||
/* Square, in place, the number held in r0, r1 and r2 and reduce the result.
 *
 * r0-r2 are both the operand and the result. After the macro r0 is masked
 * to 44 bits and r2 to 42 bits; r1 is masked to 44 bits and then has the
 * final carry added, matching the 44 + 44 + 42 layout described for the
 * neighbouring multiply macro.
 *
 * The macro expects s2, d0, d1, d2, d and c to be declared in the enclosing
 * scope and relies on the MUL, ADD, ADDLO, SHR and LO helper macros.
 * NOTE(review): those helpers are not visible in this part of the file;
 * presumably MUL(a, b, c) sets a = b * c on a wide type and ADD(a, b) does
 * a += b, so that ADD(dN, dN) doubles the cross product - confirm.
 *
 * s2 is r2 * 20 (5 << 2). The carry out of the top limb is folded back into
 * r0 multiplied by 5.
 */
#define SQR_64(r0, r1, r2) \
|
||||
s2 = r2 * (5 << 2); \
|
||||
MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d); \
|
||||
MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d); \
|
||||
MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d); \
|
||||
\
|
||||
c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \
|
||||
ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \
|
||||
ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \
|
||||
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
|
||||
r1 += c
|
||||
|
||||
/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
|
||||
*
|
||||
* r0-r2 contains the 130-bit number in 64-bit registers.
|
||||
* r is the address of where to store the 26 bits in 32 result.
|
||||
* r is the address of where to store the 26 of 32 bits result.
|
||||
*/
|
||||
#define CONV_64_TO_32(r0, r1, r2, r) \
|
||||
r[0] = (word32)( r0 ) & 0x3ffffff; \
|
||||
|
@ -925,8 +899,11 @@ POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
|
|||
r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \
|
||||
r[4] = (word32)( r2 >> 16 )
|
||||
|
||||
|
||||
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
||||
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
*/
|
||||
static void poly1305_calc_powers(Poly1305* ctx)
|
||||
{
|
||||
word64 r0, r1, r2, t0, t1, c;
|
||||
word64 r20, r21, r22;
|
||||
|
@ -935,46 +912,18 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
|||
word64 s1, s2;
|
||||
word128 d0, d1, d2, d;
|
||||
|
||||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
|
||||
t0 = ((word64*)key)[0];
|
||||
t1 = ((word64*)key)[1];
|
||||
r0 = ( t0 ) & 0xffc0fffffff;
|
||||
r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
|
||||
r2 = ((t1 >> 24) ) & 0x00ffffffc0f;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
|
||||
"vmovdqu %%ymm0, (%[h])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[h])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[h])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r0])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r1])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r2])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r3])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r4])\n\t"
|
||||
:
|
||||
: [h] "r" (ctx->hh), [r0] "r" (ctx->r0), [r1] "r" (ctx->r1),
|
||||
[r2] "r" (ctx->r2), [r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
|
||||
: "memory", "ymm0"
|
||||
);
|
||||
/* h = 0 */
|
||||
ctx->h[0] = 0;
|
||||
ctx->h[1] = 0;
|
||||
ctx->h[2] = 0;
|
||||
|
||||
/* save pad for later */
|
||||
ctx->pad[0] = ((word64*)key)[2];
|
||||
ctx->pad[1] = ((word64*)key)[3];
|
||||
|
||||
/* Set 1 for r^0 */
|
||||
ctx->r0[0] = 1;
|
||||
t0 = ctx->r[0];
|
||||
t1 = ctx->r[1];
|
||||
r0 = ( t0 ) & 0xfffffffffff;
|
||||
r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
|
||||
r2 = ((t1 >> 24) ) & 0x00fffffffff;
|
||||
|
||||
/* Store r^1 */
|
||||
CONV_64_TO_32(r0, r1, r2, ctx->r1);
|
||||
|
||||
/* Calc and store r^2 */
|
||||
r20 = r0; r21 = r1; r22 = r2;
|
||||
MUL_64(r20, r21, r22, r0, r1, r2);
|
||||
SQR_64(r20, r21, r22);
|
||||
CONV_64_TO_32(r20, r21, r22, ctx->r2);
|
||||
|
||||
/* Calc and store r^3 */
|
||||
|
@ -984,133 +933,83 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
|||
|
||||
/* Calc and store r^4 */
|
||||
r40 = r20; r41 = r21; r42 = r22;
|
||||
MUL_64(r40, r41, r42, r20, r21, r22);
|
||||
SQR_64(r40, r41, r42);
|
||||
CONV_64_TO_32(r40, r41, r42, ctx->r4);
|
||||
|
||||
/* NULL means use [r^4, r^4, r^4, r^4] */
|
||||
ctx->rp[0] = ctx->rp[1] = ctx->rp[2] = ctx->rp[3] = NULL;
|
||||
}
|
||||
|
||||
/* Message high bits set unless last partial block. */
|
||||
ctx->hibit[0] = ctx->hibit[1] = ctx->hibit[2] = ctx->hibit[3] = 0x1000000;
|
||||
/* Set the key to use when processing data.
|
||||
* Initializes the context for the AVX2 code path.
|
||||
* Calls the AVX set key function because the final function calls AVX code.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* key The key data (documented as 16 bytes).
* NOTE(review): callers of wc_Poly1305SetKey in this change pass a
* 32-byte key - confirm how many bytes poly1305_setkey_avx reads.
|
||||
*/
|
||||
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
||||
{
|
||||
/* Delegate the key setup itself to the AVX routine. */
poly1305_setkey_avx(ctx, key);
|
||||
|
||||
/* Clear ymm0, then store it five times at 32-byte steps starting at
* ctx->hh, i.e. 160 bytes beginning at hh are set to zero. */
__asm__ __volatile__ (
|
||||
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
|
||||
"vmovdqu %%ymm0, (%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 96(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 128(%[hh])\n\t"
|
||||
:
|
||||
: [hh] "r" (ctx->hh)
|
||||
: "memory", "ymm0"
|
||||
);
|
||||
|
||||
/* Reset the buffered-byte count and the finished/started flags. */
ctx->leftover = 0;
|
||||
ctx->finished = 0;
|
||||
ctx->started = 0;
|
||||
}
|
||||
|
||||
/* Calculate the final result - authentication data.
|
||||
* Zeros out the private data in the context.
|
||||
* Calls AVX final function to quickly process last blocks.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* mac Buffer to hold 16 bytes - authentication data.
|
||||
*/
|
||||
static void poly1305_final_avx2(Poly1305* ctx, byte* mac)
|
||||
{
|
||||
word64 h0, h1, h2, t0, t1, c;
|
||||
int i, j;
|
||||
int l = (int)ctx->leftover;
|
||||
|
||||
/* process the remaining block */
|
||||
if (ctx->leftover) {
|
||||
size_t i = ctx->leftover;
|
||||
|
||||
if (i & 15)
|
||||
ctx->buffer[i++] = 1;
|
||||
for (; i < POLY1305_BLOCK_SIZE * 4; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
|
||||
ctx->hibit[3] = 0;
|
||||
if (ctx->leftover < 48)
|
||||
ctx->hibit[2] = 0;
|
||||
if (ctx->leftover < 32)
|
||||
ctx->hibit[1] = 0;
|
||||
if (ctx->leftover < 16)
|
||||
ctx->hibit[0] = 0;
|
||||
|
||||
if (ctx->started) {
|
||||
if (ctx->leftover <= 16) {
|
||||
ctx->rp[0] = ctx->r4;
|
||||
ctx->rp[1] = ctx->r4;
|
||||
ctx->rp[2] = ctx->r3;
|
||||
ctx->rp[3] = ctx->r2;
|
||||
}
|
||||
else if (ctx->leftover <= 32) {
|
||||
ctx->rp[0] = ctx->r4;
|
||||
ctx->rp[1] = ctx->r4;
|
||||
ctx->rp[2] = ctx->r4;
|
||||
ctx->rp[3] = ctx->r3;
|
||||
}
|
||||
}
|
||||
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
|
||||
}
|
||||
if (ctx->started) {
|
||||
if (ctx->leftover == 0 || ctx->leftover > 48) {
|
||||
ctx->rp[0] = ctx->r4;
|
||||
ctx->rp[1] = ctx->r3;
|
||||
ctx->rp[2] = ctx->r2;
|
||||
ctx->rp[3] = ctx->r1;
|
||||
}
|
||||
else if (ctx->leftover > 32) {
|
||||
ctx->rp[0] = ctx->r3;
|
||||
ctx->rp[1] = ctx->r2;
|
||||
ctx->rp[2] = ctx->r1;
|
||||
ctx->rp[3] = ctx->r0;
|
||||
}
|
||||
else if (ctx->leftover > 16) {
|
||||
ctx->rp[0] = ctx->r2;
|
||||
ctx->rp[1] = ctx->r1;
|
||||
ctx->rp[2] = ctx->r0;
|
||||
ctx->rp[3] = ctx->r0;
|
||||
}
|
||||
else {
|
||||
ctx->rp[0] = ctx->r1;
|
||||
ctx->rp[1] = ctx->r0;
|
||||
ctx->rp[2] = ctx->r0;
|
||||
ctx->rp[3] = ctx->r0;
|
||||
}
|
||||
ctx->finished = 1;
|
||||
if (ctx->started)
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
|
||||
}
|
||||
|
||||
h0 = ctx->h[0];
|
||||
h1 = ctx->h[1];
|
||||
h2 = ctx->h[2];
|
||||
i = l & ~(POLY1305_BLOCK_SIZE - 1);
|
||||
if (i > 0)
|
||||
poly1305_blocks_avx(ctx, ctx->buffer, i);
|
||||
ctx->leftover -= i;
|
||||
for (j = 0; i < l; i++, j++)
|
||||
ctx->buffer[j] = ctx->buffer[i];
|
||||
|
||||
/* h = (h + pad) */
|
||||
t0 = ctx->pad[0];
|
||||
t1 = ctx->pad[1];
|
||||
|
||||
h0 += (( t0 ) & 0xfffffffffff) ;
|
||||
c = (h0 >> 44); h0 &= 0xfffffffffff;
|
||||
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
|
||||
c = (h1 >> 44); h1 &= 0xfffffffffff;
|
||||
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c;
|
||||
h2 &= 0x3ffffffffff;
|
||||
|
||||
/* mac = h % (2^128) */
|
||||
h0 = ((h0 ) | (h1 << 44));
|
||||
h1 = ((h1 >> 20) | (h2 << 24));
|
||||
|
||||
((word64*)mac)[0] = h0;
|
||||
((word64*)mac)[1] = h1;
|
||||
poly1305_final_avx(ctx, mac);
|
||||
|
||||
/* zero out the state */
|
||||
__asm__ __volatile__ (
|
||||
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
|
||||
"vmovdqu %%ymm0, (%[h])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[h])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[h])\n\t"
|
||||
"vmovdqu %%ymm0, (%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 96(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 128(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r1])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r2])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r3])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r4])\n\t"
|
||||
:
|
||||
: [h] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
|
||||
: [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
|
||||
[r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
|
||||
: "memory", "ymm0"
|
||||
);
|
||||
ctx->h[0] = 0;
|
||||
ctx->h[1] = 0;
|
||||
ctx->h[2] = 0;
|
||||
ctx->r[0] = 0;
|
||||
ctx->r[1] = 0;
|
||||
ctx->r[2] = 0;
|
||||
ctx->pad[0] = 0;
|
||||
ctx->pad[1] = 0;
|
||||
|
||||
ctx->leftover = 0;
|
||||
ctx->finished = 0;
|
||||
ctx->started = 0;
|
||||
}
|
||||
|
@ -1298,7 +1197,7 @@ static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||
static void poly1305_block(Poly1305* ctx, const unsigned char *m)
|
||||
{
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
/* AVX2 does 4 blocks at a time - this func not used. */
|
||||
/* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
|
||||
poly1305_block_avx(ctx, m);
|
||||
#else
|
||||
poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
|
||||
|
@ -1595,23 +1494,30 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
|
|||
if (IS_INTEL_AVX2(intel_flags)) {
|
||||
/* handle leftover */
|
||||
if (ctx->leftover) {
|
||||
size_t want = (4 * POLY1305_BLOCK_SIZE - ctx->leftover);
|
||||
size_t want = sizeof(ctx->buffer) - ctx->leftover;
|
||||
if (want > bytes)
|
||||
want = bytes;
|
||||
|
||||
for (i = 0; i < want; i++)
|
||||
ctx->buffer[ctx->leftover + i] = m[i];
|
||||
bytes -= (word32)want;
|
||||
m += want;
|
||||
ctx->leftover += want;
|
||||
if (ctx->leftover < 4 * POLY1305_BLOCK_SIZE)
|
||||
if (ctx->leftover < sizeof(ctx->buffer))
|
||||
return 0;
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, 4 * POLY1305_BLOCK_SIZE);
|
||||
|
||||
if (!ctx->started)
|
||||
poly1305_calc_powers(ctx);
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
|
||||
ctx->leftover = 0;
|
||||
}
|
||||
|
||||
/* process full blocks */
|
||||
if (bytes >= 4 * POLY1305_BLOCK_SIZE) {
|
||||
size_t want = (bytes & ~(4 * POLY1305_BLOCK_SIZE - 1));
|
||||
if (bytes >= sizeof(ctx->buffer)) {
|
||||
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
|
||||
|
||||
if (!ctx->started)
|
||||
poly1305_calc_powers(ctx);
|
||||
poly1305_blocks_avx2(ctx, m, want);
|
||||
m += want;
|
||||
bytes -= (word32)want;
|
||||
|
|
|
@ -3662,6 +3662,30 @@ int poly1305_test(void)
|
|||
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
|
||||
};
|
||||
|
||||
static const byte msg6[] =
|
||||
{
|
||||
0xd3,0x1a,0x8d,0x34,0x64,0x8e,0x60,0xdb,
|
||||
0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2,
|
||||
0xa4,0xad,0xed,0x51,0x29,0x6e,0x08,0xfe,
|
||||
0xa9,0xe2,0xb5,0xa7,0x36,0xee,0x62,0xd6,
|
||||
0x3d,0xbe,0xa4,0x5e,0x8c,0xa9,0x67,0x12,
|
||||
0x82,0xfa,0xfb,0x69,0xda,0x92,0x72,0x8b,
|
||||
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
|
||||
0x1a,0x71,0xde,0x0a,0x9e,0x06,0x0b,0x29,
|
||||
0xa9,0xe2,0xb5,0xa7,0x36,0xee,0x62,0xd6,
|
||||
0x3d,0xbe,0xa4,0x5e,0x8c,0xa9,0x67,0x12,
|
||||
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
|
||||
0x05,0xd6,0xa5,0xb6,0x7e,0xcd,0x3b,0x36,
|
||||
0x92,0xdd,0xbd,0x7f,0x2d,0x77,0x8b,0x8c,
|
||||
0x7b,0x86,0xaf,0xbc,0x53,0xef,0x7e,0xc2,
|
||||
0x98,0x03,0xae,0xe3,0x28,0x09,0x1b,0x58,
|
||||
0xfa,0xb3,0x24,0xe4,0xfa,0xd6,0x75,0x94,
|
||||
0x55,0x85,0x80,0x8b,0x48,0x31,0xd7,0xbc,
|
||||
0x3f,0xf4,0xde,0xf0,0x8e,0x4b,0x7a,0x9d,
|
||||
0xe5,0x76,0xd2,0x65,0x86,0xce,0xc6,0x4b,
|
||||
0x61,0x16
|
||||
};
|
||||
|
||||
byte additional[] =
|
||||
{
|
||||
0x50,0x51,0x52,0x53,0xc0,0xc1,0xc2,0xc3,
|
||||
|
@ -3704,6 +3728,12 @@ int poly1305_test(void)
|
|||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
};
|
||||
|
||||
static const byte correct6[] =
|
||||
{
|
||||
0xea,0x11,0x5c,0x4f,0xd0,0xc0,0x10,0xae,
|
||||
0xf7,0xdf,0xda,0x77,0xa2,0xe9,0xaf,0xca
|
||||
};
|
||||
|
||||
static const byte key[] = {
|
||||
0x85,0xd6,0xbe,0x78,0x57,0x55,0x6d,0x33,
|
||||
0x7f,0x44,0x52,0xfe,0x42,0xd5,0x06,0xa8,
|
||||
|
@ -3732,42 +3762,43 @@ int poly1305_test(void)
|
|||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
|
||||
const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5};
|
||||
const byte* msgs[] = {NULL, msg1, msg2, msg3, msg5, msg6};
|
||||
word32 szm[] = {0, sizeof(msg1), sizeof(msg2),
|
||||
sizeof(msg3), sizeof(msg5)};
|
||||
const byte* keys[] = {key, key, key2, key2, key5};
|
||||
const byte* tests[] = {correct0, correct1, correct2, correct3, correct5};
|
||||
sizeof(msg3), sizeof(msg5), sizeof(msg6)};
|
||||
const byte* keys[] = {key, key, key2, key2, key5, key};
|
||||
const byte* tests[] = {correct0, correct1, correct2, correct3, correct5,
|
||||
correct6};
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
for (i = 0; i < 6; i++) {
|
||||
ret = wc_Poly1305SetKey(&enc, keys[i], 32);
|
||||
if (ret != 0)
|
||||
return -3600 + i;
|
||||
return -3600 - i;
|
||||
|
||||
ret = wc_Poly1305Update(&enc, msgs[i], szm[i]);
|
||||
if (ret != 0)
|
||||
return -3605 + i;
|
||||
return -3610 - i;
|
||||
|
||||
ret = wc_Poly1305Final(&enc, tag);
|
||||
if (ret != 0)
|
||||
return -36108 + i;
|
||||
return -3620 - i;
|
||||
|
||||
if (XMEMCMP(tag, tests[i], sizeof(tag)))
|
||||
return -3615 + i;
|
||||
return -3630 - i;
|
||||
}
|
||||
|
||||
/* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */
|
||||
XMEMSET(tag, 0, sizeof(tag));
|
||||
ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4));
|
||||
if (ret != 0)
|
||||
return -3614;
|
||||
return -3650;
|
||||
|
||||
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
|
||||
(byte*)msg4, sizeof(msg4), tag, sizeof(tag));
|
||||
if (ret != 0)
|
||||
return -3615;
|
||||
return -3651;
|
||||
|
||||
if (XMEMCMP(tag, correct4, sizeof(tag)))
|
||||
return -3616;
|
||||
return -3652;
|
||||
|
||||
/* Check fail of TLS MAC function if altering additional data */
|
||||
XMEMSET(tag, 0, sizeof(tag));
|
||||
|
@ -3775,10 +3806,10 @@ int poly1305_test(void)
|
|||
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
|
||||
(byte*)msg4, sizeof(msg4), tag, sizeof(tag));
|
||||
if (ret != 0)
|
||||
return -3617;
|
||||
return -3653;
|
||||
|
||||
if (XMEMCMP(tag, correct4, sizeof(tag)) == 0)
|
||||
return -3618;
|
||||
return -3654;
|
||||
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -68,16 +68,14 @@ typedef struct Poly1305 {
|
|||
word64 r[3];
|
||||
word64 h[3];
|
||||
word64 pad[2];
|
||||
word64 hh[14];
|
||||
word32 r0[8];
|
||||
word64 hh[20];
|
||||
word32 r1[8];
|
||||
word32 r2[8];
|
||||
word32 r3[8];
|
||||
word32 r4[8];
|
||||
word32* rp[4];
|
||||
word64 hibit[4];
|
||||
word64 hm[16];
|
||||
unsigned char buffer[8*POLY1305_BLOCK_SIZE];
|
||||
size_t leftover;
|
||||
unsigned char buffer[4*POLY1305_BLOCK_SIZE];
|
||||
unsigned char finished;
|
||||
unsigned char started;
|
||||
#else
|
||||
|
|
Loading…
Reference in New Issue