Merge pull request #1271 from SparkiDev/chacha20_sb

Improve performance of small number of blocks for chacha20
pull/1298/head
toddouska 2018-01-02 09:40:49 -08:00 committed by GitHub
commit f2375f3fee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1055 additions and 489 deletions

View File

@ -220,22 +220,22 @@ typedef struct bench_alg {
static const bench_alg bench_cipher_opt[] = {
{ "-cipher", -1 },
#ifdef HAVE_AES_CBC
{ "-aes_cbc", BENCH_AES_CBC },
{ "-aes-cbc", BENCH_AES_CBC },
#endif
#ifdef HAVE_AESGCM
{ "-aes_gcm", BENCH_AES_GCM },
{ "-aes-gcm", BENCH_AES_GCM },
#endif
#ifdef WOLFSSL_AES_DIRECT
{ "-aes_ecb", BENCH_AES_ECB },
{ "-aes-ecb", BENCH_AES_ECB },
#endif
#ifdef WOLFSSL_AES_XTS
{ "-aes_xts", BENCH_AES_XTS },
{ "-aes-xts", BENCH_AES_XTS },
#endif
#ifdef WOLFSSL_AES_COUNTER
{ "-aes_ctr", BENCH_AES_CTR },
{ "-aes-ctr", BENCH_AES_CTR },
#endif
#ifdef HAVE_AESCCM
{ "-aes_ccm", BENCH_AES_CCM },
{ "-aes-ccm", BENCH_AES_CCM },
#endif
#ifdef HAVE_CAMELLIA
{ "-camellia", BENCH_CAMELLIA },
@ -253,7 +253,7 @@ static const bench_alg bench_cipher_opt[] = {
{ "-chacha20", BENCH_CHACHA20 },
#endif
#if defined(HAVE_CHACHA) && defined(HAVE_POLY1305)
{ "-chacha20_poly1305", BENCH_CHACHA20_POLY1305 },
{ "-chacha20-poly1305", BENCH_CHACHA20_POLY1305 },
#endif
#ifndef NO_DES3
{ "-des", BENCH_DES },
@ -295,16 +295,16 @@ static const bench_alg bench_digest_opt[] = {
#ifdef WOLFSSL_SHA3
{ "-sha3", BENCH_SHA3 },
#ifndef WOLFSSL_NOSHA3_224
{ "-sha3_224", BENCH_SHA3_224 },
{ "-sha3-224", BENCH_SHA3_224 },
#endif
#ifndef WOLFSSL_NOSHA3_256
{ "-sha3_256", BENCH_SHA3_256 },
{ "-sha3-256", BENCH_SHA3_256 },
#endif
#ifndef WOLFSSL_NOSHA3_384
{ "-sha3_384", BENCH_SHA3_384 },
{ "-sha3-384", BENCH_SHA3_384 },
#endif
#ifndef WOLFSSL_NOSHA3_512
{ "-sha3_512", BENCH_SHA3_512 },
{ "-sha3-512", BENCH_SHA3_512 },
#endif
#endif
#ifdef WOLFSSL_RIPEMD
@ -325,22 +325,22 @@ static const bench_alg bench_mac_opt[] = {
#ifndef NO_HMAC
{ "-hmac", BENCH_HMAC },
#ifndef NO_MD5
{ "-hmac_md5", BENCH_HMAC_MD5 },
{ "-hmac-md5", BENCH_HMAC_MD5 },
#endif
#ifndef NO_SHA
{ "-hmac_sha", BENCH_HMAC_SHA },
{ "-hmac-sha", BENCH_HMAC_SHA },
#endif
#ifdef WOLFSSL_SHA224
{ "-hmac_sha224", BENCH_HMAC_SHA224 },
{ "-hmac-sha224", BENCH_HMAC_SHA224 },
#endif
#ifndef NO_SHA256
{ "-hmac_sha256", BENCH_HMAC_SHA256 },
{ "-hmac-sha256", BENCH_HMAC_SHA256 },
#endif
#ifdef WOLFSSL_SHA384
{ "-hmac_sha384", BENCH_HMAC_SHA384 },
{ "-hmac-sha384", BENCH_HMAC_SHA384 },
#endif
#ifdef WOLFSSL_SHA512
{ "-hmac_sha512", BENCH_HMAC_SHA512 },
{ "-hmac-sha512", BENCH_HMAC_SHA512 },
#endif
#endif
{ NULL, 0}
@ -351,7 +351,7 @@ static const bench_alg bench_asym_opt[] = {
{ "-asym", -1 },
#ifndef NO_RSA
#ifdef WOLFSSL_KEY_GEN
{ "-rsa_kg", BENCH_RSA_KEYGEN },
{ "-rsa-kg", BENCH_RSA_KEYGEN },
#endif
{ "-rsa", BENCH_RSA },
#endif
@ -360,13 +360,13 @@ static const bench_alg bench_asym_opt[] = {
#endif
#ifdef HAVE_NTRU
{ "-ntru", BENCH_NTRU },
{ "-ntru_kg", BENCH_NTRU_KEYGEN },
{ "-ntru-kg", BENCH_NTRU_KEYGEN },
#endif
#ifdef HAVE_ECC
{ "-ecc_kg", BENCH_ECC_MAKEKEY },
{ "-ecc-kg", BENCH_ECC_MAKEKEY },
{ "-ecc", BENCH_ECC },
#ifdef HAVE_ECC_ENCRYPT
{ "-ecc_enc", BENCH_ECC_ENCRYPT },
{ "-ecc-enc", BENCH_ECC_ENCRYPT },
#endif
#endif
#ifdef HAVE_CURVE25519
@ -376,7 +376,7 @@ static const bench_alg bench_asym_opt[] = {
#endif
#endif
#ifdef HAVE_ED25519
{ "-ed25519_kg", BENCH_ED25519_KEYGEN },
{ "-ed25519-kg", BENCH_ED25519_KEYGEN },
{ "-ed25519", BENCH_ED25519_SIGN },
#endif
{ NULL, 0}
@ -654,6 +654,7 @@ static THREAD_LS_T int devId = INVALID_DEVID;
#define AES_AUTH_ADD_SZ 13
#define AES_AUTH_TAG_SZ 16
#define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
#endif
#ifndef BENCH_CIPHER_ADD
#define BENCH_CIPHER_ADD 0
@ -663,24 +664,25 @@ static THREAD_LS_T int devId = INVALID_DEVID;
/* use kB instead of mB for embedded benchmarking */
#ifdef BENCH_EMBEDDED
enum BenchmarkBounds {
numBlocks = 25, /* how many kB to test (en/de)cryption */
scryptCnt = 1,
ntimes = 2,
genTimes = BENCH_MAX_PENDING,
agreeTimes = 2
};
static int numBlocks = 25; /* how many kB to test (en/de)cryption */
static word32 bench_size = (1024ul);
#else
enum BenchmarkBounds {
numBlocks = 5, /* how many megs to test (en/de)cryption */
scryptCnt = 10,
ntimes = 100,
genTimes = BENCH_MAX_PENDING, /* must be at least BENCH_MAX_PENDING */
agreeTimes = 100
};
static int numBlocks = 5; /* how many megs to test (en/de)cryption */
static word32 bench_size = (1024*1024ul);
#endif
static int base2 = 1;
static int digest_stream = 1;
/* for compatibility */
#define BENCH_SIZE bench_size
@ -1741,7 +1743,7 @@ static void bench_aesgcm_internal(int doAsync, const byte* key, word32 keySz,
ret = wc_AesGcmEncrypt(&enc[i], bench_cipher,
bench_plain, BENCH_SIZE,
iv, ivSz, bench_tag, AES_AUTH_TAG_SZ,
bench_additional, AES_AUTH_ADD_SZ);
bench_additional, aesAuthAddSz);
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times, &pending)) {
goto exit_aes_gcm;
}
@ -1765,7 +1767,7 @@ exit_aes_gcm:
ret = wc_AesGcmDecrypt(&enc[i], bench_plain,
bench_cipher, BENCH_SIZE,
iv, ivSz, bench_tag, AES_AUTH_TAG_SZ,
bench_additional, AES_AUTH_ADD_SZ);
bench_additional, aesAuthAddSz);
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, &times, &pending)) {
goto exit_aes_gcm_dec;
}
@ -1977,7 +1979,7 @@ static void bench_aesctr_internal(const byte* key, word32 keySz, const byte* iv,
{
Aes enc;
double start;
int i, count, ret;
int i, count, ret = 0;
wc_AesSetKeyDirect(&enc, key, keySz, iv, AES_ENCRYPTION);
@ -2026,7 +2028,7 @@ void bench_aesccm(void)
for (i = 0; i < numBlocks; i++) {
wc_AesCcmEncrypt(&enc, bench_cipher, bench_plain, BENCH_SIZE,
bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ,
bench_additional, AES_AUTH_ADD_SZ);
bench_additional, aesAuthAddSz);
}
count += i;
} while (bench_stats_sym_check(start));
@ -2336,6 +2338,7 @@ void bench_md5(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitMd5_ex(&hash[i], HEAP_HINT,
@ -2381,6 +2384,20 @@ void bench_md5(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitMd5_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Md5Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Md5Final(hash, digest[0]);
if (ret != 0)
goto exit_md5;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_md5:
bench_stats_sym_finish("MD5", doAsync, count, bench_size, start, ret);
@ -2408,6 +2425,7 @@ void bench_sha(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha_ex(&hash[i], HEAP_HINT,
@ -2453,6 +2471,20 @@ void bench_sha(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_ShaUpdate(hash, bench_plain, BENCH_SIZE);
ret |= wc_ShaFinal(hash, digest[0]);
if (ret != 0)
goto exit_sha;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha:
bench_stats_sym_finish("SHA", doAsync, count, bench_size, start, ret);
@ -2472,12 +2504,13 @@ void bench_sha224(int doAsync)
{
wc_Sha224 hash[BENCH_MAX_PENDING];
double start;
int ret, i, count = 0, times, pending = 0;
int ret = 0, i, count = 0, times, pending = 0;
DECLARE_ARRAY(digest, byte, BENCH_MAX_PENDING, WC_SHA224_DIGEST_SIZE, HEAP_HINT);
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha224_ex(&hash[i], HEAP_HINT,
@ -2519,6 +2552,20 @@ void bench_sha224(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha224_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha224Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha224Final(hash, digest[0]);
if (ret != 0)
goto exit_sha224;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha224:
bench_stats_sym_finish("SHA-224", doAsync, count, bench_size, start, ret);
@ -2543,6 +2590,7 @@ void bench_sha256(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha256_ex(&hash[i], HEAP_HINT,
@ -2587,6 +2635,20 @@ void bench_sha256(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha256_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha256Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha256Final(hash, digest[0]);
if (ret != 0)
goto exit_sha256;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha256:
bench_stats_sym_finish("SHA-256", doAsync, count, bench_size, start, ret);
@ -2611,6 +2673,7 @@ void bench_sha384(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha384_ex(&hash[i], HEAP_HINT,
@ -2652,6 +2715,20 @@ void bench_sha384(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha384_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha384Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha384Final(hash, digest[0]);
if (ret != 0)
goto exit_sha384;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha384:
bench_stats_sym_finish("SHA-384", doAsync, count, bench_size, start, ret);
@ -2676,6 +2753,7 @@ void bench_sha512(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha512_ex(&hash[i], HEAP_HINT,
@ -2717,6 +2795,20 @@ void bench_sha512(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha512_ex(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha512Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha512Final(hash, digest[0]);
if (ret != 0)
goto exit_sha512;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha512:
bench_stats_sym_finish("SHA-512", doAsync, count, bench_size, start, ret);
@ -2743,6 +2835,7 @@ void bench_sha3_224(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha3_224(&hash[i], HEAP_HINT,
@ -2784,6 +2877,20 @@ void bench_sha3_224(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha3_224(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha3_224_Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha3_224_Final(hash, digest[0]);
if (ret != 0)
goto exit_sha3_224;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha3_224:
bench_stats_sym_finish("SHA3-224", doAsync, count, bench_size, start, ret);
@ -2808,6 +2915,7 @@ void bench_sha3_256(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha3_256(&hash[i], HEAP_HINT,
@ -2849,6 +2957,20 @@ void bench_sha3_256(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha3_256(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha3_256_Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha3_256_Final(hash, digest[0]);
if (ret != 0)
goto exit_sha3_256;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha3_256:
bench_stats_sym_finish("SHA3-256", doAsync, count, bench_size, start, ret);
@ -2873,6 +2995,7 @@ void bench_sha3_384(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha3_384(&hash[i], HEAP_HINT,
@ -2914,6 +3037,20 @@ void bench_sha3_384(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha3_384(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha3_384_Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha3_384_Final(hash, digest[0]);
if (ret != 0)
goto exit_sha3_384;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha3_384:
bench_stats_sym_finish("SHA3-384", doAsync, count, bench_size, start, ret);
@ -2938,6 +3075,7 @@ void bench_sha3_512(int doAsync)
/* clear for done cleanup */
XMEMSET(hash, 0, sizeof(hash));
if (digest_stream) {
/* init keys */
for (i = 0; i < BENCH_MAX_PENDING; i++) {
ret = wc_InitSha3_512(&hash[i], HEAP_HINT,
@ -2979,6 +3117,20 @@ void bench_sha3_512(int doAsync)
} /* for i */
} while (pending > 0);
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (times = 0; times < numBlocks; times++) {
ret = wc_InitSha3_512(hash, HEAP_HINT, INVALID_DEVID);
ret |= wc_Sha3_512_Update(hash, bench_plain, BENCH_SIZE);
ret |= wc_Sha3_512_Final(hash, digest[0]);
if (ret != 0)
goto exit_sha3_512;
} /* for times */
count += times;
} while (bench_stats_sym_check(start));
}
exit_sha3_512:
bench_stats_sym_finish("SHA3-512", doAsync, count, bench_size, start, ret);
@ -3000,8 +3152,9 @@ int bench_ripemd(void)
RipeMd hash;
byte digest[RIPEMD_DIGEST_SIZE];
double start;
int i, count, ret;
int i, count, ret = 0;
if (digest_stream) {
ret = wc_InitRipeMd(&hash);
if (ret != 0) {
return ret;
@ -3022,6 +3175,27 @@ int bench_ripemd(void)
count += i;
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_InitRipeMd(&hash);
if (ret != 0) {
return ret;
}
ret = wc_RipeMdUpdate(&hash, bench_plain, BENCH_SIZE);
if (ret != 0) {
return ret;
}
ret = wc_RipeMdFinal(&hash, digest);
if (ret != 0) {
return ret;
}
}
count += i;
} while (bench_stats_sym_check(start));
}
bench_stats_sym_finish("RIPEMD", 0, count, bench_size, start, ret);
return 0;
@ -3037,6 +3211,7 @@ void bench_blake2(void)
double start;
int ret, i, count;
if (digest_stream) {
ret = wc_InitBlake2b(&b2b, 64);
if (ret != 0) {
printf("InitBlake2b failed, ret = %d\n", ret);
@ -3059,6 +3234,30 @@ void bench_blake2(void)
}
count += i;
} while (bench_stats_sym_check(start));
}
else {
bench_stats_start(&count, &start);
do {
for (i = 0; i < numBlocks; i++) {
ret = wc_InitBlake2b(&b2b, 64);
if (ret != 0) {
printf("InitBlake2b failed, ret = %d\n", ret);
return;
}
ret = wc_Blake2bUpdate(&b2b, bench_plain, BENCH_SIZE);
if (ret != 0) {
printf("Blake2bUpdate failed, ret = %d\n", ret);
return;
}
ret = wc_Blake2bFinal(&b2b, digest, 64);
if (ret != 0) {
printf("Blake2bFinal failed, ret = %d\n", ret);
return;
}
}
count += i;
} while (bench_stats_sym_check(start));
}
bench_stats_sym_finish("BLAKE2b", 0, count, bench_size, start, ret);
}
#endif
@ -4505,6 +4704,7 @@ void benchmark_configure(int block_size)
{
/* must be greater than 0 */
if (block_size > 0) {
numBlocks = numBlocks * bench_size / block_size;
bench_size = (word32)block_size;
}
}
@ -4542,6 +4742,10 @@ static void Usage(void)
printf("benchmark\n");
printf("-? Help, print this usage\n");
printf("-base10 Display bytes as power of 10 (eg 1 kB = 1000 Bytes)\n");
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
printf("-no_aad No additional authentication data passed.\n");
#endif
printf("-dgst_full Full digest operation performed.\n");
#ifndef WOLFSSL_BENCHMARK_ALL
printf("-<alg> Algorithm to benchmark. Available algorithms "
"include:\n");
@ -4597,6 +4801,12 @@ int main(int argc, char** argv)
}
else if (string_matches(argv[1], "-base10"))
base2 = 0;
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
else if (string_matches(argv[1], "-no_aad"))
aesAuthAddSz = 0;
#endif
else if (string_matches(argv[1], "-dgst_full"))
digest_stream = 0;
else if (argv[1][0] == '-') {
optMatched = 0;
#ifndef WOLFSSL_BENCHMARK_ALL

View File

@ -74,6 +74,9 @@
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif
static int cpuidFlagsSet = 0;
static int cpuidFlags = 0;
#endif
#ifdef BIG_ENDIAN_ORDER
@ -413,19 +416,379 @@ static INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS],
#ifdef USE_INTEL_CHACHA_SPEEDUP
#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
"addl "#r12", "#r11"\n\t" \
"addl "#r22", "#r21"\n\t" \
"xorl "#r11", "#r14"\n\t" \
"xorl "#r21", "#r24"\n\t" \
"roll $16, "#r14"\n\t" \
"roll $16, "#r24"\n\t" \
"addl "#r14", "#r13"\n\t" \
"addl "#r24", "#r23"\n\t" \
"xorl "#r13", "#r12"\n\t" \
"xorl "#r23", "#r22"\n\t" \
"roll $12, "#r12"\n\t" \
"roll $12, "#r22"\n\t" \
"addl "#r12", "#r11"\n\t" \
"addl "#r22", "#r21"\n\t" \
"xorl "#r11", "#r14"\n\t" \
"xorl "#r21", "#r24"\n\t" \
"roll $8, "#r14"\n\t" \
"roll $8, "#r24"\n\t" \
"addl "#r14", "#r13"\n\t" \
"addl "#r24", "#r23"\n\t" \
"xorl "#r13", "#r12"\n\t" \
"xorl "#r23", "#r22"\n\t" \
"roll $7, "#r12"\n\t" \
"roll $7, "#r22"\n\t" \
#define CHACHA_CRYPT_X64() \
"subq $40, %%rsp\n\t" \
"movq 32(%[input]), %%rax\n\t" \
"movq 40(%[input]), %%rdx\n\t" \
"movq %%rax, 8(%%rsp)\n\t" \
"movq %%rdx, 16(%%rsp)\n\t" \
"movl 0(%[input]), %%eax\n\t" \
"movl 4(%[input]), %%ebx\n\t" \
"movl 8(%[input]), %%ecx\n\t" \
"movl 12(%[input]), %%edx\n\t" \
"movl 16(%[input]), %%r8d\n\t" \
"movl 20(%[input]), %%r9d\n\t" \
"movl 24(%[input]), %%r10d\n\t" \
"movl 28(%[input]), %%r11d\n\t" \
"movl 48(%[input]), %%r12d\n\t" \
"movl 52(%[input]), %%r13d\n\t" \
"movl 56(%[input]), %%r14d\n\t" \
"movl 60(%[input]), %%r15d\n\t" \
"movb $10, (%%rsp)\n\t" \
"movq %%rsi, 32(%%rsp)\n\t" \
"movq %%rdi, 24(%%rsp)\n\t" \
"movl 8(%%rsp), %%esi\n\t" \
"movl 12(%%rsp), %%edi\n\t" \
"\n" \
"1:\n\t" \
QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \
%%ebx, %%r9d, %%edi, %%r13d) \
"movl %%esi, 8(%%rsp)\n\t" \
"movl %%edi, 12(%%rsp)\n\t" \
"movl 16(%%rsp), %%esi\n\t" \
"movl 20(%%rsp), %%edi\n\t" \
QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \
%%edx, %%r11d, %%edi, %%r15d) \
QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \
%%ebx, %%r10d, %%edi, %%r12d) \
"movl %%esi, 16(%%rsp)\n\t" \
"movl %%edi, 20(%%rsp)\n\t" \
"movl 8(%%rsp), %%esi\n\t" \
"movl 12(%%rsp), %%edi\n\t" \
QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \
%%edx, %%r8d, %%edi, %%r14d) \
"decb (%%rsp)\n\t" \
"jnz 1b\n\t" \
"movl %%esi, 8(%%rsp)\n\t" \
"movl %%edi, 12(%%rsp)\n\t" \
"movq 32(%%rsp), %%rsi\n\t" \
"movq 24(%%rsp), %%rdi\n\t" \
"addl 0(%[input]), %%eax\n\t" \
"addl 4(%[input]), %%ebx\n\t" \
"addl 8(%[input]), %%ecx\n\t" \
"addl 12(%[input]), %%edx\n\t" \
"addl 16(%[input]), %%r8d\n\t" \
"addl 20(%[input]), %%r9d\n\t" \
"addl 24(%[input]), %%r10d\n\t" \
"addl 28(%[input]), %%r11d\n\t" \
"addl 48(%[input]), %%r12d\n\t" \
"addl 52(%[input]), %%r13d\n\t" \
"addl 56(%[input]), %%r14d\n\t" \
"addl 60(%[input]), %%r15d\n\t" \
#define CHACHA_PARTIAL_CHUNK_X64() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_X64() \
"movl %%eax , 0(%[c])\n\t" \
"movl %%ebx , 4(%[c])\n\t" \
"movl %%ecx , 8(%[c])\n\t" \
"movl %%edx , 12(%[c])\n\t" \
"movl %%r8d , 16(%[c])\n\t" \
"movl %%r9d , 20(%[c])\n\t" \
"movl %%r10d, 24(%[c])\n\t" \
"movl %%r11d, 28(%[c])\n\t" \
"movl %%r12d, 48(%[c])\n\t" \
"movl %%r13d, 52(%[c])\n\t" \
"movl %%r14d, 56(%[c])\n\t" \
"movl %%r15d, 60(%[c])\n\t" \
"movl 8(%%rsp), %%eax\n\t" \
"movl 12(%%rsp), %%ebx\n\t" \
"movl 16(%%rsp), %%ecx\n\t" \
"movl 20(%%rsp), %%edx\n\t" \
"addl 32(%[input]), %%eax\n\t" \
"addl 36(%[input]), %%ebx\n\t" \
"addl 40(%[input]), %%ecx\n\t" \
"addl 44(%[input]), %%edx\n\t" \
"movl %%eax , 32(%[c])\n\t" \
"movl %%ebx , 36(%[c])\n\t" \
"movl %%ecx , 40(%[c])\n\t" \
"movl %%edx , 44(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[output], %%rax\n\t" \
"movq %[m], %%rbx\n\t" \
"movl %[bytes], %%r8d\n\t" \
"xorq %%rdx, %%rdx\n\t" \
"movl %%r8d, %%r9d\n\t" \
"andl $7, %%r9d\n\t" \
"jz 4f\n\t" \
"\n" \
"2:\n\t" \
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
"xorb (%%rbx,%%rdx,1), %%cl\n\t" \
"movb %%cl, (%%rax,%%rdx,1)\n\t" \
"incl %%edx\n\t" \
"cmpl %%r9d, %%edx\n\t" \
"jne 2b\n\t" \
"je 3f\n\t" \
"\n" \
"4:\n\t" \
"movq (%[c],%%rdx,1), %%rcx\n\t" \
"xorq (%%rbx,%%rdx,1), %%rcx\n\t" \
"movq %%rcx, (%%rax,%%rdx,1)\n\t" \
"addl $8, %%edx\n\t" \
"\n" \
"3:\n\t" \
"cmpl %%r8d, %%edx\n\t" \
"jne 4b\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (x), \
[output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
"r14", "r15", "memory" \
)
#define CHACHA_CHUNK_X64() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_X64() \
"movq %%rsi, 32(%%rsp)\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[m], %%rsi\n\t" \
"subq $40, %%rsp\n\t" \
"xorl 0(%%rsi), %%eax\n\t" \
"xorl 4(%%rsi), %%ebx\n\t" \
"xorl 8(%%rsi), %%ecx\n\t" \
"xorl 12(%%rsi), %%edx\n\t" \
"xorl 16(%%rsi), %%r8d\n\t" \
"xorl 20(%%rsi), %%r9d\n\t" \
"xorl 24(%%rsi), %%r10d\n\t" \
"xorl 28(%%rsi), %%r11d\n\t" \
"xorl 48(%%rsi), %%r12d\n\t" \
"xorl 52(%%rsi), %%r13d\n\t" \
"xorl 56(%%rsi), %%r14d\n\t" \
"xorl 60(%%rsi), %%r15d\n\t" \
"movq 32(%%rsp), %%rsi\n\t" \
"movl %%eax , 0(%[c])\n\t" \
"movl %%ebx , 4(%[c])\n\t" \
"movl %%ecx , 8(%[c])\n\t" \
"movl %%edx , 12(%[c])\n\t" \
"movl %%r8d , 16(%[c])\n\t" \
"movl %%r9d , 20(%[c])\n\t" \
"movl %%r10d, 24(%[c])\n\t" \
"movl %%r11d, 28(%[c])\n\t" \
"movl %%r12d, 48(%[c])\n\t" \
"movl %%r13d, 52(%[c])\n\t" \
"movl %%r14d, 56(%[c])\n\t" \
"movl %%r15d, 60(%[c])\n\t" \
"addq $40, %%rsp\n\t" \
"movq %[m], %%r8\n\t" \
"subq $40, %%rsp\n\t" \
"movl 8(%%rsp), %%eax\n\t" \
"movl 12(%%rsp), %%ebx\n\t" \
"movl 16(%%rsp), %%ecx\n\t" \
"movl 20(%%rsp), %%edx\n\t" \
"addl 32(%[input]), %%eax\n\t" \
"addl 36(%[input]), %%ebx\n\t" \
"addl 40(%[input]), %%ecx\n\t" \
"addl 44(%[input]), %%edx\n\t" \
"xorl 32(%%r8), %%eax\n\t" \
"xorl 36(%%r8), %%ebx\n\t" \
"xorl 40(%%r8), %%ecx\n\t" \
"xorl 44(%%r8), %%edx\n\t" \
"movl %%eax , 32(%[c])\n\t" \
"movl %%ebx , 36(%[c])\n\t" \
"movl %%ecx , 40(%[c])\n\t" \
"movl %%edx , 44(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"addq $40, %%rsp\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
"r14", "r15", "memory" \
)
static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
{
word32 x[CHACHA_CHUNK_WORDS];
if (bytes == 0)
return;
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_X64();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_X64();
}
}
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
#define QUARTERROUND_2_AVX() \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl16], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $12, %%xmm1\n\t" \
"psrld $20, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl8], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $7, %%xmm1\n\t" \
"psrld $25, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"# Swap words for next round\n\t" \
"pshufd $0x39, %%xmm1, %%xmm1\n\t" \
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"pshufd $0x93, %%xmm3, %%xmm3\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl16], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $12, %%xmm1\n\t" \
"psrld $20, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"paddd %%xmm1, %%xmm0\n\t" \
"pxor %%xmm0, %%xmm3\n\t" \
"pshufb %[rotl8], %%xmm3\n\t" \
"paddd %%xmm3, %%xmm2\n\t" \
"pxor %%xmm2, %%xmm1\n\t" \
"movdqa %%xmm1, %%xmm4\n\t" \
"pslld $7, %%xmm1\n\t" \
"psrld $25, %%xmm4\n\t" \
"pxor %%xmm4, %%xmm1\n\t" \
"# Swap words back\n\t" \
"pshufd $0x93, %%xmm1, %%xmm1\n\t" \
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
"pshufd $0x39, %%xmm3, %%xmm3\n\t" \
#define CHACHA_CRYPT_AVX() \
"movdqu 0(%[input]), %%xmm0\n\t" \
"movdqu 16(%[input]), %%xmm1\n\t" \
"movdqu 32(%[input]), %%xmm2\n\t" \
"movdqu 48(%[input]), %%xmm3\n\t" \
"movb $10, %%al\n\t" \
"\n" \
"1:\n\t" \
QUARTERROUND_2_AVX() \
"decb %%al\n\t" \
"jnz 1b\n\t" \
"movdqu 0(%[input]), %%xmm4\n\t" \
"movdqu 16(%[input]), %%xmm5\n\t" \
"movdqu 32(%[input]), %%xmm6\n\t" \
"movdqu 48(%[input]), %%xmm7\n\t" \
"paddd %%xmm4, %%xmm0\n\t" \
"paddd %%xmm5, %%xmm1\n\t" \
"paddd %%xmm6, %%xmm2\n\t" \
"paddd %%xmm7, %%xmm3\n\t" \
#define CHACHA_PARTIAL_CHUNK_AVX() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX() \
"movdqu %%xmm0, 0(%[c])\n\t" \
"movdqu %%xmm1, 16(%[c])\n\t" \
"movdqu %%xmm2, 32(%[c])\n\t" \
"movdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
"movl %[bytes], %%r8d\n\t" \
"xorq %%rdx, %%rdx\n\t" \
"movl %%r8d, %%r9d\n\t" \
"andl $7, %%r9d\n\t" \
"jz 4f\n\t" \
"\n" \
"2:\n\t" \
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
"xorb (%[m],%%rdx,1), %%cl\n\t" \
"movb %%cl, (%[output],%%rdx,1)\n\t" \
"incl %%edx\n\t" \
"cmpl %%r9d, %%edx\n\t" \
"jne 2b\n\t" \
"je 3f\n\t" \
"\n" \
"4:\n\t" \
"movq (%[c],%%rdx,1), %%rcx\n\t" \
"xorq (%[m],%%rdx,1), %%rcx\n\t" \
"movq %%rcx, (%[output],%%rdx,1)\n\t" \
"addl $8, %%edx\n\t" \
"\n" \
"3:\n\t" \
"cmpl %%r8d, %%edx\n\t" \
"jne 4b\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (x), \
[output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "eax", "ecx", "edx", "r8", "r9", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
)
#define CHACHA_CHUNK_AVX() \
__asm__ __volatile__ ( \
CHACHA_CRYPT_AVX() \
"movdqu 0(%[m]), %%xmm4\n\t" \
"movdqu 16(%[m]), %%xmm5\n\t" \
"movdqu 32(%[m]), %%xmm6\n\t" \
"movdqu 48(%[m]), %%xmm7\n\t" \
"pxor %%xmm4, %%xmm0\n\t" \
"pxor %%xmm5, %%xmm1\n\t" \
"pxor %%xmm6, %%xmm2\n\t" \
"pxor %%xmm7, %%xmm3\n\t" \
"movdqu %%xmm0, 0(%[c])\n\t" \
"movdqu %%xmm1, 16(%[c])\n\t" \
"movdqu %%xmm2, 32(%[c])\n\t" \
"movdqu %%xmm3, 48(%[c])\n\t" \
"addl $1, 48(%[input])\n\t" \
: \
: [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
: "rax", "memory", \
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
)
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX1
static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
word32 bytes)
{
ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
byte* output;
word32 i;
word32 cnt = 0;
static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL };
static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL };
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
if (bytes == 0)
return;
@ -646,7 +1009,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
"add 48(%[key]), %[cnt]\n\t"
"movl %[cnt], 48(%[key])\n\t"
"\n"
"L_end128:"
"L_end128:\n\t"
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
@ -658,23 +1021,15 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
"xmm12", "xmm13", "xmm14", "xmm15", "memory"
);
output = (byte*)x;
for (; bytes > 0;) {
wc_Chacha_wordtobyte(x, ctx->X);
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
if (bytes <= CHACHA_CHUNK_BYTES) {
for (i = 0; i < bytes; ++i) {
c[i] = m[i] ^ output[i];
}
return;
}
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
c[i] = m[i] ^ output[i];
}
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_AVX();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_AVX();
}
}
#endif /* HAVE_INTEL_AVX1 */
@ -684,16 +1039,16 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
{
ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
byte* output;
word32 i;
word32 cnt = 0;
static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL,
0x0000000500000004UL,0x0000000700000006UL };
static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL,
0x0000000800000008UL,0x0000000800000008UL };
static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
static const __m256i rotl8_256 =
{ 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
static const __m256i rotl16_256 =
{ 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
if (bytes == 0)
@ -931,35 +1286,27 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
"add 48(%[key]), %[cnt]\n\t"
"movl %[cnt], 48(%[key])\n\t"
"\n"
"L_end256:"
"L_end256:\n\t"
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
[add] "m" (add), [eight] "m" (eight),
[rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
[rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256)
: "ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15", "memory"
);
output = (byte*)x;
for (; bytes > 0;) {
wc_Chacha_wordtobyte(x, ctx->X);
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
if (bytes <= CHACHA_CHUNK_BYTES) {
for (i = 0; i < bytes; ++i) {
c[i] = m[i] ^ output[i];
}
return;
}
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
c[i] = m[i] ^ output[i];
}
for (; bytes >= CHACHA_CHUNK_BYTES;) {
CHACHA_CHUNK_AVX();
bytes -= CHACHA_CHUNK_BYTES;
c += CHACHA_CHUNK_BYTES;
m += CHACHA_CHUNK_BYTES;
}
if (bytes > 0) {
CHACHA_PARTIAL_CHUNK_AVX();
}
}
#endif /* HAVE_INTEL_AVX2 */
#endif /* USE_INTEL_CHACHA_SPEEDUP */
@ -1004,16 +1351,25 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
return BAD_FUNC_ARG;
#ifdef USE_INTEL_CHACHA_SPEEDUP
if (!cpuidFlagsSet) {
cpuidFlags = cpuid_get_flags();
cpuidFlagsSet = 1;
}
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(cpuid_get_flags())) {
if (IS_INTEL_AVX2(cpuidFlags)) {
chacha_encrypt_avx2(ctx, input, output, msglen);
return 0;
}
#endif
if (IS_INTEL_AVX1(cpuid_get_flags())) {
if (IS_INTEL_AVX1(cpuidFlags)) {
chacha_encrypt_avx(ctx, input, output, msglen);
return 0;
}
else {
chacha_encrypt_x64(ctx, input, output, msglen);
return 0;
}
#endif
wc_Chacha_encrypt_bytes(ctx, input, output, msglen);

View File

@ -144,10 +144,10 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rdx, %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
@ -211,10 +211,10 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rdx, %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"