mirror of https://github.com/wolfSSL/wolfssl.git
Merge pull request #1271 from SparkiDev/chacha20_sb
Improve performance of small number of blocks for chacha20pull/1298/head
commit
f2375f3fee
|
@ -220,22 +220,22 @@ typedef struct bench_alg {
|
|||
static const bench_alg bench_cipher_opt[] = {
|
||||
{ "-cipher", -1 },
|
||||
#ifdef HAVE_AES_CBC
|
||||
{ "-aes_cbc", BENCH_AES_CBC },
|
||||
{ "-aes-cbc", BENCH_AES_CBC },
|
||||
#endif
|
||||
#ifdef HAVE_AESGCM
|
||||
{ "-aes_gcm", BENCH_AES_GCM },
|
||||
{ "-aes-gcm", BENCH_AES_GCM },
|
||||
#endif
|
||||
#ifdef WOLFSSL_AES_DIRECT
|
||||
{ "-aes_ecb", BENCH_AES_ECB },
|
||||
{ "-aes-ecb", BENCH_AES_ECB },
|
||||
#endif
|
||||
#ifdef WOLFSSL_AES_XTS
|
||||
{ "-aes_xts", BENCH_AES_XTS },
|
||||
{ "-aes-xts", BENCH_AES_XTS },
|
||||
#endif
|
||||
#ifdef WOLFSSL_AES_COUNTER
|
||||
{ "-aes_ctr", BENCH_AES_CTR },
|
||||
{ "-aes-ctr", BENCH_AES_CTR },
|
||||
#endif
|
||||
#ifdef HAVE_AESCCM
|
||||
{ "-aes_ccm", BENCH_AES_CCM },
|
||||
{ "-aes-ccm", BENCH_AES_CCM },
|
||||
#endif
|
||||
#ifdef HAVE_CAMELLIA
|
||||
{ "-camellia", BENCH_CAMELLIA },
|
||||
|
@ -253,7 +253,7 @@ static const bench_alg bench_cipher_opt[] = {
|
|||
{ "-chacha20", BENCH_CHACHA20 },
|
||||
#endif
|
||||
#if defined(HAVE_CHACHA) && defined(HAVE_POLY1305)
|
||||
{ "-chacha20_poly1305", BENCH_CHACHA20_POLY1305 },
|
||||
{ "-chacha20-poly1305", BENCH_CHACHA20_POLY1305 },
|
||||
#endif
|
||||
#ifndef NO_DES3
|
||||
{ "-des", BENCH_DES },
|
||||
|
@ -295,16 +295,16 @@ static const bench_alg bench_digest_opt[] = {
|
|||
#ifdef WOLFSSL_SHA3
|
||||
{ "-sha3", BENCH_SHA3 },
|
||||
#ifndef WOLFSSL_NOSHA3_224
|
||||
{ "-sha3_224", BENCH_SHA3_224 },
|
||||
{ "-sha3-224", BENCH_SHA3_224 },
|
||||
#endif
|
||||
#ifndef WOLFSSL_NOSHA3_256
|
||||
{ "-sha3_256", BENCH_SHA3_256 },
|
||||
{ "-sha3-256", BENCH_SHA3_256 },
|
||||
#endif
|
||||
#ifndef WOLFSSL_NOSHA3_384
|
||||
{ "-sha3_384", BENCH_SHA3_384 },
|
||||
{ "-sha3-384", BENCH_SHA3_384 },
|
||||
#endif
|
||||
#ifndef WOLFSSL_NOSHA3_512
|
||||
{ "-sha3_512", BENCH_SHA3_512 },
|
||||
{ "-sha3-512", BENCH_SHA3_512 },
|
||||
#endif
|
||||
#endif
|
||||
#ifdef WOLFSSL_RIPEMD
|
||||
|
@ -325,22 +325,22 @@ static const bench_alg bench_mac_opt[] = {
|
|||
#ifndef NO_HMAC
|
||||
{ "-hmac", BENCH_HMAC },
|
||||
#ifndef NO_MD5
|
||||
{ "-hmac_md5", BENCH_HMAC_MD5 },
|
||||
{ "-hmac-md5", BENCH_HMAC_MD5 },
|
||||
#endif
|
||||
#ifndef NO_SHA
|
||||
{ "-hmac_sha", BENCH_HMAC_SHA },
|
||||
{ "-hmac-sha", BENCH_HMAC_SHA },
|
||||
#endif
|
||||
#ifdef WOLFSSL_SHA224
|
||||
{ "-hmac_sha224", BENCH_HMAC_SHA224 },
|
||||
{ "-hmac-sha224", BENCH_HMAC_SHA224 },
|
||||
#endif
|
||||
#ifndef NO_SHA256
|
||||
{ "-hmac_sha256", BENCH_HMAC_SHA256 },
|
||||
{ "-hmac-sha256", BENCH_HMAC_SHA256 },
|
||||
#endif
|
||||
#ifdef WOLFSSL_SHA384
|
||||
{ "-hmac_sha384", BENCH_HMAC_SHA384 },
|
||||
{ "-hmac-sha384", BENCH_HMAC_SHA384 },
|
||||
#endif
|
||||
#ifdef WOLFSSL_SHA512
|
||||
{ "-hmac_sha512", BENCH_HMAC_SHA512 },
|
||||
{ "-hmac-sha512", BENCH_HMAC_SHA512 },
|
||||
#endif
|
||||
#endif
|
||||
{ NULL, 0}
|
||||
|
@ -351,7 +351,7 @@ static const bench_alg bench_asym_opt[] = {
|
|||
{ "-asym", -1 },
|
||||
#ifndef NO_RSA
|
||||
#ifdef WOLFSSL_KEY_GEN
|
||||
{ "-rsa_kg", BENCH_RSA_KEYGEN },
|
||||
{ "-rsa-kg", BENCH_RSA_KEYGEN },
|
||||
#endif
|
||||
{ "-rsa", BENCH_RSA },
|
||||
#endif
|
||||
|
@ -360,13 +360,13 @@ static const bench_alg bench_asym_opt[] = {
|
|||
#endif
|
||||
#ifdef HAVE_NTRU
|
||||
{ "-ntru", BENCH_NTRU },
|
||||
{ "-ntru_kg", BENCH_NTRU_KEYGEN },
|
||||
{ "-ntru-kg", BENCH_NTRU_KEYGEN },
|
||||
#endif
|
||||
#ifdef HAVE_ECC
|
||||
{ "-ecc_kg", BENCH_ECC_MAKEKEY },
|
||||
{ "-ecc-kg", BENCH_ECC_MAKEKEY },
|
||||
{ "-ecc", BENCH_ECC },
|
||||
#ifdef HAVE_ECC_ENCRYPT
|
||||
{ "-ecc_enc", BENCH_ECC_ENCRYPT },
|
||||
{ "-ecc-enc", BENCH_ECC_ENCRYPT },
|
||||
#endif
|
||||
#endif
|
||||
#ifdef HAVE_CURVE25519
|
||||
|
@ -376,7 +376,7 @@ static const bench_alg bench_asym_opt[] = {
|
|||
#endif
|
||||
#endif
|
||||
#ifdef HAVE_ED25519
|
||||
{ "-ed25519_kg", BENCH_ED25519_KEYGEN },
|
||||
{ "-ed25519-kg", BENCH_ED25519_KEYGEN },
|
||||
{ "-ed25519", BENCH_ED25519_SIGN },
|
||||
#endif
|
||||
{ NULL, 0}
|
||||
|
@ -654,6 +654,7 @@ static THREAD_LS_T int devId = INVALID_DEVID;
|
|||
#define AES_AUTH_ADD_SZ 13
|
||||
#define AES_AUTH_TAG_SZ 16
|
||||
#define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ
|
||||
static word32 aesAuthAddSz = AES_AUTH_ADD_SZ;
|
||||
#endif
|
||||
#ifndef BENCH_CIPHER_ADD
|
||||
#define BENCH_CIPHER_ADD 0
|
||||
|
@ -663,24 +664,25 @@ static THREAD_LS_T int devId = INVALID_DEVID;
|
|||
/* use kB instead of mB for embedded benchmarking */
|
||||
#ifdef BENCH_EMBEDDED
|
||||
enum BenchmarkBounds {
|
||||
numBlocks = 25, /* how many kB to test (en/de)cryption */
|
||||
scryptCnt = 1,
|
||||
ntimes = 2,
|
||||
genTimes = BENCH_MAX_PENDING,
|
||||
agreeTimes = 2
|
||||
};
|
||||
static int numBlocks = 25; /* how many kB to test (en/de)cryption */
|
||||
static word32 bench_size = (1024ul);
|
||||
#else
|
||||
enum BenchmarkBounds {
|
||||
numBlocks = 5, /* how many megs to test (en/de)cryption */
|
||||
scryptCnt = 10,
|
||||
ntimes = 100,
|
||||
genTimes = BENCH_MAX_PENDING, /* must be at least BENCH_MAX_PENDING */
|
||||
agreeTimes = 100
|
||||
};
|
||||
static int numBlocks = 5; /* how many megs to test (en/de)cryption */
|
||||
static word32 bench_size = (1024*1024ul);
|
||||
#endif
|
||||
static int base2 = 1;
|
||||
static int digest_stream = 1;
|
||||
|
||||
/* for compatibility */
|
||||
#define BENCH_SIZE bench_size
|
||||
|
@ -1741,7 +1743,7 @@ static void bench_aesgcm_internal(int doAsync, const byte* key, word32 keySz,
|
|||
ret = wc_AesGcmEncrypt(&enc[i], bench_cipher,
|
||||
bench_plain, BENCH_SIZE,
|
||||
iv, ivSz, bench_tag, AES_AUTH_TAG_SZ,
|
||||
bench_additional, AES_AUTH_ADD_SZ);
|
||||
bench_additional, aesAuthAddSz);
|
||||
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×, &pending)) {
|
||||
goto exit_aes_gcm;
|
||||
}
|
||||
|
@ -1765,7 +1767,7 @@ exit_aes_gcm:
|
|||
ret = wc_AesGcmDecrypt(&enc[i], bench_plain,
|
||||
bench_cipher, BENCH_SIZE,
|
||||
iv, ivSz, bench_tag, AES_AUTH_TAG_SZ,
|
||||
bench_additional, AES_AUTH_ADD_SZ);
|
||||
bench_additional, aesAuthAddSz);
|
||||
if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×, &pending)) {
|
||||
goto exit_aes_gcm_dec;
|
||||
}
|
||||
|
@ -1977,7 +1979,7 @@ static void bench_aesctr_internal(const byte* key, word32 keySz, const byte* iv,
|
|||
{
|
||||
Aes enc;
|
||||
double start;
|
||||
int i, count, ret;
|
||||
int i, count, ret = 0;
|
||||
|
||||
wc_AesSetKeyDirect(&enc, key, keySz, iv, AES_ENCRYPTION);
|
||||
|
||||
|
@ -2026,7 +2028,7 @@ void bench_aesccm(void)
|
|||
for (i = 0; i < numBlocks; i++) {
|
||||
wc_AesCcmEncrypt(&enc, bench_cipher, bench_plain, BENCH_SIZE,
|
||||
bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ,
|
||||
bench_additional, AES_AUTH_ADD_SZ);
|
||||
bench_additional, aesAuthAddSz);
|
||||
}
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
|
@ -2336,6 +2338,7 @@ void bench_md5(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitMd5_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2381,6 +2384,20 @@ void bench_md5(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitMd5_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Md5Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Md5Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_md5;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_md5:
|
||||
bench_stats_sym_finish("MD5", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2408,6 +2425,7 @@ void bench_sha(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2453,6 +2471,20 @@ void bench_sha(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_ShaUpdate(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_ShaFinal(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha:
|
||||
bench_stats_sym_finish("SHA", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2472,12 +2504,13 @@ void bench_sha224(int doAsync)
|
|||
{
|
||||
wc_Sha224 hash[BENCH_MAX_PENDING];
|
||||
double start;
|
||||
int ret, i, count = 0, times, pending = 0;
|
||||
int ret = 0, i, count = 0, times, pending = 0;
|
||||
DECLARE_ARRAY(digest, byte, BENCH_MAX_PENDING, WC_SHA224_DIGEST_SIZE, HEAP_HINT);
|
||||
|
||||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha224_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2519,6 +2552,20 @@ void bench_sha224(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha224_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha224Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha224Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha224;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha224:
|
||||
bench_stats_sym_finish("SHA-224", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2543,6 +2590,7 @@ void bench_sha256(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha256_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2587,6 +2635,20 @@ void bench_sha256(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha256_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha256Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha256Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha256;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha256:
|
||||
bench_stats_sym_finish("SHA-256", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2611,6 +2673,7 @@ void bench_sha384(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha384_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2652,6 +2715,20 @@ void bench_sha384(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha384_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha384Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha384Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha384;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha384:
|
||||
bench_stats_sym_finish("SHA-384", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2676,6 +2753,7 @@ void bench_sha512(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha512_ex(&hash[i], HEAP_HINT,
|
||||
|
@ -2717,6 +2795,20 @@ void bench_sha512(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha512_ex(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha512Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha512Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha512;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha512:
|
||||
bench_stats_sym_finish("SHA-512", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2743,6 +2835,7 @@ void bench_sha3_224(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha3_224(&hash[i], HEAP_HINT,
|
||||
|
@ -2784,6 +2877,20 @@ void bench_sha3_224(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha3_224(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha3_224_Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha3_224_Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha3_224;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha3_224:
|
||||
bench_stats_sym_finish("SHA3-224", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2808,6 +2915,7 @@ void bench_sha3_256(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha3_256(&hash[i], HEAP_HINT,
|
||||
|
@ -2849,6 +2957,20 @@ void bench_sha3_256(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha3_256(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha3_256_Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha3_256_Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha3_256;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha3_256:
|
||||
bench_stats_sym_finish("SHA3-256", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2873,6 +2995,7 @@ void bench_sha3_384(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha3_384(&hash[i], HEAP_HINT,
|
||||
|
@ -2914,6 +3037,20 @@ void bench_sha3_384(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha3_384(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha3_384_Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha3_384_Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha3_384;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha3_384:
|
||||
bench_stats_sym_finish("SHA3-384", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -2938,6 +3075,7 @@ void bench_sha3_512(int doAsync)
|
|||
/* clear for done cleanup */
|
||||
XMEMSET(hash, 0, sizeof(hash));
|
||||
|
||||
if (digest_stream) {
|
||||
/* init keys */
|
||||
for (i = 0; i < BENCH_MAX_PENDING; i++) {
|
||||
ret = wc_InitSha3_512(&hash[i], HEAP_HINT,
|
||||
|
@ -2979,6 +3117,20 @@ void bench_sha3_512(int doAsync)
|
|||
} /* for i */
|
||||
} while (pending > 0);
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (times = 0; times < numBlocks; times++) {
|
||||
ret = wc_InitSha3_512(hash, HEAP_HINT, INVALID_DEVID);
|
||||
ret |= wc_Sha3_512_Update(hash, bench_plain, BENCH_SIZE);
|
||||
ret |= wc_Sha3_512_Final(hash, digest[0]);
|
||||
if (ret != 0)
|
||||
goto exit_sha3_512;
|
||||
} /* for times */
|
||||
count += times;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
exit_sha3_512:
|
||||
bench_stats_sym_finish("SHA3-512", doAsync, count, bench_size, start, ret);
|
||||
|
||||
|
@ -3000,8 +3152,9 @@ int bench_ripemd(void)
|
|||
RipeMd hash;
|
||||
byte digest[RIPEMD_DIGEST_SIZE];
|
||||
double start;
|
||||
int i, count, ret;
|
||||
int i, count, ret = 0;
|
||||
|
||||
if (digest_stream) {
|
||||
ret = wc_InitRipeMd(&hash);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
|
@ -3022,6 +3175,27 @@ int bench_ripemd(void)
|
|||
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (i = 0; i < numBlocks; i++) {
|
||||
ret = wc_InitRipeMd(&hash);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
ret = wc_RipeMdUpdate(&hash, bench_plain, BENCH_SIZE);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
ret = wc_RipeMdFinal(&hash, digest);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
bench_stats_sym_finish("RIPEMD", 0, count, bench_size, start, ret);
|
||||
|
||||
return 0;
|
||||
|
@ -3037,6 +3211,7 @@ void bench_blake2(void)
|
|||
double start;
|
||||
int ret, i, count;
|
||||
|
||||
if (digest_stream) {
|
||||
ret = wc_InitBlake2b(&b2b, 64);
|
||||
if (ret != 0) {
|
||||
printf("InitBlake2b failed, ret = %d\n", ret);
|
||||
|
@ -3059,6 +3234,30 @@ void bench_blake2(void)
|
|||
}
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
else {
|
||||
bench_stats_start(&count, &start);
|
||||
do {
|
||||
for (i = 0; i < numBlocks; i++) {
|
||||
ret = wc_InitBlake2b(&b2b, 64);
|
||||
if (ret != 0) {
|
||||
printf("InitBlake2b failed, ret = %d\n", ret);
|
||||
return;
|
||||
}
|
||||
ret = wc_Blake2bUpdate(&b2b, bench_plain, BENCH_SIZE);
|
||||
if (ret != 0) {
|
||||
printf("Blake2bUpdate failed, ret = %d\n", ret);
|
||||
return;
|
||||
}
|
||||
ret = wc_Blake2bFinal(&b2b, digest, 64);
|
||||
if (ret != 0) {
|
||||
printf("Blake2bFinal failed, ret = %d\n", ret);
|
||||
return;
|
||||
}
|
||||
}
|
||||
count += i;
|
||||
} while (bench_stats_sym_check(start));
|
||||
}
|
||||
bench_stats_sym_finish("BLAKE2b", 0, count, bench_size, start, ret);
|
||||
}
|
||||
#endif
|
||||
|
@ -4505,6 +4704,7 @@ void benchmark_configure(int block_size)
|
|||
{
|
||||
/* must be greater than 0 */
|
||||
if (block_size > 0) {
|
||||
numBlocks = numBlocks * bench_size / block_size;
|
||||
bench_size = (word32)block_size;
|
||||
}
|
||||
}
|
||||
|
@ -4542,6 +4742,10 @@ static void Usage(void)
|
|||
printf("benchmark\n");
|
||||
printf("-? Help, print this usage\n");
|
||||
printf("-base10 Display bytes as power of 10 (eg 1 kB = 1000 Bytes)\n");
|
||||
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
|
||||
printf("-no_aad No additional authentication data passed.\n");
|
||||
#endif
|
||||
printf("-dgst_full Full digest operation performed.\n");
|
||||
#ifndef WOLFSSL_BENCHMARK_ALL
|
||||
printf("-<alg> Algorithm to benchmark. Available algorithms "
|
||||
"include:\n");
|
||||
|
@ -4597,6 +4801,12 @@ int main(int argc, char** argv)
|
|||
}
|
||||
else if (string_matches(argv[1], "-base10"))
|
||||
base2 = 0;
|
||||
#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
|
||||
else if (string_matches(argv[1], "-no_aad"))
|
||||
aesAuthAddSz = 0;
|
||||
#endif
|
||||
else if (string_matches(argv[1], "-dgst_full"))
|
||||
digest_stream = 0;
|
||||
else if (argv[1][0] == '-') {
|
||||
optMatched = 0;
|
||||
#ifndef WOLFSSL_BENCHMARK_ALL
|
||||
|
|
|
@ -74,6 +74,9 @@
|
|||
#ifndef NO_AVX2_SUPPORT
|
||||
#define HAVE_INTEL_AVX2
|
||||
#endif
|
||||
|
||||
static int cpuidFlagsSet = 0;
|
||||
static int cpuidFlags = 0;
|
||||
#endif
|
||||
|
||||
#ifdef BIG_ENDIAN_ORDER
|
||||
|
@ -413,19 +416,379 @@ static INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS],
|
|||
|
||||
#ifdef USE_INTEL_CHACHA_SPEEDUP
|
||||
|
||||
#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
|
||||
"addl "#r12", "#r11"\n\t" \
|
||||
"addl "#r22", "#r21"\n\t" \
|
||||
"xorl "#r11", "#r14"\n\t" \
|
||||
"xorl "#r21", "#r24"\n\t" \
|
||||
"roll $16, "#r14"\n\t" \
|
||||
"roll $16, "#r24"\n\t" \
|
||||
"addl "#r14", "#r13"\n\t" \
|
||||
"addl "#r24", "#r23"\n\t" \
|
||||
"xorl "#r13", "#r12"\n\t" \
|
||||
"xorl "#r23", "#r22"\n\t" \
|
||||
"roll $12, "#r12"\n\t" \
|
||||
"roll $12, "#r22"\n\t" \
|
||||
"addl "#r12", "#r11"\n\t" \
|
||||
"addl "#r22", "#r21"\n\t" \
|
||||
"xorl "#r11", "#r14"\n\t" \
|
||||
"xorl "#r21", "#r24"\n\t" \
|
||||
"roll $8, "#r14"\n\t" \
|
||||
"roll $8, "#r24"\n\t" \
|
||||
"addl "#r14", "#r13"\n\t" \
|
||||
"addl "#r24", "#r23"\n\t" \
|
||||
"xorl "#r13", "#r12"\n\t" \
|
||||
"xorl "#r23", "#r22"\n\t" \
|
||||
"roll $7, "#r12"\n\t" \
|
||||
"roll $7, "#r22"\n\t" \
|
||||
|
||||
#define CHACHA_CRYPT_X64() \
|
||||
"subq $40, %%rsp\n\t" \
|
||||
"movq 32(%[input]), %%rax\n\t" \
|
||||
"movq 40(%[input]), %%rdx\n\t" \
|
||||
"movq %%rax, 8(%%rsp)\n\t" \
|
||||
"movq %%rdx, 16(%%rsp)\n\t" \
|
||||
"movl 0(%[input]), %%eax\n\t" \
|
||||
"movl 4(%[input]), %%ebx\n\t" \
|
||||
"movl 8(%[input]), %%ecx\n\t" \
|
||||
"movl 12(%[input]), %%edx\n\t" \
|
||||
"movl 16(%[input]), %%r8d\n\t" \
|
||||
"movl 20(%[input]), %%r9d\n\t" \
|
||||
"movl 24(%[input]), %%r10d\n\t" \
|
||||
"movl 28(%[input]), %%r11d\n\t" \
|
||||
"movl 48(%[input]), %%r12d\n\t" \
|
||||
"movl 52(%[input]), %%r13d\n\t" \
|
||||
"movl 56(%[input]), %%r14d\n\t" \
|
||||
"movl 60(%[input]), %%r15d\n\t" \
|
||||
"movb $10, (%%rsp)\n\t" \
|
||||
"movq %%rsi, 32(%%rsp)\n\t" \
|
||||
"movq %%rdi, 24(%%rsp)\n\t" \
|
||||
"movl 8(%%rsp), %%esi\n\t" \
|
||||
"movl 12(%%rsp), %%edi\n\t" \
|
||||
"\n" \
|
||||
"1:\n\t" \
|
||||
QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \
|
||||
%%ebx, %%r9d, %%edi, %%r13d) \
|
||||
"movl %%esi, 8(%%rsp)\n\t" \
|
||||
"movl %%edi, 12(%%rsp)\n\t" \
|
||||
"movl 16(%%rsp), %%esi\n\t" \
|
||||
"movl 20(%%rsp), %%edi\n\t" \
|
||||
QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \
|
||||
%%edx, %%r11d, %%edi, %%r15d) \
|
||||
QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \
|
||||
%%ebx, %%r10d, %%edi, %%r12d) \
|
||||
"movl %%esi, 16(%%rsp)\n\t" \
|
||||
"movl %%edi, 20(%%rsp)\n\t" \
|
||||
"movl 8(%%rsp), %%esi\n\t" \
|
||||
"movl 12(%%rsp), %%edi\n\t" \
|
||||
QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \
|
||||
%%edx, %%r8d, %%edi, %%r14d) \
|
||||
"decb (%%rsp)\n\t" \
|
||||
"jnz 1b\n\t" \
|
||||
"movl %%esi, 8(%%rsp)\n\t" \
|
||||
"movl %%edi, 12(%%rsp)\n\t" \
|
||||
"movq 32(%%rsp), %%rsi\n\t" \
|
||||
"movq 24(%%rsp), %%rdi\n\t" \
|
||||
"addl 0(%[input]), %%eax\n\t" \
|
||||
"addl 4(%[input]), %%ebx\n\t" \
|
||||
"addl 8(%[input]), %%ecx\n\t" \
|
||||
"addl 12(%[input]), %%edx\n\t" \
|
||||
"addl 16(%[input]), %%r8d\n\t" \
|
||||
"addl 20(%[input]), %%r9d\n\t" \
|
||||
"addl 24(%[input]), %%r10d\n\t" \
|
||||
"addl 28(%[input]), %%r11d\n\t" \
|
||||
"addl 48(%[input]), %%r12d\n\t" \
|
||||
"addl 52(%[input]), %%r13d\n\t" \
|
||||
"addl 56(%[input]), %%r14d\n\t" \
|
||||
"addl 60(%[input]), %%r15d\n\t" \
|
||||
|
||||
#define CHACHA_PARTIAL_CHUNK_X64() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_X64() \
|
||||
"movl %%eax , 0(%[c])\n\t" \
|
||||
"movl %%ebx , 4(%[c])\n\t" \
|
||||
"movl %%ecx , 8(%[c])\n\t" \
|
||||
"movl %%edx , 12(%[c])\n\t" \
|
||||
"movl %%r8d , 16(%[c])\n\t" \
|
||||
"movl %%r9d , 20(%[c])\n\t" \
|
||||
"movl %%r10d, 24(%[c])\n\t" \
|
||||
"movl %%r11d, 28(%[c])\n\t" \
|
||||
"movl %%r12d, 48(%[c])\n\t" \
|
||||
"movl %%r13d, 52(%[c])\n\t" \
|
||||
"movl %%r14d, 56(%[c])\n\t" \
|
||||
"movl %%r15d, 60(%[c])\n\t" \
|
||||
"movl 8(%%rsp), %%eax\n\t" \
|
||||
"movl 12(%%rsp), %%ebx\n\t" \
|
||||
"movl 16(%%rsp), %%ecx\n\t" \
|
||||
"movl 20(%%rsp), %%edx\n\t" \
|
||||
"addl 32(%[input]), %%eax\n\t" \
|
||||
"addl 36(%[input]), %%ebx\n\t" \
|
||||
"addl 40(%[input]), %%ecx\n\t" \
|
||||
"addl 44(%[input]), %%edx\n\t" \
|
||||
"movl %%eax , 32(%[c])\n\t" \
|
||||
"movl %%ebx , 36(%[c])\n\t" \
|
||||
"movl %%ecx , 40(%[c])\n\t" \
|
||||
"movl %%edx , 44(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
"addq $40, %%rsp\n\t" \
|
||||
"movq %[output], %%rax\n\t" \
|
||||
"movq %[m], %%rbx\n\t" \
|
||||
"movl %[bytes], %%r8d\n\t" \
|
||||
"xorq %%rdx, %%rdx\n\t" \
|
||||
"movl %%r8d, %%r9d\n\t" \
|
||||
"andl $7, %%r9d\n\t" \
|
||||
"jz 4f\n\t" \
|
||||
"\n" \
|
||||
"2:\n\t" \
|
||||
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
|
||||
"xorb (%%rbx,%%rdx,1), %%cl\n\t" \
|
||||
"movb %%cl, (%%rax,%%rdx,1)\n\t" \
|
||||
"incl %%edx\n\t" \
|
||||
"cmpl %%r9d, %%edx\n\t" \
|
||||
"jne 2b\n\t" \
|
||||
"je 3f\n\t" \
|
||||
"\n" \
|
||||
"4:\n\t" \
|
||||
"movq (%[c],%%rdx,1), %%rcx\n\t" \
|
||||
"xorq (%%rbx,%%rdx,1), %%rcx\n\t" \
|
||||
"movq %%rcx, (%%rax,%%rdx,1)\n\t" \
|
||||
"addl $8, %%edx\n\t" \
|
||||
"\n" \
|
||||
"3:\n\t" \
|
||||
"cmpl %%r8d, %%edx\n\t" \
|
||||
"jne 4b\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (x), \
|
||||
[output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \
|
||||
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
|
||||
"r14", "r15", "memory" \
|
||||
)
|
||||
|
||||
|
||||
#define CHACHA_CHUNK_X64() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_X64() \
|
||||
"movq %%rsi, 32(%%rsp)\n\t" \
|
||||
"addq $40, %%rsp\n\t" \
|
||||
"movq %[m], %%rsi\n\t" \
|
||||
"subq $40, %%rsp\n\t" \
|
||||
"xorl 0(%%rsi), %%eax\n\t" \
|
||||
"xorl 4(%%rsi), %%ebx\n\t" \
|
||||
"xorl 8(%%rsi), %%ecx\n\t" \
|
||||
"xorl 12(%%rsi), %%edx\n\t" \
|
||||
"xorl 16(%%rsi), %%r8d\n\t" \
|
||||
"xorl 20(%%rsi), %%r9d\n\t" \
|
||||
"xorl 24(%%rsi), %%r10d\n\t" \
|
||||
"xorl 28(%%rsi), %%r11d\n\t" \
|
||||
"xorl 48(%%rsi), %%r12d\n\t" \
|
||||
"xorl 52(%%rsi), %%r13d\n\t" \
|
||||
"xorl 56(%%rsi), %%r14d\n\t" \
|
||||
"xorl 60(%%rsi), %%r15d\n\t" \
|
||||
"movq 32(%%rsp), %%rsi\n\t" \
|
||||
"movl %%eax , 0(%[c])\n\t" \
|
||||
"movl %%ebx , 4(%[c])\n\t" \
|
||||
"movl %%ecx , 8(%[c])\n\t" \
|
||||
"movl %%edx , 12(%[c])\n\t" \
|
||||
"movl %%r8d , 16(%[c])\n\t" \
|
||||
"movl %%r9d , 20(%[c])\n\t" \
|
||||
"movl %%r10d, 24(%[c])\n\t" \
|
||||
"movl %%r11d, 28(%[c])\n\t" \
|
||||
"movl %%r12d, 48(%[c])\n\t" \
|
||||
"movl %%r13d, 52(%[c])\n\t" \
|
||||
"movl %%r14d, 56(%[c])\n\t" \
|
||||
"movl %%r15d, 60(%[c])\n\t" \
|
||||
"addq $40, %%rsp\n\t" \
|
||||
"movq %[m], %%r8\n\t" \
|
||||
"subq $40, %%rsp\n\t" \
|
||||
"movl 8(%%rsp), %%eax\n\t" \
|
||||
"movl 12(%%rsp), %%ebx\n\t" \
|
||||
"movl 16(%%rsp), %%ecx\n\t" \
|
||||
"movl 20(%%rsp), %%edx\n\t" \
|
||||
"addl 32(%[input]), %%eax\n\t" \
|
||||
"addl 36(%[input]), %%ebx\n\t" \
|
||||
"addl 40(%[input]), %%ecx\n\t" \
|
||||
"addl 44(%[input]), %%edx\n\t" \
|
||||
"xorl 32(%%r8), %%eax\n\t" \
|
||||
"xorl 36(%%r8), %%ebx\n\t" \
|
||||
"xorl 40(%%r8), %%ecx\n\t" \
|
||||
"xorl 44(%%r8), %%edx\n\t" \
|
||||
"movl %%eax , 32(%[c])\n\t" \
|
||||
"movl %%ebx , 36(%[c])\n\t" \
|
||||
"movl %%ecx , 40(%[c])\n\t" \
|
||||
"movl %%edx , 44(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
"addq $40, %%rsp\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \
|
||||
: "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
|
||||
"r14", "r15", "memory" \
|
||||
)
|
||||
|
||||
|
||||
static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
|
||||
word32 bytes)
|
||||
{
|
||||
word32 x[CHACHA_CHUNK_WORDS];
|
||||
|
||||
if (bytes == 0)
|
||||
return;
|
||||
|
||||
for (; bytes >= CHACHA_CHUNK_BYTES;) {
|
||||
CHACHA_CHUNK_X64();
|
||||
bytes -= CHACHA_CHUNK_BYTES;
|
||||
c += CHACHA_CHUNK_BYTES;
|
||||
m += CHACHA_CHUNK_BYTES;
|
||||
}
|
||||
if (bytes > 0) {
|
||||
CHACHA_PARTIAL_CHUNK_X64();
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
|
||||
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
|
||||
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
|
||||
|
||||
#define QUARTERROUND_2_AVX() \
|
||||
"paddd %%xmm1, %%xmm0\n\t" \
|
||||
"pxor %%xmm0, %%xmm3\n\t" \
|
||||
"pshufb %[rotl16], %%xmm3\n\t" \
|
||||
"paddd %%xmm3, %%xmm2\n\t" \
|
||||
"pxor %%xmm2, %%xmm1\n\t" \
|
||||
"movdqa %%xmm1, %%xmm4\n\t" \
|
||||
"pslld $12, %%xmm1\n\t" \
|
||||
"psrld $20, %%xmm4\n\t" \
|
||||
"pxor %%xmm4, %%xmm1\n\t" \
|
||||
"paddd %%xmm1, %%xmm0\n\t" \
|
||||
"pxor %%xmm0, %%xmm3\n\t" \
|
||||
"pshufb %[rotl8], %%xmm3\n\t" \
|
||||
"paddd %%xmm3, %%xmm2\n\t" \
|
||||
"pxor %%xmm2, %%xmm1\n\t" \
|
||||
"movdqa %%xmm1, %%xmm4\n\t" \
|
||||
"pslld $7, %%xmm1\n\t" \
|
||||
"psrld $25, %%xmm4\n\t" \
|
||||
"pxor %%xmm4, %%xmm1\n\t" \
|
||||
"# Swap words for next round\n\t" \
|
||||
"pshufd $0x39, %%xmm1, %%xmm1\n\t" \
|
||||
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
|
||||
"pshufd $0x93, %%xmm3, %%xmm3\n\t" \
|
||||
"paddd %%xmm1, %%xmm0\n\t" \
|
||||
"pxor %%xmm0, %%xmm3\n\t" \
|
||||
"pshufb %[rotl16], %%xmm3\n\t" \
|
||||
"paddd %%xmm3, %%xmm2\n\t" \
|
||||
"pxor %%xmm2, %%xmm1\n\t" \
|
||||
"movdqa %%xmm1, %%xmm4\n\t" \
|
||||
"pslld $12, %%xmm1\n\t" \
|
||||
"psrld $20, %%xmm4\n\t" \
|
||||
"pxor %%xmm4, %%xmm1\n\t" \
|
||||
"paddd %%xmm1, %%xmm0\n\t" \
|
||||
"pxor %%xmm0, %%xmm3\n\t" \
|
||||
"pshufb %[rotl8], %%xmm3\n\t" \
|
||||
"paddd %%xmm3, %%xmm2\n\t" \
|
||||
"pxor %%xmm2, %%xmm1\n\t" \
|
||||
"movdqa %%xmm1, %%xmm4\n\t" \
|
||||
"pslld $7, %%xmm1\n\t" \
|
||||
"psrld $25, %%xmm4\n\t" \
|
||||
"pxor %%xmm4, %%xmm1\n\t" \
|
||||
"# Swap words back\n\t" \
|
||||
"pshufd $0x93, %%xmm1, %%xmm1\n\t" \
|
||||
"pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
|
||||
"pshufd $0x39, %%xmm3, %%xmm3\n\t" \
|
||||
|
||||
#define CHACHA_CRYPT_AVX() \
|
||||
"movdqu 0(%[input]), %%xmm0\n\t" \
|
||||
"movdqu 16(%[input]), %%xmm1\n\t" \
|
||||
"movdqu 32(%[input]), %%xmm2\n\t" \
|
||||
"movdqu 48(%[input]), %%xmm3\n\t" \
|
||||
"movb $10, %%al\n\t" \
|
||||
"\n" \
|
||||
"1:\n\t" \
|
||||
QUARTERROUND_2_AVX() \
|
||||
"decb %%al\n\t" \
|
||||
"jnz 1b\n\t" \
|
||||
"movdqu 0(%[input]), %%xmm4\n\t" \
|
||||
"movdqu 16(%[input]), %%xmm5\n\t" \
|
||||
"movdqu 32(%[input]), %%xmm6\n\t" \
|
||||
"movdqu 48(%[input]), %%xmm7\n\t" \
|
||||
"paddd %%xmm4, %%xmm0\n\t" \
|
||||
"paddd %%xmm5, %%xmm1\n\t" \
|
||||
"paddd %%xmm6, %%xmm2\n\t" \
|
||||
"paddd %%xmm7, %%xmm3\n\t" \
|
||||
|
||||
#define CHACHA_PARTIAL_CHUNK_AVX() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_AVX() \
|
||||
"movdqu %%xmm0, 0(%[c])\n\t" \
|
||||
"movdqu %%xmm1, 16(%[c])\n\t" \
|
||||
"movdqu %%xmm2, 32(%[c])\n\t" \
|
||||
"movdqu %%xmm3, 48(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
"movl %[bytes], %%r8d\n\t" \
|
||||
"xorq %%rdx, %%rdx\n\t" \
|
||||
"movl %%r8d, %%r9d\n\t" \
|
||||
"andl $7, %%r9d\n\t" \
|
||||
"jz 4f\n\t" \
|
||||
"\n" \
|
||||
"2:\n\t" \
|
||||
"movzbl (%[c],%%rdx,1), %%ecx\n\t" \
|
||||
"xorb (%[m],%%rdx,1), %%cl\n\t" \
|
||||
"movb %%cl, (%[output],%%rdx,1)\n\t" \
|
||||
"incl %%edx\n\t" \
|
||||
"cmpl %%r9d, %%edx\n\t" \
|
||||
"jne 2b\n\t" \
|
||||
"je 3f\n\t" \
|
||||
"\n" \
|
||||
"4:\n\t" \
|
||||
"movq (%[c],%%rdx,1), %%rcx\n\t" \
|
||||
"xorq (%[m],%%rdx,1), %%rcx\n\t" \
|
||||
"movq %%rcx, (%[output],%%rdx,1)\n\t" \
|
||||
"addl $8, %%edx\n\t" \
|
||||
"\n" \
|
||||
"3:\n\t" \
|
||||
"cmpl %%r8d, %%edx\n\t" \
|
||||
"jne 4b\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (x), \
|
||||
[output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
|
||||
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
|
||||
: "eax", "ecx", "edx", "r8", "r9", "memory", \
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
|
||||
)
|
||||
|
||||
|
||||
#define CHACHA_CHUNK_AVX() \
|
||||
__asm__ __volatile__ ( \
|
||||
CHACHA_CRYPT_AVX() \
|
||||
"movdqu 0(%[m]), %%xmm4\n\t" \
|
||||
"movdqu 16(%[m]), %%xmm5\n\t" \
|
||||
"movdqu 32(%[m]), %%xmm6\n\t" \
|
||||
"movdqu 48(%[m]), %%xmm7\n\t" \
|
||||
"pxor %%xmm4, %%xmm0\n\t" \
|
||||
"pxor %%xmm5, %%xmm1\n\t" \
|
||||
"pxor %%xmm6, %%xmm2\n\t" \
|
||||
"pxor %%xmm7, %%xmm3\n\t" \
|
||||
"movdqu %%xmm0, 0(%[c])\n\t" \
|
||||
"movdqu %%xmm1, 16(%[c])\n\t" \
|
||||
"movdqu %%xmm2, 32(%[c])\n\t" \
|
||||
"movdqu %%xmm3, 48(%[c])\n\t" \
|
||||
"addl $1, 48(%[input])\n\t" \
|
||||
: \
|
||||
: [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
|
||||
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
|
||||
: "rax", "memory", \
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
|
||||
)
|
||||
|
||||
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
|
||||
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
|
||||
word32 bytes)
|
||||
{
|
||||
ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
byte* output;
|
||||
word32 i;
|
||||
word32 cnt = 0;
|
||||
static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL };
|
||||
static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL };
|
||||
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
|
||||
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
|
||||
|
||||
if (bytes == 0)
|
||||
return;
|
||||
|
@ -646,7 +1009,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
|
|||
"add 48(%[key]), %[cnt]\n\t"
|
||||
"movl %[cnt], 48(%[key])\n\t"
|
||||
"\n"
|
||||
"L_end128:"
|
||||
"L_end128:\n\t"
|
||||
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
|
||||
[in] "+r" (m), [out] "+r" (c)
|
||||
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
|
||||
|
@ -658,23 +1021,15 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
|
|||
"xmm12", "xmm13", "xmm14", "xmm15", "memory"
|
||||
);
|
||||
|
||||
output = (byte*)x;
|
||||
for (; bytes > 0;) {
|
||||
wc_Chacha_wordtobyte(x, ctx->X);
|
||||
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
|
||||
if (bytes <= CHACHA_CHUNK_BYTES) {
|
||||
for (i = 0; i < bytes; ++i) {
|
||||
c[i] = m[i] ^ output[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
|
||||
c[i] = m[i] ^ output[i];
|
||||
}
|
||||
for (; bytes >= CHACHA_CHUNK_BYTES;) {
|
||||
CHACHA_CHUNK_AVX();
|
||||
bytes -= CHACHA_CHUNK_BYTES;
|
||||
c += CHACHA_CHUNK_BYTES;
|
||||
m += CHACHA_CHUNK_BYTES;
|
||||
}
|
||||
if (bytes > 0) {
|
||||
CHACHA_PARTIAL_CHUNK_AVX();
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_INTEL_AVX1 */
|
||||
|
||||
|
@ -684,16 +1039,16 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
|
|||
{
|
||||
ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
|
||||
byte* output;
|
||||
word32 i;
|
||||
word32 cnt = 0;
|
||||
static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL,
|
||||
0x0000000500000004UL,0x0000000700000006UL };
|
||||
static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL,
|
||||
0x0000000800000008UL,0x0000000800000008UL };
|
||||
static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
|
||||
static const __m256i rotl8_256 =
|
||||
{ 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
|
||||
0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
|
||||
static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
|
||||
static const __m256i rotl16_256 =
|
||||
{ 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
|
||||
0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
|
||||
|
||||
if (bytes == 0)
|
||||
|
@ -931,35 +1286,27 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
|
|||
"add 48(%[key]), %[cnt]\n\t"
|
||||
"movl %[cnt], 48(%[key])\n\t"
|
||||
"\n"
|
||||
"L_end256:"
|
||||
"L_end256:\n\t"
|
||||
: [bytes] "+r" (bytes), [cnt] "+r" (cnt),
|
||||
[in] "+r" (m), [out] "+r" (c)
|
||||
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
|
||||
[add] "m" (add), [eight] "m" (eight),
|
||||
[rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
|
||||
[rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256)
|
||||
: "ymm0", "ymm1", "ymm2", "ymm3",
|
||||
"ymm4", "ymm5", "ymm6", "ymm7",
|
||||
"ymm8", "ymm9", "ymm10", "ymm11",
|
||||
"ymm12", "ymm13", "ymm14", "ymm15", "memory"
|
||||
);
|
||||
|
||||
output = (byte*)x;
|
||||
for (; bytes > 0;) {
|
||||
wc_Chacha_wordtobyte(x, ctx->X);
|
||||
ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
|
||||
if (bytes <= CHACHA_CHUNK_BYTES) {
|
||||
for (i = 0; i < bytes; ++i) {
|
||||
c[i] = m[i] ^ output[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
|
||||
c[i] = m[i] ^ output[i];
|
||||
}
|
||||
for (; bytes >= CHACHA_CHUNK_BYTES;) {
|
||||
CHACHA_CHUNK_AVX();
|
||||
bytes -= CHACHA_CHUNK_BYTES;
|
||||
c += CHACHA_CHUNK_BYTES;
|
||||
m += CHACHA_CHUNK_BYTES;
|
||||
}
|
||||
if (bytes > 0) {
|
||||
CHACHA_PARTIAL_CHUNK_AVX();
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
#endif /* USE_INTEL_CHACHA_SPEEDUP */
|
||||
|
@ -1004,16 +1351,25 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
|
|||
return BAD_FUNC_ARG;
|
||||
|
||||
#ifdef USE_INTEL_CHACHA_SPEEDUP
|
||||
if (!cpuidFlagsSet) {
|
||||
cpuidFlags = cpuid_get_flags();
|
||||
cpuidFlagsSet = 1;
|
||||
}
|
||||
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
if (IS_INTEL_AVX2(cpuid_get_flags())) {
|
||||
if (IS_INTEL_AVX2(cpuidFlags)) {
|
||||
chacha_encrypt_avx2(ctx, input, output, msglen);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
if (IS_INTEL_AVX1(cpuid_get_flags())) {
|
||||
if (IS_INTEL_AVX1(cpuidFlags)) {
|
||||
chacha_encrypt_avx(ctx, input, output, msglen);
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
chacha_encrypt_x64(ctx, input, output, msglen);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
|
||||
|
||||
|
|
|
@ -144,10 +144,10 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
|
|||
"addq %%rax, %%r12\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"adcq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
|
||||
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"movq %%rax, %%r11\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
|
@ -211,10 +211,10 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
|
|||
"addq %%rax, %%r12\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"adcq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[0] => rdx, rax +=> t1, t0\n\t"
|
||||
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"movq %%rax, %%r11\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
|
|
Loading…
Reference in New Issue