From 31aa5e308df325cc2f396f0cc2f9d9c70bce0fe4 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 15 Dec 2017 13:33:25 +1000 Subject: [PATCH] Improve performance of small number of blocks for chacha20 --- wolfcrypt/benchmark/benchmark.c | 1100 ++++++++++++++++++------------- wolfcrypt/src/chacha.c | 438 ++++++++++-- wolfcrypt/src/poly1305.c | 8 +- 3 files changed, 1056 insertions(+), 490 deletions(-) diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index eb543a6ac..c7e5033a5 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -219,22 +219,22 @@ typedef struct bench_alg { static bench_alg bench_cipher_opt[] = { { "-cipher", -1 }, #ifdef HAVE_AES_CBC - { "-aes_cbc", BENCH_AES_CBC }, + { "-aes-cbc", BENCH_AES_CBC }, #endif #ifdef HAVE_AESGCM - { "-aes_gcm", BENCH_AES_GCM }, + { "-aes-gcm", BENCH_AES_GCM }, #endif #ifdef WOLFSSL_AES_DIRECT - { "-aes_ecb", BENCH_AES_ECB }, + { "-aes-ecb", BENCH_AES_ECB }, #endif #ifdef WOLFSSL_AES_XTS - { "-aes_xts", BENCH_AES_XTS }, + { "-aes-xts", BENCH_AES_XTS }, #endif #ifdef WOLFSSL_AES_COUNTER - { "-aes_ctr", BENCH_AES_CTR }, + { "-aes-ctr", BENCH_AES_CTR }, #endif #ifdef HAVE_AESCCM - { "-aes_ccm", BENCH_AES_CCM }, + { "-aes-ccm", BENCH_AES_CCM }, #endif #ifdef HAVE_CAMELLIA { "-camellia", BENCH_CAMELLIA }, @@ -252,7 +252,7 @@ static bench_alg bench_cipher_opt[] = { { "-chacha20", BENCH_CHACHA20 }, #endif #if defined(HAVE_CHACHA) && defined(HAVE_POLY1305) - { "-chacha20_poly1305", BENCH_CHACHA20_POLY1305 }, + { "-chacha20-poly1305", BENCH_CHACHA20_POLY1305 }, #endif #ifndef NO_DES3 { "-des", BENCH_DES }, @@ -294,16 +294,16 @@ static bench_alg bench_digest_opt[] = { #ifdef WOLFSSL_SHA3 { "-sha3", BENCH_SHA3 }, #ifndef WOLFSSL_NOSHA3_224 - { "-sha3_224", BENCH_SHA3_224 }, + { "-sha3-224", BENCH_SHA3_224 }, #endif #ifndef WOLFSSL_NOSHA3_256 - { "-sha3_256", BENCH_SHA3_256 }, + { "-sha3-256", BENCH_SHA3_256 }, #endif #ifndef WOLFSSL_NOSHA3_384 - { "-sha3_384", BENCH_SHA3_384 }, + { "-sha3-384", BENCH_SHA3_384 }, #endif #ifndef WOLFSSL_NOSHA3_512 - { "-sha3_512", BENCH_SHA3_512 }, + { "-sha3-512", BENCH_SHA3_512 }, #endif #endif #ifdef WOLFSSL_RIPEMD @@ -324,22 +324,22 @@ static bench_alg bench_mac_opt[] = { #ifndef NO_HMAC { "-hmac", BENCH_HMAC }, #ifndef NO_MD5 - { "-hmac_md5", BENCH_HMAC_MD5 }, + { "-hmac-md5", BENCH_HMAC_MD5 }, #endif #ifndef NO_SHA - { "-hmac_sha", BENCH_HMAC_SHA }, + { "-hmac-sha", BENCH_HMAC_SHA }, #endif #ifdef WOLFSSL_SHA224 - { "-hmac_sha224", BENCH_HMAC_SHA224 }, + { "-hmac-sha224", BENCH_HMAC_SHA224 }, #endif #ifndef NO_SHA256 - { "-hmac_sha256", BENCH_HMAC_SHA256 }, + { "-hmac-sha256", BENCH_HMAC_SHA256 }, #endif #ifdef WOLFSSL_SHA384 - { "-hmac_sha384", BENCH_HMAC_SHA384 }, + { "-hmac-sha384", BENCH_HMAC_SHA384 }, #endif #ifdef WOLFSSL_SHA512 - { "-hmac_sha512", BENCH_HMAC_SHA512 }, + { "-hmac-sha512", BENCH_HMAC_SHA512 }, #endif #endif { NULL, 0} @@ -350,7 +350,7 @@ static bench_alg bench_asym_opt[] = { { "-asym", -1 }, #ifndef NO_RSA #ifdef WOLFSSL_KEY_GEN - { "-rsa_kg", BENCH_RSA_KEYGEN }, + { "-rsa-kg", BENCH_RSA_KEYGEN }, #endif { "-rsa", BENCH_RSA }, #endif @@ -359,13 +359,13 @@ static bench_alg bench_asym_opt[] = { #endif #ifdef HAVE_NTRU { "-ntru", BENCH_NTRU }, - { "-ntru_kg", BENCH_NTRU_KEYGEN }, + { "-ntru-kg", BENCH_NTRU_KEYGEN }, #endif #ifdef HAVE_ECC - { "-ecc_kg", BENCH_ECC_MAKEKEY }, + { "-ecc-kg", BENCH_ECC_MAKEKEY }, { "-ecc", BENCH_ECC }, #ifdef HAVE_ECC_ENCRYPT - { "-ecc_enc", BENCH_ECC_ENCRYPT }, + { "-ecc-enc", BENCH_ECC_ENCRYPT }, #endif #endif #ifdef HAVE_CURVE25519 @@ -375,7 +375,7 @@ static bench_alg bench_asym_opt[] = { #endif #endif #ifdef HAVE_ED25519 - { "-ed25519_kg", BENCH_ED25519_KEYGEN }, + { "-ed25519-kg", BENCH_ED25519_KEYGEN }, { "-ed25519", BENCH_ED25519_SIGN }, #endif { NULL, 0} @@ -653,6 +653,7 @@ static THREAD_LS_T int devId = INVALID_DEVID; #define AES_AUTH_ADD_SZ 13 #define AES_AUTH_TAG_SZ 16 #define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ + static word32 aesAuthAddSz = AES_AUTH_ADD_SZ; #endif #ifndef BENCH_CIPHER_ADD #define BENCH_CIPHER_ADD 0 @@ -662,24 +663,25 @@ static THREAD_LS_T int devId = INVALID_DEVID; /* use kB instead of mB for embedded benchmarking */ #ifdef BENCH_EMBEDDED enum BenchmarkBounds { - numBlocks = 25, /* how many kB to test (en/de)cryption */ scryptCnt = 1, ntimes = 2, genTimes = BENCH_MAX_PENDING, agreeTimes = 2 }; + static int numBlocks = 25; /* how many kB to test (en/de)cryption */ static word32 bench_size = (1024ul); #else enum BenchmarkBounds { - numBlocks = 5, /* how many megs to test (en/de)cryption */ scryptCnt = 10, ntimes = 100, genTimes = BENCH_MAX_PENDING, /* must be at least BENCH_MAX_PENDING */ agreeTimes = 100 }; + static int numBlocks = 5; /* how many megs to test (en/de)cryption */ static word32 bench_size = (1024*1024ul); #endif static int base2 = 1; +static int digest_stream = 1; /* for compatibility */ #define BENCH_SIZE bench_size @@ -1740,7 +1742,7 @@ static void bench_aesgcm_internal(int doAsync, const byte* key, word32 keySz, ret = wc_AesGcmEncrypt(&enc[i], bench_cipher, bench_plain, BENCH_SIZE, iv, ivSz, bench_tag, AES_AUTH_TAG_SZ, - bench_additional, AES_AUTH_ADD_SZ); + bench_additional, aesAuthAddSz); if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×, &pending)) { goto exit_aes_gcm; } @@ -1764,7 +1766,7 @@ exit_aes_gcm: ret = wc_AesGcmDecrypt(&enc[i], bench_plain, bench_cipher, BENCH_SIZE, iv, ivSz, bench_tag, AES_AUTH_TAG_SZ, - bench_additional, AES_AUTH_ADD_SZ); + bench_additional, aesAuthAddSz); if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×, &pending)) { goto exit_aes_gcm_dec; } @@ -1976,7 +1978,7 @@ static void bench_aesctr_internal(const byte* key, word32 keySz, const byte* iv, { Aes enc; double start; - int i, count, ret; + int i, count, ret = 0; wc_AesSetKeyDirect(&enc, key, keySz, iv, AES_ENCRYPTION); @@ -2022,7 +2024,7 @@ void bench_aesccm(void) for (i = 0; i < numBlocks; i++) { wc_AesCcmEncrypt(&enc, bench_cipher, bench_plain, BENCH_SIZE, bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ, - bench_additional, AES_AUTH_ADD_SZ); + bench_additional, aesAuthAddSz); } count += i; } while (bench_stats_sym_check(start)); @@ -2332,51 +2334,66 @@ void bench_md5(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitMd5_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitMd5_ex failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitMd5_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitMd5_ex failed, ret = %d\n", ret); + goto exit; + } + #ifdef WOLFSSL_PIC32MZ_HASH + wc_Md5SizeSet(&hash[i], numBlocks * BENCH_SIZE); + #endif } - #ifdef WOLFSSL_PIC32MZ_HASH - wc_Md5SizeSet(&hash[i], numBlocks * BENCH_SIZE); - #endif - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Md5Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_md5; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Md5Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_md5; + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Md5Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_md5; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Md5Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_md5; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitMd5_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Md5Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Md5Final(hash, digest[0]); + if (ret != 0) + goto exit_md5; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_md5: bench_stats_sym_finish("MD5", doAsync, count, bench_size, start, ret); @@ -2404,51 +2421,66 @@ void bench_sha(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha failed, ret = %d\n", ret); + goto exit; + } + #ifdef WOLFSSL_PIC32MZ_HASH + wc_ShaSizeSet(&hash[i], numBlocks * BENCH_SIZE); + #endif } - #ifdef WOLFSSL_PIC32MZ_HASH - wc_ShaSizeSet(&hash[i], numBlocks * BENCH_SIZE); - #endif - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_ShaUpdate(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_ShaFinal(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha; + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_ShaUpdate(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_ShaFinal(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_ShaUpdate(hash, bench_plain, BENCH_SIZE); + ret |= wc_ShaFinal(hash, digest[0]); + if (ret != 0) + goto exit_sha; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha: bench_stats_sym_finish("SHA", doAsync, count, bench_size, start, ret); @@ -2468,53 +2500,68 @@ void bench_sha224(int doAsync) { wc_Sha224 hash[BENCH_MAX_PENDING]; double start; - int ret, i, count = 0, times, pending = 0; + int ret = 0, i, count = 0, times, pending = 0; DECLARE_ARRAY(digest, byte, BENCH_MAX_PENDING, WC_SHA224_DIGEST_SIZE, HEAP_HINT); /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha224_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha224_ex failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha224_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha224_ex failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha224Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha224; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha224Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha224; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha224Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha224; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha224Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha224; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha224_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha224Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha224Final(hash, digest[0]); + if (ret != 0) + goto exit_sha224; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha224: bench_stats_sym_finish("SHA-224", doAsync, count, bench_size, start, ret); @@ -2539,50 +2586,65 @@ void bench_sha256(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha256_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha256_ex failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha256_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha256_ex failed, ret = %d\n", ret); + goto exit; + } + #ifdef WOLFSSL_PIC32MZ_HASH + wc_Sha256SizeSet(&hash[i], numBlocks * BENCH_SIZE); + #endif } - #ifdef WOLFSSL_PIC32MZ_HASH - wc_Sha256SizeSet(&hash[i], numBlocks * BENCH_SIZE); - #endif - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha256Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha256; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha256Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha256; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha256Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha256; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha256Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha256; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha256_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha256Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha256Final(hash, digest[0]); + if (ret != 0) + goto exit_sha256; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha256: bench_stats_sym_finish("SHA-256", doAsync, count, bench_size, start, ret); @@ -2607,47 +2669,62 @@ void bench_sha384(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha384_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha384_ex failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha384_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha384_ex failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha384Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha384; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha384Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha384; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha384Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha384; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha384Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha384; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha384_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha384Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha384Final(hash, digest[0]); + if (ret != 0) + goto exit_sha384; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha384: bench_stats_sym_finish("SHA-384", doAsync, count, bench_size, start, ret); @@ -2672,47 +2749,62 @@ void bench_sha512(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha512_ex(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha512_ex failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha512_ex(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha512_ex failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha512Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha512; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha512Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha512; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha512Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha512; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha512Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha512; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha512_ex(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha512Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha512Final(hash, digest[0]); + if (ret != 0) + goto exit_sha512; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha512: bench_stats_sym_finish("SHA-512", doAsync, count, bench_size, start, ret); @@ -2739,47 +2831,62 @@ void bench_sha3_224(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha3_224(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha3_224 failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha3_224(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha3_224 failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_224_Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_224; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_224_Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_224; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_224_Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_224; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_224_Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_224; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha3_224(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha3_224_Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha3_224_Final(hash, digest[0]); + if (ret != 0) + goto exit_sha3_224; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha3_224: bench_stats_sym_finish("SHA3-224", doAsync, count, bench_size, start, ret); @@ -2804,47 +2911,62 @@ void bench_sha3_256(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha3_256(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha3_256 failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha3_256(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha3_256 failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_256_Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_256; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_256_Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_256; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_256_Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_256; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_256_Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_256; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha3_256(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha3_256_Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha3_256_Final(hash, digest[0]); + if (ret != 0) + goto exit_sha3_256; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha3_256: bench_stats_sym_finish("SHA3-256", doAsync, count, bench_size, start, ret); @@ -2869,47 +2991,62 @@ void bench_sha3_384(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha3_384(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha3_384 failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha3_384(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha3_384 failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_384_Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_384; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_384_Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_384; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_384_Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_384; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_384_Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_384; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha3_384(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha3_384_Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha3_384_Final(hash, digest[0]); + if (ret != 0) + goto exit_sha3_384; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha3_384: bench_stats_sym_finish("SHA3-384", doAsync, count, bench_size, start, ret); @@ -2934,47 +3071,62 @@ void bench_sha3_512(int doAsync) /* clear for done cleanup */ XMEMSET(hash, 0, sizeof(hash)); - /* init keys */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - ret = wc_InitSha3_512(&hash[i], HEAP_HINT, - doAsync ? devId : INVALID_DEVID); - if (ret != 0) { - printf("InitSha3_512 failed, ret = %d\n", ret); - goto exit; + if (digest_stream) { + /* init keys */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + ret = wc_InitSha3_512(&hash[i], HEAP_HINT, + doAsync ? devId : INVALID_DEVID); + if (ret != 0) { + printf("InitSha3_512 failed, ret = %d\n", ret); + goto exit; + } } - } - bench_stats_start(&count, &start); - do { - for (times = 0; times < numBlocks || pending > 0; ) { - bench_async_poll(&pending); - - /* while free pending slots in queue, submit ops */ - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_512_Update(&hash[i], bench_plain, - BENCH_SIZE); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_512; - } - } - } /* for i */ - } /* for times */ - count += times; - - times = 0; + bench_stats_start(&count, &start); do { - bench_async_poll(&pending); - for (i = 0; i < BENCH_MAX_PENDING; i++) { - if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { - ret = wc_Sha3_512_Final(&hash[i], digest[i]); - if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { - goto exit_sha3_512; + for (times = 0; times < numBlocks || pending > 0; ) { + bench_async_poll(&pending); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_512_Update(&hash[i], bench_plain, + BENCH_SIZE); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_512; + } } - } - } /* for i */ - } while (pending > 0); - } while (bench_stats_sym_check(start)); + } /* for i */ + } /* for times */ + count += times; + + times = 0; + do { + bench_async_poll(&pending); + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, numBlocks, &pending)) { + ret = wc_Sha3_512_Final(&hash[i], digest[i]); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&hash[i]), 0, ×, &pending)) { + goto exit_sha3_512; + } + } + } /* for i */ + } while (pending > 0); + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks; times++) { + ret = wc_InitSha3_512(hash, HEAP_HINT, INVALID_DEVID); + ret |= wc_Sha3_512_Update(hash, bench_plain, BENCH_SIZE); + ret |= wc_Sha3_512_Final(hash, digest[0]); + if (ret != 0) + goto exit_sha3_512; + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); + } exit_sha3_512: bench_stats_sym_finish("SHA3-512", doAsync, count, bench_size, start, ret); @@ -2996,28 +3148,50 @@ int bench_ripemd(void) RipeMd hash; byte digest[RIPEMD_DIGEST_SIZE]; double start; - int i, count, ret; + int i, count, ret = 0; - ret = wc_InitRipeMd(&hash); - if (ret != 0) { - return ret; - } - - bench_stats_start(&count, &start); - do { - for (i = 0; i < numBlocks; i++) { - ret = wc_RipeMdUpdate(&hash, bench_plain, BENCH_SIZE); - if (ret != 0) { - return ret; - } - } - ret = wc_RipeMdFinal(&hash, digest); + if (digest_stream) { + ret = wc_InitRipeMd(&hash); if (ret != 0) { return ret; } - count += i; - } while (bench_stats_sym_check(start)); + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_RipeMdUpdate(&hash, bench_plain, BENCH_SIZE); + if (ret != 0) { + return ret; + } + } + ret = wc_RipeMdFinal(&hash, digest); + if (ret != 0) { + return ret; + } + + count += i; + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_InitRipeMd(&hash); + if (ret != 0) { + return ret; + } + ret = wc_RipeMdUpdate(&hash, bench_plain, BENCH_SIZE); + if (ret != 0) { + return ret; + } + ret = wc_RipeMdFinal(&hash, digest); + if (ret != 0) { + return ret; + } + } + count += i; + } while (bench_stats_sym_check(start)); + } bench_stats_sym_finish("RIPEMD", 0, count, bench_size, start, ret); return 0; @@ -3033,28 +3207,53 @@ void bench_blake2(void) double start; int ret, i, count; - ret = wc_InitBlake2b(&b2b, 64); - if (ret != 0) { - printf("InitBlake2b failed, ret = %d\n", ret); - return; - } - - bench_stats_start(&count, &start); - do { - for (i = 0; i < numBlocks; i++) { - ret = wc_Blake2bUpdate(&b2b, bench_plain, BENCH_SIZE); - if (ret != 0) { - printf("Blake2bUpdate failed, ret = %d\n", ret); - return; - } - } - ret = wc_Blake2bFinal(&b2b, digest, 64); + if (digest_stream) { + ret = wc_InitBlake2b(&b2b, 64); if (ret != 0) { - printf("Blake2bFinal failed, ret = %d\n", ret); + printf("InitBlake2b failed, ret = %d\n", ret); return; } - count += i; - } while (bench_stats_sym_check(start)); + + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_Blake2bUpdate(&b2b, bench_plain, BENCH_SIZE); + if (ret != 0) { + printf("Blake2bUpdate failed, ret = %d\n", ret); + return; + } + } + ret = wc_Blake2bFinal(&b2b, digest, 64); + if (ret != 0) { + printf("Blake2bFinal failed, ret = %d\n", ret); + return; + } + count += i; + } while (bench_stats_sym_check(start)); + } + else { + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_InitBlake2b(&b2b, 64); + if (ret != 0) { + printf("InitBlake2b failed, ret = %d\n", ret); + return; + } + ret = wc_Blake2bUpdate(&b2b, bench_plain, BENCH_SIZE); + if (ret != 0) { + printf("Blake2bUpdate failed, ret = %d\n", ret); + return; + } + ret = wc_Blake2bFinal(&b2b, digest, 64); + if (ret != 0) { + printf("Blake2bFinal failed, ret = %d\n", ret); + return; + } + } + count += i; + } while (bench_stats_sym_check(start)); + } bench_stats_sym_finish("BLAKE2b", 0, count, bench_size, start, ret); } #endif @@ -3137,7 +3336,7 @@ static void bench_hmac(int doAsync, int type, int digestSz, #else DECLARE_ARRAY(digest, byte, BENCH_MAX_PENDING, digestSz, HEAP_HINT); #endif - + /* clear for done cleanup */ XMEMSET(hmac, 0, sizeof(hmac)); @@ -4500,6 +4699,7 @@ void benchmark_configure(int block_size) { /* must be greater than 0 */ if (block_size > 0) { + numBlocks = numBlocks * bench_size / block_size; bench_size = (word32)block_size; } } @@ -4537,6 +4737,10 @@ static void Usage(void) printf("benchmark\n"); printf("-? Help, print this usage\n"); printf("-base10 Display bytes as power of 10 (eg 1 kB = 1000 Bytes)\n"); +#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) + printf("-no_aad No additional authentication data passed.\n"); +#endif + printf("-dgst_full Full digest operation performed.\n"); #ifndef WOLFSSL_BENCHMARK_ALL printf("- Algorithm to benchmark. Available algorithms " "include:\n"); @@ -4592,6 +4796,12 @@ int main(int argc, char** argv) } else if (string_matches(argv[1], "-base10")) base2 = 0; +#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) + else if (string_matches(argv[1], "-no_aad")) + aesAuthAddSz = 0; +#endif + else if (string_matches(argv[1], "-dgst_full")) + digest_stream = 0; else if (argv[1][0] == '-') { optMatched = 0; #ifndef WOLFSSL_BENCHMARK_ALL diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index 7ca4702ac..4ea696b00 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -74,6 +74,9 @@ #ifndef NO_AVX2_SUPPORT #define HAVE_INTEL_AVX2 #endif + + static int cpuidFlagsSet = 0; + static int cpuidFlags = 0; #endif #ifdef BIG_ENDIAN_ORDER @@ -413,19 +416,379 @@ static INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS], #ifdef USE_INTEL_CHACHA_SPEEDUP +#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \ + "addl "#r12", "#r11"\n\t" \ + "addl "#r22", "#r21"\n\t" \ + "xorl "#r11", "#r14"\n\t" \ + "xorl "#r21", "#r24"\n\t" \ + "roll $16, "#r14"\n\t" \ + "roll $16, "#r24"\n\t" \ + "addl "#r14", "#r13"\n\t" \ + "addl "#r24", "#r23"\n\t" \ + "xorl "#r13", "#r12"\n\t" \ + "xorl "#r23", "#r22"\n\t" \ + "roll $12, "#r12"\n\t" \ + "roll $12, "#r22"\n\t" \ + "addl "#r12", "#r11"\n\t" \ + "addl "#r22", "#r21"\n\t" \ + "xorl "#r11", "#r14"\n\t" \ + "xorl "#r21", "#r24"\n\t" \ + "roll $8, "#r14"\n\t" \ + "roll $8, "#r24"\n\t" \ + "addl "#r14", "#r13"\n\t" \ + "addl "#r24", "#r23"\n\t" \ + "xorl "#r13", "#r12"\n\t" \ + "xorl "#r23", "#r22"\n\t" \ + "roll $7, "#r12"\n\t" \ + "roll $7, "#r22"\n\t" \ + +#define CHACHA_CRYPT_X64() \ + "subq $40, %%rsp\n\t" \ + "movq 32(%[input]), %%rax\n\t" \ + "movq 40(%[input]), %%rdx\n\t" \ + "movq %%rax, 8(%%rsp)\n\t" \ + "movq %%rdx, 16(%%rsp)\n\t" \ + "movl 0(%[input]), %%eax\n\t" \ + "movl 4(%[input]), %%ebx\n\t" \ + "movl 8(%[input]), %%ecx\n\t" \ + "movl 12(%[input]), %%edx\n\t" \ + "movl 16(%[input]), %%r8d\n\t" \ + "movl 20(%[input]), %%r9d\n\t" \ + "movl 24(%[input]), %%r10d\n\t" \ + "movl 28(%[input]), %%r11d\n\t" \ + "movl 48(%[input]), %%r12d\n\t" \ + "movl 52(%[input]), %%r13d\n\t" \ + "movl 56(%[input]), %%r14d\n\t" \ + "movl 60(%[input]), %%r15d\n\t" \ + "movb $10, (%%rsp)\n\t" \ + "movq %%rsi, 32(%%rsp)\n\t" \ + "movq %%rdi, 24(%%rsp)\n\t" \ + "movl 8(%%rsp), %%esi\n\t" \ + "movl 12(%%rsp), %%edi\n\t" \ + "\n" \ + "1:\n\t" \ + QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \ + %%ebx, %%r9d, %%edi, %%r13d) \ + "movl %%esi, 8(%%rsp)\n\t" \ + "movl %%edi, 12(%%rsp)\n\t" \ + "movl 16(%%rsp), %%esi\n\t" \ + "movl 20(%%rsp), %%edi\n\t" \ + QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \ + %%edx, %%r11d, %%edi, %%r15d) \ + QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \ + %%ebx, %%r10d, %%edi, %%r12d) \ + "movl %%esi, 16(%%rsp)\n\t" \ + "movl %%edi, 20(%%rsp)\n\t" \ + "movl 8(%%rsp), %%esi\n\t" \ + "movl 12(%%rsp), %%edi\n\t" \ + QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \ + %%edx, %%r8d, %%edi, %%r14d) \ + "decb (%%rsp)\n\t" \ + "jnz 1b\n\t" \ + "movl %%esi, 8(%%rsp)\n\t" \ + "movl %%edi, 12(%%rsp)\n\t" \ + "movq 32(%%rsp), %%rsi\n\t" \ + "movq 24(%%rsp), %%rdi\n\t" \ + "addl 0(%[input]), %%eax\n\t" \ + "addl 4(%[input]), %%ebx\n\t" \ + "addl 8(%[input]), %%ecx\n\t" \ + "addl 12(%[input]), %%edx\n\t" \ + "addl 16(%[input]), %%r8d\n\t" \ + "addl 20(%[input]), %%r9d\n\t" \ + "addl 24(%[input]), %%r10d\n\t" \ + "addl 28(%[input]), %%r11d\n\t" \ + "addl 48(%[input]), %%r12d\n\t" \ + "addl 52(%[input]), %%r13d\n\t" \ + "addl 56(%[input]), %%r14d\n\t" \ + "addl 60(%[input]), %%r15d\n\t" \ + +#define CHACHA_PARTIAL_CHUNK_X64() \ + __asm__ __volatile__ ( \ + CHACHA_CRYPT_X64() \ + "movl %%eax , 0(%[c])\n\t" \ + "movl %%ebx , 4(%[c])\n\t" \ + "movl %%ecx , 8(%[c])\n\t" \ + "movl %%edx , 12(%[c])\n\t" \ + "movl %%r8d , 16(%[c])\n\t" \ + "movl %%r9d , 20(%[c])\n\t" \ + "movl %%r10d, 24(%[c])\n\t" \ + "movl %%r11d, 28(%[c])\n\t" \ + "movl %%r12d, 48(%[c])\n\t" \ + "movl %%r13d, 52(%[c])\n\t" \ + "movl %%r14d, 56(%[c])\n\t" \ + "movl %%r15d, 60(%[c])\n\t" \ + "movl 8(%%rsp), %%eax\n\t" \ + "movl 12(%%rsp), %%ebx\n\t" \ + "movl 16(%%rsp), %%ecx\n\t" \ + "movl 20(%%rsp), %%edx\n\t" \ + "addl 32(%[input]), %%eax\n\t" \ + "addl 36(%[input]), %%ebx\n\t" \ + "addl 40(%[input]), %%ecx\n\t" \ + "addl 44(%[input]), %%edx\n\t" \ + "movl %%eax , 32(%[c])\n\t" \ + "movl %%ebx , 36(%[c])\n\t" \ + "movl %%ecx , 40(%[c])\n\t" \ + "movl %%edx , 44(%[c])\n\t" \ + "addl $1, 48(%[input])\n\t" \ + "addq $40, %%rsp\n\t" \ + "movq %[output], %%rax\n\t" \ + "movq %[m], %%rbx\n\t" \ + "movl %[bytes], %%r8d\n\t" \ + "xorq %%rdx, %%rdx\n\t" \ + "movl %%r8d, %%r9d\n\t" \ + "andl $7, %%r9d\n\t" \ + "jz 4f\n\t" \ + "\n" \ + "2:\n\t" \ + "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ + "xorb (%%rbx,%%rdx,1), %%cl\n\t" \ + "movb %%cl, (%%rax,%%rdx,1)\n\t" \ + "incl %%edx\n\t" \ + "cmpl %%r9d, %%edx\n\t" \ + "jne 2b\n\t" \ + "je 3f\n\t" \ + "\n" \ + "4:\n\t" \ + "movq (%[c],%%rdx,1), %%rcx\n\t" \ + "xorq (%%rbx,%%rdx,1), %%rcx\n\t" \ + "movq %%rcx, (%%rax,%%rdx,1)\n\t" \ + "addl $8, %%edx\n\t" \ + "\n" \ + "3:\n\t" \ + "cmpl %%r8d, %%edx\n\t" \ + "jne 4b\n\t" \ + : \ + : [input] "r" (ctx->X), [c] "r" (x), \ + [output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \ + : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ + "r14", "r15", "memory" \ + ) + + +#define CHACHA_CHUNK_X64() \ + __asm__ __volatile__ ( \ + CHACHA_CRYPT_X64() \ + "movq %%rsi, 32(%%rsp)\n\t" \ + "addq $40, %%rsp\n\t" \ + "movq %[m], %%rsi\n\t" \ + "subq $40, %%rsp\n\t" \ + "xorl 0(%%rsi), %%eax\n\t" \ + "xorl 4(%%rsi), %%ebx\n\t" \ + "xorl 8(%%rsi), %%ecx\n\t" \ + "xorl 12(%%rsi), %%edx\n\t" \ + "xorl 16(%%rsi), %%r8d\n\t" \ + "xorl 20(%%rsi), %%r9d\n\t" \ + "xorl 24(%%rsi), %%r10d\n\t" \ + "xorl 28(%%rsi), %%r11d\n\t" \ + "xorl 48(%%rsi), %%r12d\n\t" \ + "xorl 52(%%rsi), %%r13d\n\t" \ + "xorl 56(%%rsi), %%r14d\n\t" \ + "xorl 60(%%rsi), %%r15d\n\t" \ + "movq 32(%%rsp), %%rsi\n\t" \ + "movl %%eax , 0(%[c])\n\t" \ + "movl %%ebx , 4(%[c])\n\t" \ + "movl %%ecx , 8(%[c])\n\t" \ + "movl %%edx , 12(%[c])\n\t" \ + "movl %%r8d , 16(%[c])\n\t" \ + "movl %%r9d , 20(%[c])\n\t" \ + "movl %%r10d, 24(%[c])\n\t" \ + "movl %%r11d, 28(%[c])\n\t" \ + "movl %%r12d, 48(%[c])\n\t" \ + "movl %%r13d, 52(%[c])\n\t" \ + "movl %%r14d, 56(%[c])\n\t" \ + "movl %%r15d, 60(%[c])\n\t" \ + "addq $40, %%rsp\n\t" \ + "movq %[m], %%r8\n\t" \ + "subq $40, %%rsp\n\t" \ + "movl 8(%%rsp), %%eax\n\t" \ + "movl 12(%%rsp), %%ebx\n\t" \ + "movl 16(%%rsp), %%ecx\n\t" \ + "movl 20(%%rsp), %%edx\n\t" \ + "addl 32(%[input]), %%eax\n\t" \ + "addl 36(%[input]), %%ebx\n\t" \ + "addl 40(%[input]), %%ecx\n\t" \ + "addl 44(%[input]), %%edx\n\t" \ + "xorl 32(%%r8), %%eax\n\t" \ + "xorl 36(%%r8), %%ebx\n\t" \ + "xorl 40(%%r8), %%ecx\n\t" \ + "xorl 44(%%r8), %%edx\n\t" \ + "movl %%eax , 32(%[c])\n\t" \ + "movl %%ebx , 36(%[c])\n\t" \ + "movl %%ecx , 40(%[c])\n\t" \ + "movl %%edx , 44(%[c])\n\t" \ + "addl $1, 48(%[input])\n\t" \ + "addq $40, %%rsp\n\t" \ + : \ + : [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \ + : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ + "r14", "r15", "memory" \ + ) + + +static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + word32 x[CHACHA_CHUNK_WORDS]; + + if (bytes == 0) + return; + + for (; bytes >= CHACHA_CHUNK_BYTES;) { + CHACHA_CHUNK_X64(); + bytes -= CHACHA_CHUNK_BYTES; + c += CHACHA_CHUNK_BYTES; + m += CHACHA_CHUNK_BYTES; + } + if (bytes > 0) { + CHACHA_PARTIAL_CHUNK_X64(); + } +} + +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) +static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; +static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; + +#define QUARTERROUND_2_AVX() \ + "paddd %%xmm1, %%xmm0\n\t" \ + "pxor %%xmm0, %%xmm3\n\t" \ + "pshufb %[rotl16], %%xmm3\n\t" \ + "paddd %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslld $12, %%xmm1\n\t" \ + "psrld $20, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "paddd %%xmm1, %%xmm0\n\t" \ + "pxor %%xmm0, %%xmm3\n\t" \ + "pshufb %[rotl8], %%xmm3\n\t" \ + "paddd %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslld $7, %%xmm1\n\t" \ + "psrld $25, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "# Swap words for next round\n\t" \ + "pshufd $0x39, %%xmm1, %%xmm1\n\t" \ + "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ + "pshufd $0x93, %%xmm3, %%xmm3\n\t" \ + "paddd %%xmm1, %%xmm0\n\t" \ + "pxor %%xmm0, %%xmm3\n\t" \ + "pshufb %[rotl16], %%xmm3\n\t" \ + "paddd %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslld $12, %%xmm1\n\t" \ + "psrld $20, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "paddd %%xmm1, %%xmm0\n\t" \ + "pxor %%xmm0, %%xmm3\n\t" \ + "pshufb %[rotl8], %%xmm3\n\t" \ + "paddd %%xmm3, %%xmm2\n\t" \ + "pxor %%xmm2, %%xmm1\n\t" \ + "movdqa %%xmm1, %%xmm4\n\t" \ + "pslld $7, %%xmm1\n\t" \ + "psrld $25, %%xmm4\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" \ + "# Swap words back\n\t" \ + "pshufd $0x93, %%xmm1, %%xmm1\n\t" \ + "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ + "pshufd $0x39, %%xmm3, %%xmm3\n\t" \ + +#define CHACHA_CRYPT_AVX() \ + "movdqu 0(%[input]), %%xmm0\n\t" \ + "movdqu 16(%[input]), %%xmm1\n\t" \ + "movdqu 32(%[input]), %%xmm2\n\t" \ + "movdqu 48(%[input]), %%xmm3\n\t" \ + "movb $10, %%al\n\t" \ + "\n" \ + "1:\n\t" \ + QUARTERROUND_2_AVX() \ + "decb %%al\n\t" \ + "jnz 1b\n\t" \ + "movdqu 0(%[input]), %%xmm4\n\t" \ + "movdqu 16(%[input]), %%xmm5\n\t" \ + "movdqu 32(%[input]), %%xmm6\n\t" \ + "movdqu 48(%[input]), %%xmm7\n\t" \ + "paddd %%xmm4, %%xmm0\n\t" \ + "paddd %%xmm5, %%xmm1\n\t" \ + "paddd %%xmm6, %%xmm2\n\t" \ + "paddd %%xmm7, %%xmm3\n\t" \ + +#define CHACHA_PARTIAL_CHUNK_AVX() \ + __asm__ __volatile__ ( \ + CHACHA_CRYPT_AVX() \ + "movdqu %%xmm0, 0(%[c])\n\t" \ + "movdqu %%xmm1, 16(%[c])\n\t" \ + "movdqu %%xmm2, 32(%[c])\n\t" \ + "movdqu %%xmm3, 48(%[c])\n\t" \ + "addl $1, 48(%[input])\n\t" \ + "movl %[bytes], %%r8d\n\t" \ + "xorq %%rdx, %%rdx\n\t" \ + "movl %%r8d, %%r9d\n\t" \ + "andl $7, %%r9d\n\t" \ + "jz 4f\n\t" \ + "\n" \ + "2:\n\t" \ + "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ + "xorb (%[m],%%rdx,1), %%cl\n\t" \ + "movb %%cl, (%[output],%%rdx,1)\n\t" \ + "incl %%edx\n\t" \ + "cmpl %%r9d, %%edx\n\t" \ + "jne 2b\n\t" \ + "je 3f\n\t" \ + "\n" \ + "4:\n\t" \ + "movq (%[c],%%rdx,1), %%rcx\n\t" \ + "xorq (%[m],%%rdx,1), %%rcx\n\t" \ + "movq %%rcx, (%[output],%%rdx,1)\n\t" \ + "addl $8, %%edx\n\t" \ + "\n" \ + "3:\n\t" \ + "cmpl %%r8d, %%edx\n\t" \ + "jne 4b\n\t" \ + : \ + : [input] "r" (ctx->X), [c] "r" (x), \ + [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \ + [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ + : "eax", "ecx", "edx", "r8", "r9", "memory", \ + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ + ) + + +#define CHACHA_CHUNK_AVX() \ + __asm__ __volatile__ ( \ + CHACHA_CRYPT_AVX() \ + "movdqu 0(%[m]), %%xmm4\n\t" \ + "movdqu 16(%[m]), %%xmm5\n\t" \ + "movdqu 32(%[m]), %%xmm6\n\t" \ + "movdqu 48(%[m]), %%xmm7\n\t" \ + "pxor %%xmm4, %%xmm0\n\t" \ + "pxor %%xmm5, %%xmm1\n\t" \ + "pxor %%xmm6, %%xmm2\n\t" \ + "pxor %%xmm7, %%xmm3\n\t" \ + "movdqu %%xmm0, 0(%[c])\n\t" \ + "movdqu %%xmm1, 16(%[c])\n\t" \ + "movdqu %%xmm2, 32(%[c])\n\t" \ + "movdqu %%xmm3, 48(%[c])\n\t" \ + "addl $1, 48(%[input])\n\t" \ + : \ + : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \ + [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ + : "rax", "memory", \ + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ + ) + +#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */ + #ifdef HAVE_INTEL_AVX1 static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, word32 bytes) { ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ - byte* output; - word32 i; word32 cnt = 0; static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL }; static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL }; - static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; - static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; if (bytes == 0) return; @@ -433,7 +796,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, __asm__ __volatile__ ( "movl %[bytes], %[cnt]\n\t" "shrl $8, %[cnt]\n\t" - "jz L_end128\n\t" + "jz L_end128\n\t" "vpshufd $0, (%[key]), %%xmm0\n\t" "vpshufd $0, 4(%[key]), %%xmm1\n\t" @@ -646,7 +1009,7 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, "add 48(%[key]), %[cnt]\n\t" "movl %[cnt], 48(%[key])\n\t" "\n" - "L_end128:" + "L_end128:\n\t" : [bytes] "+r" (bytes), [cnt] "+r" (cnt), [in] "+r" (m), [out] "+r" (c) : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), @@ -658,23 +1021,15 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); - output = (byte*)x; - for (; bytes > 0;) { - wc_Chacha_wordtobyte(x, ctx->X); - ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); - if (bytes <= CHACHA_CHUNK_BYTES) { - for (i = 0; i < bytes; ++i) { - c[i] = m[i] ^ output[i]; - } - return; - } - for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { - c[i] = m[i] ^ output[i]; - } + for (; bytes >= CHACHA_CHUNK_BYTES;) { + CHACHA_CHUNK_AVX(); bytes -= CHACHA_CHUNK_BYTES; c += CHACHA_CHUNK_BYTES; m += CHACHA_CHUNK_BYTES; } + if (bytes > 0) { + CHACHA_PARTIAL_CHUNK_AVX(); + } } #endif /* HAVE_INTEL_AVX1 */ @@ -684,16 +1039,16 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, { ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ - byte* output; - word32 i; word32 cnt = 0; static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL, 0x0000000500000004UL,0x0000000700000006UL }; static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL, 0x0000000800000008UL,0x0000000800000008UL }; - static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, + static const __m256i rotl8_256 = + { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; - static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, + static const __m256i rotl16_256 = + { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; if (bytes == 0) @@ -702,7 +1057,7 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, __asm__ __volatile__ ( "movl %[bytes], %[cnt]\n\t" "shrl $9, %[cnt]\n\t" - "jz L_end256\n\t" + "jz L_end256\n\t" "vpbroadcastd (%[key]), %%ymm0\n\t" "vpbroadcastd 4(%[key]), %%ymm1\n\t" @@ -931,35 +1286,27 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, "add 48(%[key]), %[cnt]\n\t" "movl %[cnt], 48(%[key])\n\t" "\n" - "L_end256:" + "L_end256:\n\t" : [bytes] "+r" (bytes), [cnt] "+r" (cnt), [in] "+r" (m), [out] "+r" (c) : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), [add] "m" (add), [eight] "m" (eight), - [rotl8] "m" (rotl8), [rotl16] "m" (rotl16) + [rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256) : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ); - output = (byte*)x; - for (; bytes > 0;) { - wc_Chacha_wordtobyte(x, ctx->X); - ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); - if (bytes <= CHACHA_CHUNK_BYTES) { - for (i = 0; i < bytes; ++i) { - c[i] = m[i] ^ output[i]; - } - return; - } - for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { - c[i] = m[i] ^ output[i]; - } + for (; bytes >= CHACHA_CHUNK_BYTES;) { + CHACHA_CHUNK_AVX(); bytes -= CHACHA_CHUNK_BYTES; c += CHACHA_CHUNK_BYTES; m += CHACHA_CHUNK_BYTES; } + if (bytes > 0) { + CHACHA_PARTIAL_CHUNK_AVX(); + } } #endif /* HAVE_INTEL_AVX2 */ #endif /* USE_INTEL_CHACHA_SPEEDUP */ @@ -1004,16 +1351,25 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, return BAD_FUNC_ARG; #ifdef USE_INTEL_CHACHA_SPEEDUP + if (!cpuidFlagsSet) { + cpuidFlags = cpuid_get_flags(); + cpuidFlagsSet = 1; + } + #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_AVX2(cpuid_get_flags())) { + if (IS_INTEL_AVX2(cpuidFlags)) { chacha_encrypt_avx2(ctx, input, output, msglen); return 0; } #endif - if (IS_INTEL_AVX1(cpuid_get_flags())) { + if (IS_INTEL_AVX1(cpuidFlags)) { chacha_encrypt_avx(ctx, input, output, msglen); return 0; } + else { + chacha_encrypt_x64(ctx, input, output, msglen); + return 0; + } #endif wc_Chacha_encrypt_bytes(ctx, input, output, msglen); diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index e02289e3c..488dda72f 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -144,10 +144,10 @@ static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m) "addq %%rax, %%r12\n\t" "movq %%r15, %%rax\n\t" "adcq %%rdx, %%r13\n\t" - "# r[0] * h[0] => rdx, rax +=> t1, t0\n\t" + "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t" "mulq %%r8\n\t" - "movq %%rdx, %%r8\n\t" "movq %%rax, %%r11\n\t" + "movq %%rdx, %%r8\n\t" "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t" "movq 8(%[ctx]), %%rax\n\t" "mulq %%r9\n\t" @@ -211,10 +211,10 @@ POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx, "addq %%rax, %%r12\n\t" "movq %%r15, %%rax\n\t" "adcq %%rdx, %%r13\n\t" - "# r[0] * h[0] => rdx, rax +=> t1, t0\n\t" + "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t" "mulq %%r8\n\t" - "movq %%rdx, %%r8\n\t" "movq %%rax, %%r11\n\t" + "movq %%rdx, %%r8\n\t" "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t" "movq 8(%[ctx]), %%rax\n\t" "mulq %%r9\n\t"