mirror of https://github.com/wolfSSL/wolfssl.git

commit 41912b92c6 (parent a0b02236b8): initial ARMv8 instructions

configure.ac | 19 +++++++++++++++++++
@@ -542,6 +542,24 @@ fi

AM_CONDITIONAL([BUILD_AESCCM], [test "x$ENABLED_AESCCM" = "xyes"])

# AES-ARM
AC_ARG_ENABLE([armasm],
    [AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARM ASM support (default: disabled)])],
    [ ENABLED_ARMASM=$enableval ],
    [ ENABLED_ARMASM=no ]
    )
if test "$ENABLED_ARMASM" = "yes"
then
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM"
    if test "$GCC" = "yes"
    then
        # GCC needs this flag
        AM_CFLAGS="$AM_CFLAGS -mcpu=generic+crypto"
    fi
fi

AM_CONDITIONAL([BUILD_ARMASM], [test "x$ENABLED_ARMASM" = "xyes"])

# AES-NI
AC_ARG_ENABLE([aesni],
    [AS_HELP_STRING([--enable-aesni],[Enable wolfSSL AES-NI support (default: disabled)])],
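With this hunk in place, the accelerated code paths in the rest of this commit are switched on at configure time via ./configure --enable-armasm, which defines WOLFSSL_ARMASM and, under GCC, adds -mcpu=generic+crypto so the assembler accepts the AESE/AESD, PMULL, and SHA256H instructions used below.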
@@ -3213,6 +3231,7 @@ echo "   * User Crypto:    $ENABLED_USER_CRYPTO"
echo "   * Fast RSA:       $ENABLED_FAST_RSA"
echo "   * Async Crypto:   $ENABLED_ASYNCCRYPT"
echo "   * Cavium:         $ENABLED_CAVIUM"
echo "   * ARM ASM:        $ENABLED_ARMASM"
echo ""
echo "---"
src/include.am

@@ -62,9 +62,14 @@ endif
src_libwolfssl_la_SOURCES += \
    wolfcrypt/src/hmac.c \
    wolfcrypt/src/random.c \
    wolfcrypt/src/hash.c

if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
endif

if BUILD_WOLFEVENT
src_libwolfssl_la_SOURCES += wolfcrypt/src/wolfevent.c
endif
wolfcrypt/benchmark/benchmark.c

@@ -110,11 +110,39 @@
    #define HAVE_GET_CYCLES
    static INLINE word64 get_intel_cycles(void);
    static word64 total_cycles;
    #define INIT_CYCLE_COUNTER
    #define BEGIN_INTEL_CYCLES total_cycles = get_intel_cycles();
    #define END_INTEL_CYCLES   total_cycles = get_intel_cycles() - total_cycles;
    #define SHOW_INTEL_CYCLES  printf(" Cycles per byte = %6.2f", \
                               (float)total_cycles / (numBlocks*sizeof(plain)));
#elif defined(LINUX_CYCLE_COUNT)
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define HAVE_GET_CYCLES
    static word64 begin_cycles;
    static word64 total_cycles;
    static int cycles = -1;
    static struct perf_event_attr atr;

    #define INIT_CYCLE_COUNTER do { \
        atr.type   = PERF_TYPE_HARDWARE; \
        atr.config = PERF_COUNT_HW_CPU_CYCLES; \
        cycles = syscall(__NR_perf_event_open, &atr, 0, -1, -1, 0); \
    } while (0);

    #define BEGIN_INTEL_CYCLES read(cycles, &begin_cycles, sizeof(begin_cycles));
    #define END_INTEL_CYCLES   do { \
        read(cycles, &total_cycles, sizeof(total_cycles)); \
        total_cycles = total_cycles - begin_cycles; \
    } while (0);

    #define SHOW_INTEL_CYCLES  printf(" Cycles per byte = %6.2f", \
                               (float)total_cycles / (numBlocks*sizeof(plain)));

#else
    #define INIT_CYCLE_COUNTER
    #define BEGIN_INTEL_CYCLES
    #define END_INTEL_CYCLES
    #define SHOW_INTEL_CYCLES

@@ -277,6 +305,7 @@ int benchmark_test(void *args)
#endif

    wolfCrypt_Init();
    INIT_CYCLE_COUNTER

#if defined(DEBUG_WOLFSSL) && !defined(HAVE_VALGRIND)
    wolfSSL_Debugging_ON();
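The LINUX_CYCLE_COUNT path above can also be tried standalone. A minimal sketch, assuming a Linux kernel with perf events available (the code under measurement is left as a placeholder):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct perf_event_attr atr;
    unsigned long long begin = 0, end = 0;
    int fd;

    memset(&atr, 0, sizeof(atr));
    atr.size   = sizeof(atr);
    atr.type   = PERF_TYPE_HARDWARE;
    atr.config = PERF_COUNT_HW_CPU_CYCLES;

    /* count cycles of this process, on any CPU */
    fd = syscall(__NR_perf_event_open, &atr, 0, -1, -1, 0);
    if (fd < 0) {
        perror("perf_event_open");
        return 1;
    }

    read(fd, &begin, sizeof(begin));
    /* ... code under measurement ... */
    read(fd, &end, sizeof(end));

    printf("cycles elapsed = %llu\n", end - begin);
    close(fd);
    return 0;
}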
wolfcrypt/src/aes.c

@@ -344,7 +344,160 @@ void wc_AesAsyncFree(Aes* aes)
    #ifdef HAVE_AES_DECRYPT
        #error nRF51 AES Hardware does not support decrypt
    #endif /* HAVE_AES_DECRYPT */

#elif defined(WOLFSSL_ARMASM)
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
    byte* keyPt   = (byte*)aes->key;
    word32 rounds = aes->rounds;
    byte out[AES_BLOCK_SIZE];
    byte* output  = out;
    byte* input   = (byte*)inBlock;

    /*
       AESE XORs the input with the round key, shifts the rows of the
       XORed result, then substitutes bytes of the shifted rows.
    */
    __asm__ __volatile__ (
        "LD1 {v0.16b}, [%[CtrIn]], #16 \n"
        "LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"

        "AESE v0.16b, v1.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v3.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v4.16b \n"
        "AESMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
        "AESE v0.16b, v1.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v3.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v4.16b \n"
        "AESMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESE v0.16b, v1.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b \n"

        "#subtract rounds done so far and see if should continue\n"
        "MOV w12, %w[R] \n"
        "SUB w12, w12, #10 \n"
        "CBZ w12, final \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v1.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b \n"

        "SUB w12, w12, #2 \n"
        "CBZ w12, final \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v1.16b \n"
        "AESMC v0.16b, v0.16b \n"
        "AESE v0.16b, v2.16b \n"

        "#Final AddRoundKey then store result \n"
        "final: \n"
        "LD1 {v1.16b}, [%[Key]], #16 \n"
        "EOR v0.16b, v0.16b, v1.16b \n"
        "ST1 {v0.16b}, [%[CtrOut]] \n"

        :[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
        :[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input), "0" (output)
        : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
    );

    XMEMCPY(outBlock, out, AES_BLOCK_SIZE);

    return 0;
}

#ifdef HAVE_AES_DECRYPT
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
    byte* keyPt   = (byte*)aes->key;
    word32 rounds = aes->rounds;
    byte out[AES_BLOCK_SIZE];
    byte* output  = out;
    byte* input   = (byte*)inBlock;

    /*
       AESD XORs the input with the round key, then applies the inverse
       ShiftRows and inverse SubBytes to the result.
    */
    __asm__ __volatile__ (
        "LD1 {v0.16b}, [%[CtrIn]], #16 \n"
        "LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"

        "AESD v0.16b, v1.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v3.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v4.16b \n"
        "AESIMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
        "AESD v0.16b, v1.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v3.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v4.16b \n"
        "AESIMC v0.16b, v0.16b \n"

        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESD v0.16b, v1.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b \n"

        "#subtract rounds done so far and see if should continue\n"
        "MOV w12, %w[R] \n"
        "SUB w12, w12, #10 \n"
        "CBZ w12, finalDec \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v1.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b \n"

        "SUB w12, w12, #2 \n"
        "CBZ w12, finalDec \n"
        "LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v1.16b \n"
        "AESIMC v0.16b, v0.16b \n"
        "AESD v0.16b, v2.16b \n"

        "#Final AddRoundKey then store result \n"
        "finalDec: \n"
        "LD1 {v1.16b}, [%[Key]], #16 \n"
        "EOR v0.16b, v0.16b, v1.16b \n"
        "ST1 {v0.4s}, [%[CtrOut]] \n"

        :[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
        :[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input), "0" (output)
        : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
    );

    XMEMCPY(outBlock, out, AES_BLOCK_SIZE);

    return 0;
}
#endif /* HAVE_AES_DECRYPT */
#else

/* using wolfCrypt software AES implementation */
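To exercise the hardware block-encrypt path end to end, a caller can go through the public direct API. A sketch assuming a build configured with --enable-armasm and WOLFSSL_AES_DIRECT defined (the zero key and zero block are demo values, not from this commit):

#include <wolfssl/wolfcrypt/aes.h>
#include <stdio.h>

int main(void)
{
    Aes aes;
    const byte key[16] = { 0 };            /* demo key: all zeros */
    const byte in[AES_BLOCK_SIZE] = { 0 }; /* one 16-byte block   */
    byte out[AES_BLOCK_SIZE];
    int i;

    if (wc_AesSetKeyDirect(&aes, key, sizeof(key), NULL, AES_ENCRYPTION) != 0)
        return 1;
    wc_AesEncryptDirect(&aes, out, in);    /* runs the AESE/AESMC path */

    for (i = 0; i < AES_BLOCK_SIZE; i++)
        printf("%02x", out[i]);
    printf("\n");
    return 0;
}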
@@ -1533,7 +1686,6 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
}
#endif /* HAVE_AES_DECRYPT */
#endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT */

#endif /* NEED_AES_TABLES */
@@ -1678,6 +1830,196 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
    {
        return wc_AesSetKey(aes, userKey, keylen, iv, dir);
    }

#elif defined(WOLFSSL_ARMASM)
static const byte rcon[] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
    /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};


/* Similar to the wolfSSL software implementation of expanding the AES key.
 * The table lookups were changed to use the hardware AESE instruction
 * instead. The decryption key schedule was also altered to match. */
int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
            const byte* iv, int dir)
{
    word32 temp, *rk = aes->key;
    unsigned int i = 0;

#if defined(AES_MAX_KEY_SIZE)
    const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
#endif

    if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
        return BAD_FUNC_ARG;

#if defined(AES_MAX_KEY_SIZE)
    /* Check key length */
    if (keylen > max_key_len) {
        return BAD_FUNC_ARG;
    }
#endif

#ifdef WOLFSSL_AES_COUNTER
    aes->left = 0;
#endif /* WOLFSSL_AES_COUNTER */

    aes->rounds = keylen/4 + 6;    /* 10, 12, or 14 rounds */
    XMEMCPY(rk, userKey, keylen);

    switch(keylen)
    {
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128
    case 16:
        while (1)
        {
            temp = rk[3];

            /* get table value from hardware */
            __asm__ volatile (
                "DUP v1.4s, %w[in] \n"
                "MOVI v0.16b, #0 \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            temp = rotrFixed(temp, 8);
            rk[4] = rk[0] ^ temp ^ rcon[i];
            rk[5] = rk[4] ^ rk[1];
            rk[6] = rk[5] ^ rk[2];
            rk[7] = rk[6] ^ rk[3];
            if (++i == 10)
                break;
            rk += 4;
        }
        break;
#endif /* 128 */

#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192
    case 24:
        /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
        while (1)
        {
            temp = rk[5];

            /* get table value from hardware */
            __asm__ volatile (
                "DUP v1.4s, %w[in] \n"
                "MOVI v0.16b, #0 \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            temp = rotrFixed(temp, 8);
            rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
            rk[ 7] = rk[ 1] ^ rk[ 6];
            rk[ 8] = rk[ 2] ^ rk[ 7];
            rk[ 9] = rk[ 3] ^ rk[ 8];
            if (++i == 8)
                break;
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
            rk += 6;
        }
        break;
#endif /* 192 */

#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256
    case 32:
        while (1)
        {
            temp = rk[7];

            /* get table value from hardware */
            __asm__ volatile (
                "DUP v1.4s, %w[in] \n"
                "MOVI v0.16b, #0 \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            temp = rotrFixed(temp, 8);
            rk[ 8] = rk[ 0] ^ temp ^ rcon[i];
            rk[ 9] = rk[ 1] ^ rk[ 8];
            rk[10] = rk[ 2] ^ rk[ 9];
            rk[11] = rk[ 3] ^ rk[10];
            if (++i == 7)
                break;
            temp = rk[11];

            /* get table value from hardware */
            __asm__ volatile (
                "DUP v1.4s, %w[in] \n"
                "MOVI v0.16b, #0 \n"
                "AESE v0.16b, v1.16b \n"
                "UMOV %w[out], v0.4s[0] \n"
                : [out] "=r"(temp)
                : [in] "r" (temp)
                : "cc", "memory", "v0", "v1"
            );
            rk[12] = rk[ 4] ^ temp;
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];

            rk += 8;
        }
        break;
#endif /* 256 */

    default:
        return BAD_FUNC_ARG;
    }

    if (dir == AES_DECRYPTION)
    {
#ifdef HAVE_AES_DECRYPT
        unsigned int j;
        rk = aes->key;

        /* invert the order of the round keys: */
        for (i = 0, j = 4 * aes->rounds; i < j; i += 4, j -= 4) {
            temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
            temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
            temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
            temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
        }
        /* apply the inverse MixColumn transform to all round keys but the
           first and the last: */
        for (i = 1; i < aes->rounds; i++) {
            rk += 4;
            __asm__ volatile (
                "LD1 {v0.16b}, [%[in]] \n"
                "AESIMC v0.16b, v0.16b \n"
                "ST1 {v0.16b}, [%[out]]\n"
                : [out] "=r" (rk)
                : [in] "0" (rk)
                : "cc", "memory", "v0"
            );
        }
#else
        WOLFSSL_MSG("AES Decryption not compiled in");
        return BAD_FUNC_ARG;
#endif /* HAVE_AES_DECRYPT */
    }

    return wc_AesSetIV(aes, iv);
}

#if defined(WOLFSSL_AES_DIRECT)
    int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
                        const byte* iv, int dir)
    {
        return wc_AesSetKey(aes, userKey, keylen, iv, dir);
    }
#endif

#else
static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen,
            const byte* iv, int dir)
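The DUP/MOVI/AESE/UMOV pattern repeated above is just a SubWord lookup done in hardware: with an all-zero round key, AESE reduces to ShiftRows plus SubBytes, and duplicating the input word into every lane makes ShiftRows harmless for lane 0. A possible rendering with ACLE intrinsics (a sketch, not code from this commit; assumes an AArch64 compiler with the crypto extension enabled):

#include <arm_neon.h>
#include <stdint.h>

/* S-box substitution of all four bytes of 'in' via AESE, as used by the
 * key schedule above. AESE(state, 0) = SubBytes(ShiftRows(state)); with
 * the word broadcast to all lanes, lane 0 still holds SubWord(in). */
static inline uint32_t SubWordHw(uint32_t in)
{
    uint8x16_t state = vreinterpretq_u8_u32(vdupq_n_u32(in));
    state = vaeseq_u8(state, vdupq_n_u8(0));
    return vgetq_lane_u32(vreinterpretq_u32_u8(state), 0);
}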
@@ -2859,7 +3201,7 @@ static INLINE void IncrementGcmCounter(byte* inOutCtr)
}

#if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(WOLFSSL_ARMASM)

static INLINE void FlattenSzInBits(byte* buf, word32 sz)
{
@@ -2943,6 +3285,20 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)

    if (ret == 0) {
        wc_AesEncrypt(aes, iv, aes->H);
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
        {
            /* reflect the bits of H once here so that GMULT only needs to
               reflect the data block on each call */
            word32* pt = (word32*)aes->H;
            __asm__ volatile (
                "LD1 {v0.16b}, [%[h]] \n"
                "RBIT v0.16b, v0.16b \n"
                "ST1 {v0.16b}, [%[out]] \n"
                : [out] "=r" (pt)
                : [h] "0" (pt)
                : "cc", "memory", "v0"
            );
            return ret; /* no need to generate GCM_TABLE */
        }
#endif
#ifdef GCM_TABLE
        GenerateM0(aes);
#endif /* GCM_TABLE */
@@ -3379,8 +3735,118 @@ static int AES_GCM_decrypt(const unsigned char *in,
#endif /* WOLFSSL_AESNI */


#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
/* PMULL and RBIT are available only on AArch64 */
/* Use ARM hardware for polynomial multiply */
static void GMULT(byte* X, byte* Y)
{
    word32* Xpt = (word32*)X;
    word32* Ypt = (word32*)Y;

    __asm__ volatile (
        "LD1 {v0.16b}, [%[inX]] \n"
        "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
        "RBIT v0.16b, v0.16b \n"

        /* Algorithm 1 from the Intel GCM white paper
           "Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
        */
        "PMULL  v3.1q, v0.1d, v1.1d \n"     /* a0 * b0 = C */
        "PMULL2 v4.1q, v0.2d, v1.2d \n"     /* a1 * b1 = D */
        "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
        "PMULL  v6.1q, v0.1d, v5.1d \n"     /* a0 * b1 = E */
        "PMULL2 v5.1q, v0.2d, v5.2d \n"     /* a1 * b0 = F */

        "#Set a register to all 0s using EOR \n"
        "EOR v7.16b, v7.16b, v7.16b \n"
        "EOR v5.16b, v5.16b, v6.16b \n"     /* F ^ E */
        "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
        "EOR v3.16b, v3.16b, v6.16b \n"     /* low 128 bits in v3 */
        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
        "EOR v4.16b, v4.16b, v6.16b \n"     /* high 128 bits in v4 */

        /* Reduction of the 256-bit value using Algorithm 5 from the white
           paper "Implementing GCM on ARMv8" by Conrado P. L. Gouvea and
           Julio Lopez */
        "MOVI v8.16b, #0x87 \n"
        "USHR v8.2d, v8.2d, #56 \n"
        /* v8 is now 0x00000000000000870000000000000087 reflected 0xe1.... */
        "PMULL2 v5.1q, v4.2d, v8.2d \n"
        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
        "EOR v4.16b, v4.16b, v6.16b \n"
        "EXT v6.16b, v7.16b, v5.16b, #8 \n"
        "EOR v3.16b, v3.16b, v6.16b \n"
        "PMULL v5.1q, v4.1d, v8.1d \n"
        "EOR v4.16b, v3.16b, v5.16b \n"

        "RBIT v4.16b, v4.16b \n"
        "STR q4, [%[out]] \n"
        : [out] "=r" (Xpt), "=r" (Ypt)
        : [inX] "0" (Xpt), [inY] "1" (Ypt)
        : "cc", "memory", "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v8"
    );
}


/* Currently a copy of the GCM_SMALL wolfSSL version, duplicated and kept
 * separate for future optimizations. */
static void GHASH(Aes* aes, const byte* a, word32 aSz,
                                const byte* c, word32 cSz, byte* s, word32 sSz)
{
    byte x[AES_BLOCK_SIZE];
    byte scratch[AES_BLOCK_SIZE];
    word32 blocks, partial;
    byte* h = aes->H;

    XMEMSET(x, 0, AES_BLOCK_SIZE);

    /* Hash in A, the Additional Authentication Data */
    if (aSz != 0 && a != NULL) {
        blocks = aSz / AES_BLOCK_SIZE;
        partial = aSz % AES_BLOCK_SIZE;
        while (blocks--) {
            xorbuf(x, a, AES_BLOCK_SIZE);
            GMULT(x, h);
            a += AES_BLOCK_SIZE;
        }
        if (partial != 0) {
            XMEMSET(scratch, 0, AES_BLOCK_SIZE);
            XMEMCPY(scratch, a, partial);
            xorbuf(x, scratch, AES_BLOCK_SIZE);
            GMULT(x, h);
        }
    }

    /* Hash in C, the Ciphertext */
    if (cSz != 0 && c != NULL) {
        blocks = cSz / AES_BLOCK_SIZE;
        partial = cSz % AES_BLOCK_SIZE;
        while (blocks--) {
            xorbuf(x, c, AES_BLOCK_SIZE);
            GMULT(x, h);
            c += AES_BLOCK_SIZE;
        }
        if (partial != 0) {
            XMEMSET(scratch, 0, AES_BLOCK_SIZE);
            XMEMCPY(scratch, c, partial);
            xorbuf(x, scratch, AES_BLOCK_SIZE);
            GMULT(x, h);
        }
    }

    /* Hash in the lengths of A and C in bits */
    FlattenSzInBits(&scratch[0], aSz);
    FlattenSzInBits(&scratch[8], cSz);
    xorbuf(x, scratch, AES_BLOCK_SIZE);
    GMULT(x, h);

    /* Copy the result into s. */
    XMEMCPY(s, x, sSz);
}

/* not using ARMASM for multiplication */
#elif defined(GCM_SMALL)
static void GMULT(byte* X, byte* Y)
{
    byte Z[AES_BLOCK_SIZE];
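Putting the pieces together: the reflected H from wc_AesGcmSetKey feeds the PMULL-based GMULT inside GHASH during authenticated encryption. A short caller sketch (the demo key/IV values and the name gcm_demo are illustrative, not part of this commit):

#include <wolfssl/wolfcrypt/aes.h>

/* One-shot AES-128-GCM encrypt; assumes a build with --enable-armasm. */
int gcm_demo(void)
{
    Aes aes;
    const byte key[16]   = { 0 };
    const byte iv[12]    = { 0 };
    const byte plain[16] = "hardware  GCM!!";
    byte cipher[16];
    byte tag[16];
    int ret;

    ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
    if (ret != 0) return ret;

    /* GMULT/GHASH above run under the hood to authenticate the data */
    return wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain),
                            iv, sizeof(iv), tag, sizeof(tag), NULL, 0);
}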
src/include.am

@@ -44,7 +44,8 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
              wolfcrypt/src/port/ti/ti-hash.c \
              wolfcrypt/src/port/ti/ti-ccm.c \
              wolfcrypt/src/port/pic32/pic32mz-hash.c \
              wolfcrypt/src/port/nrf51.c \
              wolfcrypt/src/port/arm/armv8-sha256.c

if BUILD_CAVIUM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/cavium/cavium_nitrox.c
wolfcrypt/src/port/arm/armv8-sha256.c (new file)

@@ -0,0 +1,608 @@
/* armv8-sha256.c
 *
 * Copyright (C) 2006-2016 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#if !defined(NO_SHA256) && defined(WOLFSSL_ARMASM)
#include <wolfssl/wolfcrypt/sha256.h>
#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#ifndef WOLFSSL_HAVE_MIN
#define WOLFSSL_HAVE_MIN

    static INLINE word32 min(word32 a, word32 b)
    {
        return a > b ? b : a;
    }

#endif /* WOLFSSL_HAVE_MIN */

#if !defined (ALIGN32)
    #if defined (__GNUC__)
        #define ALIGN32 __attribute__ ( (aligned (32)))
    #elif defined(_MSC_VER)
        /* disable align warning, we want alignment ! */
        #pragma warning(disable: 4324)
        #define ALIGN32 __declspec (align (32))
    #else
        #define ALIGN32
    #endif
#endif

static const ALIGN32 word32 K[64] = {
    0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
    0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
    0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
    0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
    0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
    0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
    0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
    0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
    0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
    0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
    0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
    0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
    0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};

int wc_InitSha256(Sha256* sha256)
{
    int ret = 0;

    sha256->digest[0] = 0x6A09E667L;
    sha256->digest[1] = 0xBB67AE85L;
    sha256->digest[2] = 0x3C6EF372L;
    sha256->digest[3] = 0xA54FF53AL;
    sha256->digest[4] = 0x510E527FL;
    sha256->digest[5] = 0x9B05688CL;
    sha256->digest[6] = 0x1F83D9ABL;
    sha256->digest[7] = 0x5BE0CD19L;

    sha256->buffLen = 0;
    sha256->loLen   = 0;
    sha256->hiLen   = 0;

    return ret;
}


static INLINE void AddLength(Sha256* sha256, word32 len)
{
    word32 tmp = sha256->loLen;
    if ( (sha256->loLen += len) < tmp)
        sha256->hiLen++;                       /* carry low to high */
}


/* ARMv8 hardware acceleration */
int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
{
    /* do block size increments */
    byte* local = (byte*)sha256->buffer;

    while (len) {
        word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
        XMEMCPY(&local[sha256->buffLen], data, add);

        sha256->buffLen += add;
        data            += add;
        len             -= add;

        if (sha256->buffLen == SHA256_BLOCK_SIZE) {
            word32* Kpt      = (word32*)K;
            word32* bufferPt = sha256->buffer;
            word32* digestPt = sha256->digest;

            __asm__ volatile (
            "#load in message and schedule updates \n"
            "LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
            "MOV v0.16b, v4.16b \n"
            "MOV v1.16b, v5.16b \n"
            "REV32 v0.16b, v0.16b \n"
            "MOV v2.16b, v6.16b \n"
            "REV32 v1.16b, v1.16b \n"
            "MOV v3.16b, v7.16b \n"
            "REV32 v2.16b, v2.16b \n"
            "REV32 v3.16b, v3.16b \n"

            "MOV v4.16b, v0.16b \n"
            "MOV v5.16b, v1.16b \n"
            "SHA256SU0 v4.4s, v1.4s \n"
            "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
            "MOV v6.16b, v2.16b \n"
            "SHA256SU0 v5.4s, v2.4s \n"
            "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
            "MOV v7.16b, v3.16b \n"
            "SHA256SU0 v6.4s, v3.4s \n"
            "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
            "MOV v8.16b, v4.16b \n"
            "SHA256SU0 v7.4s, v4.4s \n"
            "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
            "MOV v9.16b, v5.16b \n"
            "SHA256SU0 v8.4s, v5.4s \n"
            "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
            "MOV v10.16b, v6.16b \n"
            "SHA256SU0 v9.4s, v6.4s \n"
            "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
            "MOV v11.16b, v7.16b \n"
            "SHA256SU0 v10.4s, v7.4s \n"
            "SHA256SU1 v10.4s, v8.4s, v9.4s \n"
            "MOV v12.16b, v8.16b \n"
            "SHA256SU0 v11.4s, v8.4s \n"
            "SHA256SU1 v11.4s, v9.4s, v10.4s \n"
            "MOV v13.16b, v9.16b \n"
            "SHA256SU0 v12.4s, v9.4s \n"
            "SHA256SU1 v12.4s, v10.4s, v11.4s \n"
            "MOV v14.16b, v10.16b \n"
            "SHA256SU0 v13.4s, v10.4s \n"
            "SHA256SU1 v13.4s, v11.4s, v12.4s \n"
            "MOV v15.16b, v11.16b \n"
            "SHA256SU0 v14.4s, v11.4s \n"
            "SHA256SU1 v14.4s, v12.4s, v13.4s \n"
            "SHA256SU0 v15.4s, v12.4s \n"
            "SHA256SU1 v15.4s, v13.4s, v14.4s \n"

            "#Add K values to message \n"
            "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
            "ADD v0.4s, v0.4s, v16.4s \n"
            "ADD v1.4s, v1.4s, v17.4s \n"
            "ADD v2.4s, v2.4s, v18.4s \n"
            "ADD v3.4s, v3.4s, v19.4s \n"
            "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
            "ADD v4.4s, v4.4s, v16.4s \n"
            "ADD v5.4s, v5.4s, v17.4s \n"
            "ADD v6.4s, v6.4s, v18.4s \n"
            "ADD v7.4s, v7.4s, v19.4s \n"
            "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
            "ADD v8.4s, v8.4s, v16.4s \n"
            "ADD v9.4s, v9.4s, v17.4s \n"
            "ADD v10.4s, v10.4s, v18.4s \n"
            "ADD v11.4s, v11.4s, v19.4s \n"
            "LD1 {v16.16b-v19.16b}, [%[k]] \n"
            "ADD v12.4s, v12.4s, v16.4s \n"
            "ADD v13.4s, v13.4s, v17.4s \n"
            "LD1 {v20.4s-v21.4s}, [%[digest]] \n"
            "ADD v14.4s, v14.4s, v18.4s \n"
            "ADD v15.4s, v15.4s, v19.4s \n"

            "#SHA256 operation on updated message \n"
            "MOV v16.16b, v20.16b \n"
            "MOV v17.16b, v21.16b \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v0.4s \n"
            "SHA256H2 q17, q18, v0.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v1.4s \n"
            "SHA256H2 q17, q18, v1.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v2.4s \n"
            "SHA256H2 q17, q18, v2.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v3.4s \n"
            "SHA256H2 q17, q18, v3.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v4.4s \n"
            "SHA256H2 q17, q18, v4.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v5.4s \n"
            "SHA256H2 q17, q18, v5.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v6.4s \n"
            "SHA256H2 q17, q18, v6.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v7.4s \n"
            "SHA256H2 q17, q18, v7.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v8.4s \n"
            "SHA256H2 q17, q18, v8.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v9.4s \n"
            "SHA256H2 q17, q18, v9.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v10.4s \n"
            "SHA256H2 q17, q18, v10.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v11.4s \n"
            "SHA256H2 q17, q18, v11.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v12.4s \n"
            "SHA256H2 q17, q18, v12.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v13.4s \n"
            "SHA256H2 q17, q18, v13.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v14.4s \n"
            "SHA256H2 q17, q18, v14.4s \n"
            "MOV v18.16b, v16.16b \n"
            "SHA256H q16, q17, v15.4s \n"
            "SHA256H2 q17, q18, v15.4s \n"

            "#Add working vars back into digest state \n"
            "ADD v16.4s, v16.4s, v20.4s \n"
            "ADD v17.4s, v17.4s, v21.4s \n"
            "STP q16, q17, [%[out]] \n"
            : "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt)
            : [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt)
            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                              "v8", "v9", "v10", "v11", "v12", "v13", "v14",
                              "v15", "v16", "v17", "v18", "v19", "v20", "v21"
            );

            AddLength(sha256, SHA256_BLOCK_SIZE);
            sha256->buffLen = 0;
        }
    }

    return 0;
}


int wc_Sha256Final(Sha256* sha256, byte* hash)
{
    byte* local      = (byte*)sha256->buffer;
    word32* Kpt      = (word32*)K;
    word32* bufferPt = sha256->buffer;
    word32* digestPt = sha256->digest;
    word32* hashPt   = (word32*)hash;

    AddLength(sha256, sha256->buffLen);  /* before adding pads */

    local[sha256->buffLen++] = 0x80;     /* add 1 */

    /* pad with zeros */
    if (sha256->buffLen > SHA256_PAD_SIZE) {

        XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
        sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;

        bufferPt = sha256->buffer;
        digestPt = sha256->digest;
        Kpt = (word32*)K;
        __asm__ volatile (
        "#load in message and schedule updates \n"
        "LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
        "MOV v0.16b, v4.16b \n"
        "MOV v1.16b, v5.16b \n"
        "REV32 v0.16b, v0.16b \n"
        "MOV v2.16b, v6.16b \n"
        "REV32 v1.16b, v1.16b \n"
        "MOV v3.16b, v7.16b \n"
        "REV32 v2.16b, v2.16b \n"
        "REV32 v3.16b, v3.16b \n"
        "MOV v4.16b, v0.16b \n"
        "MOV v5.16b, v1.16b \n"
        "SHA256SU0 v4.4s, v1.4s \n"
        "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
        "MOV v6.16b, v2.16b \n"
        "SHA256SU0 v5.4s, v2.4s \n"
        "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
        "MOV v7.16b, v3.16b \n"
        "SHA256SU0 v6.4s, v3.4s \n"
        "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
        "MOV v8.16b, v4.16b \n"
        "SHA256SU0 v7.4s, v4.4s \n"
        "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
        "MOV v9.16b, v5.16b \n"
        "SHA256SU0 v8.4s, v5.4s \n"
        "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
        "MOV v10.16b, v6.16b \n"
        "SHA256SU0 v9.4s, v6.4s \n"
        "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
        "MOV v11.16b, v7.16b \n"
        "SHA256SU0 v10.4s, v7.4s \n"
        "SHA256SU1 v10.4s, v8.4s, v9.4s \n"
        "MOV v12.16b, v8.16b \n"
        "SHA256SU0 v11.4s, v8.4s \n"
        "SHA256SU1 v11.4s, v9.4s, v10.4s \n"
        "MOV v13.16b, v9.16b \n"
        "SHA256SU0 v12.4s, v9.4s \n"
        "SHA256SU1 v12.4s, v10.4s, v11.4s \n"
        "MOV v14.16b, v10.16b \n"
        "SHA256SU0 v13.4s, v10.4s \n"
        "SHA256SU1 v13.4s, v11.4s, v12.4s \n"
        "MOV v15.16b, v11.16b \n"
        "SHA256SU0 v14.4s, v11.4s \n"
        "SHA256SU1 v14.4s, v12.4s, v13.4s \n"
        "SHA256SU0 v15.4s, v12.4s \n"
        "SHA256SU1 v15.4s, v13.4s, v14.4s \n"

        "#Add K values to message \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v0.4s, v0.4s, v16.4s \n"
        "ADD v1.4s, v1.4s, v17.4s \n"
        "ADD v2.4s, v2.4s, v18.4s \n"
        "ADD v3.4s, v3.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v4.4s, v4.4s, v16.4s \n"
        "ADD v5.4s, v5.4s, v17.4s \n"
        "ADD v6.4s, v6.4s, v18.4s \n"
        "ADD v7.4s, v7.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v8.4s, v8.4s, v16.4s \n"
        "ADD v9.4s, v9.4s, v17.4s \n"
        "ADD v10.4s, v10.4s, v18.4s \n"
        "ADD v11.4s, v11.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]] \n"
        "ADD v12.4s, v12.4s, v16.4s \n"
        "ADD v13.4s, v13.4s, v17.4s \n"
        "LD1 {v20.4s-v21.4s}, [%[digest]] \n"
        "ADD v14.4s, v14.4s, v18.4s \n"
        "ADD v15.4s, v15.4s, v19.4s \n"

        "#SHA256 operation on updated message \n"
        "MOV v16.16b, v20.16b \n"
        "MOV v17.16b, v21.16b \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v0.4s \n"
        "SHA256H2 q17, q18, v0.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v1.4s \n"
        "SHA256H2 q17, q18, v1.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v2.4s \n"
        "SHA256H2 q17, q18, v2.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v3.4s \n"
        "SHA256H2 q17, q18, v3.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v4.4s \n"
        "SHA256H2 q17, q18, v4.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v5.4s \n"
        "SHA256H2 q17, q18, v5.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v6.4s \n"
        "SHA256H2 q17, q18, v6.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v7.4s \n"
        "SHA256H2 q17, q18, v7.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v8.4s \n"
        "SHA256H2 q17, q18, v8.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v9.4s \n"
        "SHA256H2 q17, q18, v9.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v10.4s \n"
        "SHA256H2 q17, q18, v10.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v11.4s \n"
        "SHA256H2 q17, q18, v11.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v12.4s \n"
        "SHA256H2 q17, q18, v12.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v13.4s \n"
        "SHA256H2 q17, q18, v13.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v14.4s \n"
        "SHA256H2 q17, q18, v14.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v15.4s \n"
        "SHA256H2 q17, q18, v15.4s \n"

        "#Add working vars back into digest state \n"
        "ADD v16.4s, v16.4s, v20.4s \n"
        "ADD v17.4s, v17.4s, v21.4s \n"
        "STP q16, q17, [%[out]] \n"
        : "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt)
        : [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt)
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                          "v8", "v9", "v10", "v11", "v12", "v13", "v14",
                          "v15", "v16", "v17", "v18", "v19", "v20", "v21"
        );

        sha256->buffLen = 0;
    }
    XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);

    /* put lengths in bits */
    sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
                    (sha256->hiLen << 3);
    sha256->loLen = sha256->loLen << 3;

    /* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
    bufferPt = sha256->buffer;
    __asm__ volatile (
        "LD1 {v0.16b}, [%[in]] \n"
        "REV32 v0.16b, v0.16b \n"
        "ST1 {v0.16b}, [%[out]], #16 \n"
        "LD1 {v0.16b}, [%[in]] \n"
        "REV32 v0.16b, v0.16b \n"
        "ST1 {v0.16b}, [%[out]], #16 \n"
        "LD1 {v0.16b}, [%[in]] \n"
        "REV32 v0.16b, v0.16b \n"
        "ST1 {v0.16b}, [%[out]], #16 \n"
        "LD1 {v0.16b}, [%[in]] \n"
        "REV32 v0.16b, v0.16b \n"
        "ST1 {v0.16b}, [%[out]] \n"
        : [out] "=r" (bufferPt)
        : [in] "0" (bufferPt)
        : "cc", "memory", "v0"
    );
#endif
    /* ! length ordering dependent on digest endian type ! */
    XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
    XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
            sizeof(word32));

    bufferPt = sha256->buffer;
    digestPt = sha256->digest;
    Kpt = (word32*)K;
    __asm__ volatile (
        "#load in message and schedule updates \n"
        "LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
        "MOV v0.16b, v4.16b \n"
        "MOV v1.16b, v5.16b \n"
        "MOV v2.16b, v6.16b \n"
        "MOV v3.16b, v7.16b \n"
        "SHA256SU0 v4.4s, v1.4s \n"
        "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
        "MOV v6.16b, v2.16b \n"
        "SHA256SU0 v5.4s, v2.4s \n"
        "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
        "MOV v7.16b, v3.16b \n"
        "SHA256SU0 v6.4s, v3.4s \n"
        "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
        "MOV v8.16b, v4.16b \n"
        "SHA256SU0 v7.4s, v4.4s \n"
        "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
        "MOV v9.16b, v5.16b \n"
        "SHA256SU0 v8.4s, v5.4s \n"
        "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
        "MOV v10.16b, v6.16b \n"
        "SHA256SU0 v9.4s, v6.4s \n"
        "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
        "MOV v11.16b, v7.16b \n"
        "SHA256SU0 v10.4s, v7.4s \n"
        "SHA256SU1 v10.4s, v8.4s, v9.4s \n"
        "MOV v12.16b, v8.16b \n"
        "SHA256SU0 v11.4s, v8.4s \n"
        "SHA256SU1 v11.4s, v9.4s, v10.4s \n"
        "MOV v13.16b, v9.16b \n"
        "SHA256SU0 v12.4s, v9.4s \n"
        "SHA256SU1 v12.4s, v10.4s, v11.4s \n"
        "MOV v14.16b, v10.16b \n"
        "SHA256SU0 v13.4s, v10.4s \n"
        "SHA256SU1 v13.4s, v11.4s, v12.4s \n"
        "MOV v15.16b, v11.16b \n"
        "SHA256SU0 v14.4s, v11.4s \n"
        "SHA256SU1 v14.4s, v12.4s, v13.4s \n"
        "SHA256SU0 v15.4s, v12.4s \n"
        "SHA256SU1 v15.4s, v13.4s, v14.4s \n"

        "#Add K values to message \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v0.4s, v0.4s, v16.4s \n"
        "ADD v1.4s, v1.4s, v17.4s \n"
        "ADD v2.4s, v2.4s, v18.4s \n"
        "ADD v3.4s, v3.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v4.4s, v4.4s, v16.4s \n"
        "ADD v5.4s, v5.4s, v17.4s \n"
        "ADD v6.4s, v6.4s, v18.4s \n"
        "ADD v7.4s, v7.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
        "ADD v8.4s, v8.4s, v16.4s \n"
        "ADD v9.4s, v9.4s, v17.4s \n"
        "ADD v10.4s, v10.4s, v18.4s \n"
        "ADD v11.4s, v11.4s, v19.4s \n"
        "LD1 {v16.16b-v19.16b}, [%[k]] \n"
        "ADD v12.4s, v12.4s, v16.4s \n"
        "ADD v13.4s, v13.4s, v17.4s \n"
        "LD1 {v20.4s-v21.4s}, [%[digest]] \n"
        "ADD v14.4s, v14.4s, v18.4s \n"
        "ADD v15.4s, v15.4s, v19.4s \n"

        "#SHA256 operation on updated message \n"
        "MOV v16.16b, v20.16b \n"
        "MOV v17.16b, v21.16b \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v0.4s \n"
        "SHA256H2 q17, q18, v0.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v1.4s \n"
        "SHA256H2 q17, q18, v1.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v2.4s \n"
        "SHA256H2 q17, q18, v2.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v3.4s \n"
        "SHA256H2 q17, q18, v3.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v4.4s \n"
        "SHA256H2 q17, q18, v4.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v5.4s \n"
        "SHA256H2 q17, q18, v5.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v6.4s \n"
        "SHA256H2 q17, q18, v6.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v7.4s \n"
        "SHA256H2 q17, q18, v7.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v8.4s \n"
        "SHA256H2 q17, q18, v8.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v9.4s \n"
        "SHA256H2 q17, q18, v9.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v10.4s \n"
        "SHA256H2 q17, q18, v10.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v11.4s \n"
        "SHA256H2 q17, q18, v11.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v12.4s \n"
        "SHA256H2 q17, q18, v12.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v13.4s \n"
        "SHA256H2 q17, q18, v13.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v14.4s \n"
        "SHA256H2 q17, q18, v14.4s \n"
        "MOV v18.16b, v16.16b \n"
        "SHA256H q16, q17, v15.4s \n"
        "SHA256H2 q17, q18, v15.4s \n"

        "#Add working vars back into digest state \n"
        "ADD v16.4s, v16.4s, v20.4s \n"
        "ADD v17.4s, v17.4s, v21.4s \n"
        "STP q16, q17, [%[out]] \n"

        "#Store value as hash output \n"
#if defined(LITTLE_ENDIAN_ORDER)
        "REV32 v16.16b, v16.16b \n"
#endif
        "ST1 {v16.16b}, [%[hashOut]], #16 \n"
#if defined(LITTLE_ENDIAN_ORDER)
        "REV32 v17.16b, v17.16b \n"
#endif
        "ST1 {v17.16b}, [%[hashOut]] \n"
        : "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt),
          [hashOut] "=r" (hashPt)
        : [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt),
          "3" (hashPt)
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                          "v8", "v9", "v10", "v11", "v12", "v13", "v14",
                          "v15", "v16", "v17", "v18", "v19", "v20", "v21"
    );

    return wc_InitSha256(sha256);  /* reset state */
}

#endif /* !NO_SHA256 && WOLFSSL_ARMASM */
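The public API is unchanged, so the accelerated implementation drops in behind the usual calls. A sketch of hashing a short message (assumes a build configured with --enable-armasm; the name sha_demo is illustrative):

#include <wolfssl/wolfcrypt/sha256.h>

int sha_demo(byte* digest /* SHA256_DIGEST_SIZE bytes */)
{
    Sha256 sha;
    const byte msg[] = "abc";
    int ret;

    ret = wc_InitSha256(&sha);
    if (ret != 0) return ret;
    ret = wc_Sha256Update(&sha, msg, 3); /* SHA256H rounds per 64-byte block */
    if (ret != 0) return ret;
    return wc_Sha256Final(&sha, digest); /* pads, finalizes, resets state */
}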
wolfssl/wolfcrypt/sha256.h

@@ -54,13 +54,29 @@ enum {

#ifndef WOLFSSL_TI_HASH

#ifdef WOLFSSL_ARMASM /* slight performance increase with aligned memory */
    #if !defined (ALIGN16)
        #if defined (__GNUC__)
            #define ALIGN16 __attribute__ ( (aligned (16)))
        #elif defined(_MSC_VER)
            /* disable align warning, we want alignment ! */
            #pragma warning(disable: 4324)
            #define ALIGN16 __declspec (align (16))
        #else
            #define ALIGN16
        #endif
    #endif
#else /* not using ARM ASM */
    #define ALIGN16
#endif /* WOLFSSL_ARMASM */

/* Sha256 digest */
typedef struct Sha256 {
    word32  buffLen;   /* in bytes        */
    word32  loLen;     /* length in bytes */
    word32  hiLen;     /* length in bytes */
    ALIGN16 word32  digest[SHA256_DIGEST_SIZE / sizeof(word32)];
    ALIGN16 word32  buffer[SHA256_BLOCK_SIZE  / sizeof(word32)];
#ifdef WOLFSSL_PIC32MZ_HASH
    pic32mz_desc desc; /* Crypt Engine descriptor */
#endif
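Worth noting: the NEON LD1/ST1 forms used throughout this commit tolerate unaligned addresses, so ALIGN16 here is a performance hint rather than a correctness requirement, which is why the non-ARMASM fallback can safely define ALIGN16 as empty.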