initial ARMv8 instructions

pull/538/head
Jacob Barthelmeh 2016-07-22 15:49:15 +00:00
parent a0b02236b8
commit 41912b92c6
7 changed files with 1151 additions and 7 deletions

View File

@@ -542,6 +542,24 @@ fi
AM_CONDITIONAL([BUILD_AESCCM], [test "x$ENABLED_AESCCM" = "xyes"])
# AES-ARM
AC_ARG_ENABLE([armasm],
[AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARM ASM support (default: disabled)])],
[ ENABLED_ARMASM=$enableval ],
[ ENABLED_ARMASM=no ]
)
if test "$ENABLED_ARMASM" = "yes"
then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM"
if test "$GCC" = "yes"
then
# GCC needs this flag
AM_CFLAGS="$AM_CFLAGS -mcpu=generic+crypto"
fi
fi
AM_CONDITIONAL([BUILD_ARMASM], [test "x$ENABLED_ARMASM" = "xyes"])
# AES-NI
AC_ARG_ENABLE([aesni],
[AS_HELP_STRING([--enable-aesni],[Enable wolfSSL AES-NI support (default: disabled)])],
@@ -3213,6 +3231,7 @@ echo " * User Crypto: $ENABLED_USER_CRYPTO"
echo " * Fast RSA: $ENABLED_FAST_RSA"
echo " * Async Crypto: $ENABLED_ASYNCCRYPT"
echo " * Cavium: $ENABLED_CAVIUM"
echo " * ARM ASM: $ENABLED_ARMASM"
echo ""
echo "---"

View File

@@ -62,9 +62,14 @@ endif
src_libwolfssl_la_SOURCES += \
wolfcrypt/src/hmac.c \
wolfcrypt/src/random.c \
wolfcrypt/src/hash.c
if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
endif
if BUILD_WOLFEVENT
src_libwolfssl_la_SOURCES += wolfcrypt/src/wolfevent.c
endif

View File

@@ -110,11 +110,39 @@
#define HAVE_GET_CYCLES
static INLINE word64 get_intel_cycles(void);
static word64 total_cycles;
#define INIT_CYCLE_COUNTER
#define BEGIN_INTEL_CYCLES total_cycles = get_intel_cycles();
#define END_INTEL_CYCLES total_cycles = get_intel_cycles() - total_cycles;
#define SHOW_INTEL_CYCLES printf(" Cycles per byte = %6.2f", \
(float)total_cycles / (numBlocks*sizeof(plain)));
#elif defined(LINUX_CYCLE_COUNT)
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#define HAVE_GET_CYCLES
static word64 begin_cycles;
static word64 total_cycles;
static int cycles = -1;
static struct perf_event_attr atr;
#define INIT_CYCLE_COUNTER do { \
atr.type = PERF_TYPE_HARDWARE; \
atr.config = PERF_COUNT_HW_CPU_CYCLES; \
cycles = syscall(__NR_perf_event_open, &atr, 0, -1, -1, 0); \
} while (0);
#define BEGIN_INTEL_CYCLES read(cycles, &begin_cycles, sizeof(begin_cycles));
#define END_INTEL_CYCLES do { \
read(cycles, &total_cycles, sizeof(total_cycles)); \
total_cycles = total_cycles - begin_cycles; \
} while (0);
#define SHOW_INTEL_CYCLES printf(" Cycles per byte = %6.2f", \
(float)total_cycles / (numBlocks*sizeof(plain)));
#else
#define INIT_CYCLE_COUNTER
#define BEGIN_INTEL_CYCLES
#define END_INTEL_CYCLES
#define SHOW_INTEL_CYCLES
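For reference, the benchmark loops wrap each timed region with these macros roughly as follows (a sketch; hash, plain, i, and numBlocks are the benchmark's local state):

    BEGIN_INTEL_CYCLES
    for (i = 0; i < numBlocks; i++)
        wc_Sha256Update(&hash, plain, sizeof(plain));
    END_INTEL_CYCLES
    SHOW_INTEL_CYCLES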
@@ -277,6 +305,7 @@ int benchmark_test(void *args)
#endif
wolfCrypt_Init();
INIT_CYCLE_COUNTER
#if defined(DEBUG_WOLFSSL) && !defined(HAVE_VALGRIND)
wolfSSL_Debugging_ON();

View File

@@ -344,7 +344,160 @@ void wc_AesAsyncFree(Aes* aes)
#ifdef HAVE_AES_DECRYPT
#error nRF51 AES Hardware does not support decrypt
#endif /* HAVE_AES_DECRYPT */
#elif defined(WOLFSSL_ARMASM)
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
byte* keyPt = (byte*)aes->key;
word32 rounds = aes->rounds;
byte out[AES_BLOCK_SIZE];
byte* output = out;
byte* input = (byte*)inBlock;
/*
  AESE XORs the input with the round key,
  shifts the rows of the XORed result,
  then substitutes bytes on the shifted rows
*/
__asm__ __volatile__ (
"LD1 {v0.16b}, [%[CtrIn]], #16 \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"#subtract rounds done so far and see if should continue\n"
"MOV w12, %w[R] \n"
"SUB w12, w12, #10 \n"
"CBZ w12, final \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"SUB w12, w12, #2 \n"
"CBZ w12, final \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v2.16b \n"
"#Final AddRoundKey then store result \n"
"final: \n"
"LD1 {v1.16b}, [%[Key]], #16 \n"
"EOR v0.16b, v0.16b, v1.16b \n"
"ST1 {v0.16b}, [%[CtrOut]] \n"
:[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds)
:[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "r" (input), "0" (output)
: "cc", "memory", "w12"
);
XMEMCPY(outBlock, out, AES_BLOCK_SIZE);
return 0;
}
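The AESE/AESMC pairing above maps directly onto the ACLE crypto intrinsics; a minimal sketch of one full round, assuming <arm_neon.h> and a +crypto target:

    #include <arm_neon.h>

    /* One AES encrypt round: AESE = AddRoundKey + ShiftRows + SubBytes,
     * AESMC = MixColumns. The final round skips AESMC, as in the asm above. */
    static uint8x16_t aes_enc_round(uint8x16_t state, uint8x16_t roundKey)
    {
        return vaesmcq_u8(vaeseq_u8(state, roundKey));
    }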
#ifdef HAVE_AES_DECRYPT
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
byte* keyPt = (byte*)aes->key;
word32 rounds = aes->rounds;
byte out[AES_BLOCK_SIZE];
byte* output = out;
byte* input = (byte*)inBlock;
/*
  AESD XORs the input with the round key,
  applies inverse shift rows,
  then inverse substitute bytes
*/
__asm__ __volatile__ (
"LD1 {v0.16b}, [%[CtrIn]], #16 \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v3.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v4.16b \n"
"AESIMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v3.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v4.16b \n"
"AESIMC v0.16b, v0.16b \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"#subtract rounds done so far and see if should continue\n"
"MOV w12, %w[R] \n"
"SUB w12, w12, #10 \n"
"CBZ w12, finalDec \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"SUB w12, w12, #2 \n"
"CBZ w12, finalDec \n"
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v1.16b \n"
"AESIMC v0.16b, v0.16b \n"
"AESD v0.16b, v2.16b \n"
"#Final AddRoundKey then store result \n"
"finalDec: \n"
"LD1 {v1.16b}, [%[Key]], #16 \n"
"EOR v0.16b, v0.16b, v1.16b \n"
"ST1 {v0.4s}, [%[CtrOut]] \n"
:[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
:[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input), "0" (output)
: "cc", "memory", "w12"
);
XMEMCPY(outBlock, out, AES_BLOCK_SIZE);
return 0;
}
#endif /* HAVE_AES_DECRYPT */
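The decrypt rounds are the mirror image; the same sketch with the inverse intrinsics:

    /* One AES decrypt round: AESD = AddRoundKey + InvShiftRows + InvSubBytes,
     * AESIMC = InvMixColumns. */
    static uint8x16_t aes_dec_round(uint8x16_t state, uint8x16_t roundKey)
    {
        return vaesimcq_u8(vaesdq_u8(state, roundKey));
    }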
#else
/* using wolfCrypt software AES implementation */
@@ -1533,7 +1686,6 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
}
#endif /* HAVE_AES_DECRYPT */
#endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT */
#endif /* NEED_AES_TABLES */
@@ -1678,6 +1830,196 @@ static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
{
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
}
#elif defined(WOLFSSL_ARMASM)
static const byte rcon[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
/* Similar to the wolfSSL software implementation of expanding the AES key.
 * The table lookups were changed to use the hardware AESE instruction
 * instead. The decryption key schedule was altered to match. */
int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
{
word32 temp, *rk = aes->key;
unsigned int i = 0;
#if defined(AES_MAX_KEY_SIZE)
const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
#endif
if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
return BAD_FUNC_ARG;
#if defined(AES_MAX_KEY_SIZE)
/* Check key length */
if (keylen > max_key_len) {
return BAD_FUNC_ARG;
}
#endif
#ifdef WOLFSSL_AES_COUNTER
aes->left = 0;
#endif /* WOLFSSL_AES_COUNTER */
aes->rounds = keylen/4 + 6;
XMEMCPY(rk, userKey, keylen);
switch(keylen)
{
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128
case 16:
while (1)
{
temp = rk[3];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[4] = rk[0] ^ temp ^ rcon[i];
rk[5] = rk[4] ^ rk[1];
rk[6] = rk[5] ^ rk[2];
rk[7] = rk[6] ^ rk[3];
if (++i == 10)
break;
rk += 4;
}
break;
#endif /* 128 */
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192
case 24:
/* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
while (1)
{
temp = rk[5];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
rk[ 7] = rk[ 1] ^ rk[ 6];
rk[ 8] = rk[ 2] ^ rk[ 7];
rk[ 9] = rk[ 3] ^ rk[ 8];
if (++i == 8)
break;
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
rk += 6;
}
break;
#endif /* 192 */
#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256
case 32:
while (1)
{
temp = rk[7];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
temp = rotrFixed(temp, 8);
rk[ 8] = rk[ 0] ^ temp ^ rcon[i];
rk[ 9] = rk[ 1] ^ rk[ 8];
rk[10] = rk[ 2] ^ rk[ 9];
rk[11] = rk[ 3] ^ rk[10];
if (++i == 7)
break;
temp = rk[11];
/* get table value from hardware */
__asm__ volatile (
"DUP v1.4s, %w[in] \n"
"MOVI v0.16b, #0 \n"
"AESE v0.16b, v1.16b \n"
"UMOV %w[out], v0.4s[0] \n"
: [out] "=r"(temp)
: [in] "r" (temp)
: "cc", "memory", "v0", "v1"
);
rk[12] = rk[ 4] ^ temp;
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
rk += 8;
}
break;
#endif /* 256 */
default:
return BAD_FUNC_ARG;
}
if (dir == AES_DECRYPTION)
{
#ifdef HAVE_AES_DECRYPT
unsigned int j;
rk = aes->key;
/* invert the order of the round keys: */
for (i = 0, j = 4 * aes->rounds; i < j; i += 4, j -= 4) {
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
}
/* apply the inverse MixColumns transform to all round keys but the
first and the last: */
for (i = 1; i < aes->rounds; i++) {
rk += 4;
__asm__ volatile (
"LD1 {v0.16b}, [%[in]] \n"
"AESIMC v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]]\n"
: [out] "=r" (rk)
: [in] "0" (rk)
: "cc", "memory", "v0"
);
}
#else
WOLFSSL_MSG("AES Decryption not compiled in");
return BAD_FUNC_ARG;
#endif /* HAVE_AES_DECRYPT */
}
return wc_AesSetIV(aes, iv);
}
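The DUP/AESE/UMOV sequence above is the hardware "table lookup": AESE computes SubBytes(ShiftRows(state EOR key)), and with the word duplicated into all four lanes ShiftRows only moves identical bytes, so lane 0 of the result is SubWord() of the input word. A sketch of the same trick with intrinsics (SubWordHw is an illustrative name):

    #include <arm_neon.h>

    /* SubWord() via AESE: with x in every lane, ShiftRows is a no-op and
     * AESE(0, dup(x)) reduces to byte-wise SubBytes; lane 0 is SubWord(x). */
    static word32 SubWordHw(word32 x)
    {
        uint8x16_t dup  = vreinterpretq_u8_u32(vdupq_n_u32(x));
        uint8x16_t zero = vdupq_n_u8(0);
        return vgetq_lane_u32(vreinterpretq_u32_u8(vaeseq_u8(zero, dup)), 0);
    }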
#if defined(WOLFSSL_AES_DIRECT)
int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
{
return wc_AesSetKey(aes, userKey, keylen, iv, dir);
}
#endif
#else
static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen,
const byte* iv, int dir)
@@ -2859,7 +3201,7 @@ static INLINE void IncrementGcmCounter(byte* inOutCtr)
}
#if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(WOLFSSL_ARMASM)
static INLINE void FlattenSzInBits(byte* buf, word32 sz)
{
@@ -2943,6 +3285,20 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
if (ret == 0) {
wc_AesEncrypt(aes, iv, aes->H);
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
{
word32* pt = (word32*)aes->H;
__asm__ volatile (
"LD1 {v0.16b}, [%[h]] \n"
"RBIT v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]] \n"
: [out] "=r" (pt)
: [h] "0" (pt)
: "cc", "memory"
);
return ret; /* no need to generate GCM_TABLE */
}
#endif
#ifdef GCM_TABLE
GenerateM0(aes);
#endif /* GCM_TABLE */
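GHASH is defined on bit-reflected field elements while PMULL multiplies least-significant-bit first, so H is bit-reversed once at key setup and consumed in that order by GMULT. With intrinsics the reflection would be roughly (a sketch, assuming vrbitq_u8 is available on the target):

    uint8x16_t h = vld1q_u8((uint8_t*)aes->H);
    vst1q_u8((uint8_t*)aes->H, vrbitq_u8(h)); /* bit-reverse each byte of H */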
@@ -3379,8 +3735,118 @@ static int AES_GCM_decrypt(const unsigned char *in,
#endif /* WOLFSSL_AESNI */
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
/* PMULL and RBIT are available on AArch64 only */
/* Use ARM hardware for the polynomial multiply */
static void GMULT(byte* X, byte* Y)
{
word32* Xpt = (word32*)X;
word32* Ypt = (word32*)Y;
__asm__ volatile (
"LD1 {v0.16b}, [%[inX]] \n"
"LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
"RBIT v0.16b, v0.16b \n"
/* Algorithm 1 from Intel GCM white paper.
"Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
*/
"PMULL v3.1q, v0.1d, v1.1d \n" /* a0 * b0 = C */
"PMULL2 v4.1q, v0.2d, v1.2d \n" /* a1 * b1 = D */
"EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
"PMULL v6.1q, v0.1d, v5.1d \n" /* a0 * b1 = E */
"PMULL2 v5.1q, v0.2d, v5.2d \n" /* a1 * b0 = F */
"#Set a register to all 0s using EOR \n"
"EOR v7.16b, v7.16b, v7.16b \n"
"EOR v5.16b, v5.16b, v6.16b \n" /* F ^ E */
"EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
"EOR v3.16b, v3.16b, v6.16b \n" /* low 128 bits in v3 */
"EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
"EOR v4.16b, v4.16b, v6.16b \n" /* high 128 bits in v4 */
/* Based from White Paper "Implementing GCM on ARMv8"
by Conrado P.L. Gouvea and Julio Lopez
reduction on 256bit value using Algorithm 5 */
"MOVI v8.16b, #0x87 \n"
"USHR v8.2d, v8.2d, #56 \n"
/* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/
"PMULL2 v5.1q, v4.2d, v8.2d \n"
"EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
"EOR v4.16b, v4.16b, v6.16b \n"
"EXT v6.16b, v7.16b, v5.16b, #8 \n"
"EOR v3.16b, v3.16b, v6.16b \n"
"PMULL v5.1q, v4.1d, v8.1d \n"
"EOR v4.16b, v3.16b, v5.16b \n"
"RBIT v4.16b, v4.16b \n"
"STR q4, [%[out]] \n"
: [out] "=r" (Xpt), "=r" (Ypt)
: [inX] "0" (Xpt), [inY] "1" (Ypt)
: "cc", "memory", "v3", "v4", "v5", "v6", "v7", "v8"
);
}
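The four PMULL/PMULL2 products above form the schoolbook 128x128-bit carry-less multiply from the cited white papers; the same four products with intrinsics would look roughly like this (a sketch, assuming poly64/poly128 support):

    #include <arm_neon.h>

    /* the four 64x64 carry-less products of a 128x128 multiply */
    static void clmul128(poly64x2_t a, poly64x2_t b, poly128_t r[4])
    {
        r[0] = vmull_p64(vgetq_lane_p64(a, 0), vgetq_lane_p64(b, 0)); /* C = a0*b0 */
        r[1] = vmull_high_p64(a, b);                                  /* D = a1*b1 */
        r[2] = vmull_p64(vgetq_lane_p64(a, 0), vgetq_lane_p64(b, 1)); /* E = a0*b1 */
        r[3] = vmull_p64(vgetq_lane_p64(a, 1), vgetq_lane_p64(b, 0)); /* F = a1*b0 */
        /* GMULT folds E^F into the middle bits, then reduces modulo the
         * GCM polynomial x^128 + x^7 + x^2 + x + 1 */
    }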
/* Currently a copy of the GCM_SMALL wolfSSL version. Duplicated and kept
 * separate for future optimizations. */
static void GHASH(Aes* aes, const byte* a, word32 aSz,
const byte* c, word32 cSz, byte* s, word32 sSz)
{
byte x[AES_BLOCK_SIZE];
byte scratch[AES_BLOCK_SIZE];
word32 blocks, partial;
byte* h = aes->H;
XMEMSET(x, 0, AES_BLOCK_SIZE);
/* Hash in A, the Additional Authentication Data */
if (aSz != 0 && a != NULL) {
blocks = aSz / AES_BLOCK_SIZE;
partial = aSz % AES_BLOCK_SIZE;
while (blocks--) {
xorbuf(x, a, AES_BLOCK_SIZE);
GMULT(x, h);
a += AES_BLOCK_SIZE;
}
if (partial != 0) {
XMEMSET(scratch, 0, AES_BLOCK_SIZE);
XMEMCPY(scratch, a, partial);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
}
}
/* Hash in C, the Ciphertext */
if (cSz != 0 && c != NULL) {
blocks = cSz / AES_BLOCK_SIZE;
partial = cSz % AES_BLOCK_SIZE;
while (blocks--) {
xorbuf(x, c, AES_BLOCK_SIZE);
GMULT(x, h);
c += AES_BLOCK_SIZE;
}
if (partial != 0) {
XMEMSET(scratch, 0, AES_BLOCK_SIZE);
XMEMCPY(scratch, c, partial);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
}
}
/* Hash in the lengths of A and C in bits */
FlattenSzInBits(&scratch[0], aSz);
FlattenSzInBits(&scratch[8], cSz);
xorbuf(x, scratch, AES_BLOCK_SIZE);
GMULT(x, h);
/* Copy the result into s. */
XMEMCPY(s, x, sSz);
}
/* not using ARMASM for multiplication */
#elif defined(GCM_SMALL)
static void GMULT(byte* X, byte* Y)
{
byte Z[AES_BLOCK_SIZE];

View File

@@ -44,7 +44,8 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
wolfcrypt/src/port/ti/ti-hash.c \
wolfcrypt/src/port/ti/ti-ccm.c \
wolfcrypt/src/port/pic32/pic32mz-hash.c \
wolfcrypt/src/port/nrf51.c \
wolfcrypt/src/port/arm/armv8-sha256.c
if BUILD_CAVIUM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/cavium/cavium_nitrox.c

View File

@@ -0,0 +1,608 @@
/* armv8-sha256.c
*
* Copyright (C) 2006-2016 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <wolfssl/wolfcrypt/settings.h>
#if !defined(NO_SHA256) && defined(WOLFSSL_ARMASM)
#include <wolfssl/wolfcrypt/sha256.h>
#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#ifdef NO_INLINE
#include <wolfssl/wolfcrypt/misc.h>
#else
#define WOLFSSL_MISC_INCLUDED
#include <wolfcrypt/src/misc.c>
#endif
#ifndef WOLFSSL_HAVE_MIN
#define WOLFSSL_HAVE_MIN
static INLINE word32 min(word32 a, word32 b)
{
return a > b ? b : a;
}
#endif /* WOLFSSL_HAVE_MIN */
#if !defined (ALIGN32)
#if defined (__GNUC__)
#define ALIGN32 __attribute__ ( (aligned (32)))
#elif defined(_MSC_VER)
/* disable align warning, we want alignment ! */
#pragma warning(disable: 4324)
#define ALIGN32 __declspec (align (32))
#else
#define ALIGN32
#endif
#endif
static const ALIGN32 word32 K[64] = {
0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};
int wc_InitSha256(Sha256* sha256)
{
int ret = 0;
sha256->digest[0] = 0x6A09E667L;
sha256->digest[1] = 0xBB67AE85L;
sha256->digest[2] = 0x3C6EF372L;
sha256->digest[3] = 0xA54FF53AL;
sha256->digest[4] = 0x510E527FL;
sha256->digest[5] = 0x9B05688CL;
sha256->digest[6] = 0x1F83D9ABL;
sha256->digest[7] = 0x5BE0CD19L;
sha256->buffLen = 0;
sha256->loLen = 0;
sha256->hiLen = 0;
return ret;
}
static INLINE void AddLength(Sha256* sha256, word32 len)
{
word32 tmp = sha256->loLen;
if ( (sha256->loLen += len) < tmp)
sha256->hiLen++; /* carry low to high */
}
/* ARMv8 hardware acceleration */
int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
{
/* do block size increments */
byte* local = (byte*)sha256->buffer;
while (len) {
word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
XMEMCPY(&local[sha256->buffLen], data, add);
sha256->buffLen += add;
data += add;
len -= add;
if (sha256->buffLen == SHA256_BLOCK_SIZE) {
word32* Kpt = (word32*)K;
word32* bufferPt = sha256->buffer;
word32* digestPt = sha256->digest;
__asm__ volatile (
"#load in message and schedual updates \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
"MOV v0.16b, v4.16b \n"
"MOV v1.16b, v5.16b \n"
"REV32 v0.16b, v0.16b \n"
"MOV v2.16b, v6.16b \n"
"REV32 v1.16b, v1.16b \n"
"MOV v3.16b, v7.16b \n"
"REV32 v2.16b, v2.16b \n"
"REV32 v3.16b, v3.16b \n"
"MOV v4.16b, v0.16b \n"
"MOV v5.16b, v1.16b \n"
"SHA256SU0 v4.4s, v1.4s \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"ADD v1.4s, v1.4s, v17.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n"
"ADD v3.4s, v3.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n"
"MOV v16.16b, v20.16b \n"
"MOV v17.16b, v21.16b \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v0.4s \n"
"SHA256H2 q17, q18, v0.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v1.4s \n"
"SHA256H2 q17, q18, v1.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v2.4s \n"
"SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v3.4s \n"
"SHA256H2 q17, q18, v3.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v4.4s \n"
"SHA256H2 q17, q18, v4.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n"
"ADD v16.4s, v16.4s, v20.4s \n"
"ADD v17.4s, v17.4s, v21.4s \n"
"STP q16, q17, [%[out]] \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt)
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21"
);
AddLength(sha256, SHA256_BLOCK_SIZE);
sha256->buffLen = 0;
}
}
return 0;
}
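Each MOV/SHA256H/SHA256H2 triplet above performs four SHA-256 rounds; the copy into v18 is needed because SHA256H overwrites the ABCD half of the state that SHA256H2 still consumes. With intrinsics the triplet is roughly (a sketch, assuming the vsha256* intrinsics):

    #include <arm_neon.h>

    /* four SHA-256 rounds; wk = schedule words with K constants added */
    static void sha256_4rounds(uint32x4_t* abcd, uint32x4_t* efgh, uint32x4_t wk)
    {
        uint32x4_t abcd_prev = *abcd;                 /* MOV v18.16b, v16.16b */
        *abcd = vsha256hq_u32(*abcd, *efgh, wk);      /* SHA256H  q16, q17    */
        *efgh = vsha256h2q_u32(*efgh, abcd_prev, wk); /* SHA256H2 q17, q18    */
    }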
int wc_Sha256Final(Sha256* sha256, byte* hash)
{
byte* local = (byte*)sha256->buffer;
word32* Kpt = (word32*)K;
word32* bufferPt = sha256->buffer;
word32* digestPt = sha256->digest;
word32* hashPt = (word32*)hash;
AddLength(sha256, sha256->buffLen); /* before adding pads */
local[sha256->buffLen++] = 0x80; /* add 1 */
/* pad with zeros */
if (sha256->buffLen > SHA256_PAD_SIZE) {
XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
bufferPt = sha256->buffer;
digestPt = sha256->digest;
Kpt = (word32*)K;
__asm__ volatile (
"#load in message and schedual updates \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
"MOV v0.16b, v4.16b \n"
"MOV v1.16b, v5.16b \n"
"REV32 v0.16b, v0.16b \n"
"MOV v2.16b, v6.16b \n"
"REV32 v1.16b, v1.16b \n"
"MOV v3.16b, v7.16b \n"
"REV32 v2.16b, v2.16b \n"
"REV32 v3.16b, v3.16b \n"
"MOV v4.16b, v0.16b \n"
"MOV v5.16b, v1.16b \n"
"SHA256SU0 v4.4s, v1.4s \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"ADD v1.4s, v1.4s, v17.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n"
"ADD v3.4s, v3.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n"
"MOV v16.16b, v20.16b \n"
"MOV v17.16b, v21.16b \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v0.4s \n"
"SHA256H2 q17, q18, v0.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v1.4s \n"
"SHA256H2 q17, q18, v1.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v2.4s \n"
"SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v3.4s \n"
"SHA256H2 q17, q18, v3.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v4.4s \n"
"SHA256H2 q17, q18, v4.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n"
"ADD v16.4s, v16.4s, v20.4s \n"
"ADD v17.4s, v17.4s, v21.4s \n"
"STP q16, q17, [%[out]] \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt)
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21"
);
sha256->buffLen = 0;
}
XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
/* put lengths in bits */
sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
(sha256->hiLen << 3);
sha256->loLen = sha256->loLen << 3;
/* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
bufferPt = sha256->buffer;
__asm__ volatile (
"LD1 {v0.16b}, [%[in]] \n"
"REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]], #16 \n"
"LD1 {v0.16b}, [%[in]] \n"
"REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]], #16 \n"
"LD1 {v0.16b}, [%[in]] \n"
"REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]], #16 \n"
"LD1 {v0.16b}, [%[in]] \n"
"REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]] \n"
: [out] "=r" (bufferPt)
: [in] "0" (bufferPt)
: "cc"
);
#endif
/* ! length ordering dependent on digest endian type ! */
XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
sizeof(word32));
bufferPt = sha256->buffer;
digestPt = sha256->digest;
Kpt = (word32*)K;
__asm__ volatile (
"#load in message and schedual updates \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
"MOV v0.16b, v4.16b \n"
"MOV v1.16b, v5.16b \n"
"MOV v2.16b, v6.16b \n"
"MOV v3.16b, v7.16b \n"
"SHA256SU0 v4.4s, v1.4s \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"ADD v1.4s, v1.4s, v17.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n"
"ADD v3.4s, v3.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n"
"MOV v16.16b, v20.16b \n"
"MOV v17.16b, v21.16b \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v0.4s \n"
"SHA256H2 q17, q18, v0.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v1.4s \n"
"SHA256H2 q17, q18, v1.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v2.4s \n"
"SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v3.4s \n"
"SHA256H2 q17, q18, v3.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v4.4s \n"
"SHA256H2 q17, q18, v4.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n"
"ADD v16.4s, v16.4s, v20.4s \n"
"ADD v17.4s, v17.4s, v21.4s \n"
"STP q16, q17, [%[out]] \n"
"#Store value as hash output \n"
#if defined(LITTLE_ENDIAN_ORDER)
"REV32 v16.16b, v16.16b \n"
#endif
"ST1 {v16.16b}, [%[hashOut]], #16 \n"
#if defined(LITTLE_ENDIAN_ORDER)
"REV32 v17.16b, v17.16b \n"
#endif
"ST1 {v17.16b}, [%[hashOut]] \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt),
[hashOut] "=r" (hashPt)
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt), "3" (hashPt)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21"
);
return wc_InitSha256(sha256); /* reset state */
}
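The accelerated digest keeps the standard three-call API; typical use (a sketch; msg and msgSz are the caller's data):

    Sha256 sha256;
    byte   digest[SHA256_DIGEST_SIZE];

    wc_InitSha256(&sha256);
    wc_Sha256Update(&sha256, msg, msgSz);
    wc_Sha256Final(&sha256, digest); /* Final also resets the state */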
#endif /* !NO_SHA256 && WOLFSSL_ARMASM */

View File

@@ -54,13 +54,29 @@ enum {
#ifndef WOLFSSL_TI_HASH
#ifdef WOLFSSL_ARMASM /* slight performance increase with aligned memory */
#if !defined (ALIGN16)
#if defined (__GNUC__)
#define ALIGN16 __attribute__ ( (aligned (16)))
#elif defined(_MSC_VER)
/* disable align warning, we want alignment ! */
#pragma warning(disable: 4324)
#define ALIGN16 __declspec (align (16))
#else
#define ALIGN16
#endif
#endif
#else /* not using ARM ASM */
#define ALIGN16
#endif /* WOLFSSL_ARMASM */
/* Sha256 digest */
typedef struct Sha256 {
word32 buffLen; /* in bytes */
word32 loLen; /* length in bytes */
word32 hiLen; /* length in bytes */
ALIGN16 word32 digest[SHA256_DIGEST_SIZE / sizeof(word32)];
ALIGN16 word32 buffer[SHA256_BLOCK_SIZE / sizeof(word32)];
#ifdef WOLFSSL_PIC32MZ_HASH
pic32mz_desc desc ; /* Crypt Engine descriptor */
#endif