mirror of https://github.com/wolfSSL/wolfssl.git
Aarch64 Poly1305 ASM: Improve performance
Do as many multiplications as possible in base 64, rather than base 26, using normal integer registers.
pull/7859/head
parent
2a08d3001c
commit
3725594020
|
@ -53,137 +53,112 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
|
||||||
const unsigned char *m, size_t bytes)
|
const unsigned char *m, size_t bytes)
|
||||||
{
|
{
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
|
/* Check for zero bytes to do. */
|
||||||
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
|
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"BLO L_poly1305_aarch64_16_64_done_%= \n\t"
|
"BLO L_poly1305_aarch64_16_done_%= \n\t"
|
||||||
/* Load r and h */
|
|
||||||
"LDP x21, x23, %[ctx_r] \n\t"
|
"MOV x12, #1 \n\t"
|
||||||
"LDR w25, %[ctx_r_4] \n\t"
|
/* Load h */
|
||||||
"LDP x2, x4, %[ctx_h] \n\t"
|
"LDP w4, w5, [%[ctx_h], #0] \n\t"
|
||||||
"LDR w6, %[ctx_h_4] \n\t"
|
"LDP w6, w7, [%[ctx_h], #8] \n\t"
|
||||||
"LSR x22, x21, #32 \n\t"
|
"LDR w8, [%[ctx_h], #16] \n\t"
|
||||||
"LSR x24, x23, #32 \n\t"
|
/* Base 26 -> Base 64 */
|
||||||
"LSR x3, x2, #32 \n\t"
|
"ORR x4, x4, x5, LSL #26\n\t"
|
||||||
"LSR x5, x4, #32 \n\t"
|
"ORR x4, x4, x6, LSL #52\n\t"
|
||||||
"AND x21, x21, #0x3ffffff \n\t"
|
"LSR x5, x6, #12\n\t"
|
||||||
"AND x23, x23, #0x3ffffff \n\t"
|
"ORR x5, x5, x7, LSL #14\n\t"
|
||||||
"AND x2, x2, #0x3ffffff \n\t"
|
"ORR x5, x5, x8, LSL #40\n\t"
|
||||||
"AND x4, x4, #0x3ffffff \n\t"
|
"LSR x6, x8, #24\n\t"
|
||||||
/* s1 = r1 * 5; */
|
/* Load r */
|
||||||
/* s2 = r2 * 5; */
|
"LDP x8, x9, %[ctx_r64] \n\t"
|
||||||
/* s3 = r3 * 5; */
|
"SUB %[finished], x12, %[finished]\n\t"
|
||||||
/* s4 = r4 * 5; */
|
|
||||||
"MOV x15, #5 \n\t"
|
|
||||||
"CMP %[finished], #0 \n\t"
|
|
||||||
"MUL w7, w22, w15 \n\t"
|
|
||||||
"CSET %[finished], EQ \n\t"
|
|
||||||
"MUL w8, w23, w15 \n\t"
|
|
||||||
"LSL %[finished], %[finished], #24 \n\t"
|
|
||||||
"MUL w9, w24, w15 \n\t"
|
|
||||||
"MOV x14, #0x3ffffff \n\t"
|
|
||||||
"MUL w10, w25, w15 \n\t"
|
|
||||||
"\n"
|
"\n"
|
||||||
".align 2 \n\t"
|
".align 2 \n\t"
|
||||||
"L_poly1305_aarch64_16_64_loop_%=: \n\t"
|
"L_poly1305_aarch64_16_loop_%=: \n\t"
|
||||||
/* t0 = U8TO64(&m[0]); */
|
/* Load m */
|
||||||
/* t1 = U8TO64(&m[8]); */
|
"LDR x10, [%[m]] \n\t"
|
||||||
"LDP x16, x17, [%[m]], #16 \n\t"
|
"LDR x11, [%[m], 8] \n\t"
|
||||||
/* h0 += (U8TO32(m + 0)) & 0x3ffffff; */
|
/* Add m and !finished at bit 128. */
|
||||||
"AND x26, x16, #0x3ffffff \n\t"
|
"ADDS x4, x4, x10 \n\t"
|
||||||
"ADD x2, x2, x26 \n\t"
|
"ADCS x5, x5, x11 \n\t"
|
||||||
/* h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff; */
|
"ADC x6, x6, %[finished] \n\t"
|
||||||
"AND x26, x14, x16, LSR #26 \n\t"
|
|
||||||
"ADD x3, x3, x26 \n\t"
|
/* r * h */
|
||||||
/* h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff; */
|
/* r0 * h0 */
|
||||||
"EXTR x26, x17, x16, #52 \n\t"
|
"MUL x12, x8, x4\n\t"
|
||||||
"AND x26, x26, #0x3ffffff \n\t"
|
"UMULH x13, x8, x4\n\t"
|
||||||
"ADD x4, x4, x26 \n\t"
|
/* r0 * h1 */
|
||||||
/* h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff; */
|
"MUL x16, x8, x5\n\t"
|
||||||
"AND x26, x14, x17, LSR #14 \n\t"
|
"UMULH x14, x8, x5\n\t"
|
||||||
"ADD x5, x5, x26 \n\t"
|
/* r1 * h0 */
|
||||||
/* h4 += (U8TO32(m + 12) >> 8) | hibit; */
|
"MUL x15, x9, x4\n\t"
|
||||||
"ORR x17, %[finished], x17, LSR #40 \n\t"
|
"ADDS x13, x13, x16\n\t"
|
||||||
"ADD x6, x6, x17 \n\t"
|
"UMULH x17, x9, x4\n\t"
|
||||||
/* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
|
"ADC x14, x14, xzr\n\t"
|
||||||
/* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
|
"ADDS x13, x13, x15\n\t"
|
||||||
/* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
|
/* r0 * h2 */
|
||||||
/* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
|
"MUL x16, x8, x6\n\t"
|
||||||
/* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
|
"ADCS x14, x14, x17\n\t"
|
||||||
"MUL x16, x2, x21 \n\t"
|
"UMULH x17, x8, x6\n\t"
|
||||||
"MUL x17, x2, x22 \n\t"
|
"ADC x15, xzr, xzr\n\t"
|
||||||
"MUL x26, x2, x23 \n\t"
|
"ADDS x14, x14, x16\n\t"
|
||||||
"MUL x19, x2, x24 \n\t"
|
/* r1 * h1 */
|
||||||
"MUL x20, x2, x25 \n\t"
|
"MUL x16, x9, x5\n\t"
|
||||||
"MADD x16, x3, x10, x16 \n\t"
|
"ADC x15, x15, x17\n\t"
|
||||||
"MADD x17, x3, x21, x17 \n\t"
|
"UMULH x19, x9, x5\n\t"
|
||||||
"MADD x26, x3, x22, x26 \n\t"
|
"ADDS x14, x14, x16\n\t"
|
||||||
"MADD x19, x3, x23, x19 \n\t"
|
/* r1 * h2 */
|
||||||
"MADD x20, x3, x24, x20 \n\t"
|
"MUL x17, x9, x6\n\t"
|
||||||
"MADD x16, x4, x9, x16 \n\t"
|
"ADCS x15, x15, x19\n\t"
|
||||||
"MADD x17, x4, x10, x17 \n\t"
|
"UMULH x19, x9, x6\n\t"
|
||||||
"MADD x26, x4, x21, x26 \n\t"
|
"ADC x16, xzr, xzr\n\t"
|
||||||
"MADD x19, x4, x22, x19 \n\t"
|
"ADDS x15, x15, x17\n\t"
|
||||||
"MADD x20, x4, x23, x20 \n\t"
|
"ADC x16, x16, x19\n\t"
|
||||||
"MADD x16, x5, x8, x16 \n\t"
|
/* h' = x12, x13, x14, x15, x16 */
|
||||||
"MADD x17, x5, x9, x17 \n\t"
|
|
||||||
"MADD x26, x5, x10, x26 \n\t"
|
/* h' mod 2^130 - 5 */
|
||||||
"MADD x19, x5, x21, x19 \n\t"
|
/* Get top two bits from h[2]. */
|
||||||
"MADD x20, x5, x22, x20 \n\t"
|
"AND x6, x14, 3\n\t"
|
||||||
"MADD x16, x6, x7, x16 \n\t"
|
/* Get high bits from h[2]. */
|
||||||
"MADD x17, x6, x8, x17 \n\t"
|
"AND x14, x14, -4\n\t"
|
||||||
"MADD x26, x6, x9, x26 \n\t"
|
/* Add top bits * 4. */
|
||||||
"MADD x19, x6, x10, x19 \n\t"
|
"ADDS x4, x12, x14\n\t"
|
||||||
"MADD x20, x6, x21, x20 \n\t"
|
"ADCS x5, x13, x15\n\t"
|
||||||
/* d1 = d1 + d0 >> 26 */
|
"ADC x6, x6, x16\n\t"
|
||||||
/* d2 = d2 + d1 >> 26 */
|
/* Move down 2 bits. */
|
||||||
/* d3 = d3 + d2 >> 26 */
|
"EXTR x14, x15, x14, 2\n\t"
|
||||||
/* d4 = d4 + d3 >> 26 */
|
"EXTR x15, x16, x15, 2\n\t"
|
||||||
/* h0 = d0 & 0x3ffffff */
|
/* Add top bits. */
|
||||||
/* h1 = d1 & 0x3ffffff */
|
"ADDS x4, x4, x14\n\t"
|
||||||
/* h2 = d2 & 0x3ffffff */
|
"ADCS x5, x5, x15\n\t"
|
||||||
/* h0 = h0 + (d4 >> 26) * 5 */
|
"ADC x6, x6, xzr\n\t"
|
||||||
/* h1 = h1 + h0 >> 26 */
|
|
||||||
/* h3 = d3 & 0x3ffffff */
|
"SUBS %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]\n\t"
|
||||||
/* h4 = d4 & 0x3ffffff */
|
"ADD %[m], %[m], %[POLY1305_BLOCK_SIZE]\n\t"
|
||||||
/* h0 = h0 & 0x3ffffff */
|
"BGT L_poly1305_aarch64_16_loop_%=\n\t"
|
||||||
"ADD x17, x17, x16, LSR #26 \n\t"
|
|
||||||
"ADD x20, x20, x19, LSR #26 \n\t"
|
/* Base 64 -> Base 26 */
|
||||||
"AND x16, x16, #0x3ffffff \n\t"
|
"MOV x10, #0x3ffffff\n\t"
|
||||||
"LSR x2, x20, #26 \n\t"
|
"EXTR x8, x6, x5, #40\n\t"
|
||||||
"AND x19, x19, #0x3ffffff \n\t"
|
"AND x7, x10, x5, LSR #14\n\t"
|
||||||
"MADD x16, x2, x15, x16 \n\t"
|
"EXTR x6, x5, x4, #52\n\t"
|
||||||
"ADD x26, x26, x17, LSR #26 \n\t"
|
"AND x5, x10, x4, LSR #26\n\t"
|
||||||
"AND x17, x17, #0x3ffffff \n\t"
|
"AND x4, x4, x10\n\t"
|
||||||
"AND x20, x20, #0x3ffffff \n\t"
|
"AND x6, x6, x10\n\t"
|
||||||
"ADD x19, x19, x26, LSR #26 \n\t"
|
"AND x8, x8, x10\n\t"
|
||||||
"AND x4, x26, #0x3ffffff \n\t"
|
"STP w4, w5, [%[ctx_h], #0] \n\t"
|
||||||
"ADD x3, x17, x16, LSR #26 \n\t"
|
"STP w6, w7, [%[ctx_h], #8] \n\t"
|
||||||
"AND x2, x16, #0x3ffffff \n\t"
|
"STR w8, [%[ctx_h], #16] \n\t"
|
||||||
"ADD x6, x20, x19, LSR #26 \n\t"
|
|
||||||
"AND x5, x19, #0x3ffffff \n\t"
|
|
||||||
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
|
|
||||||
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
|
|
||||||
"BHS L_poly1305_aarch64_16_64_loop_%= \n\t"
|
|
||||||
/* Store h */
|
|
||||||
"ORR x2, x2, x3, LSL #32 \n\t"
|
|
||||||
"ORR x4, x4, x5, LSL #32 \n\t"
|
|
||||||
"STP x2, x4, %[ctx_h] \n\t"
|
|
||||||
"STR w6, %[ctx_h_4] \n\t"
|
|
||||||
"\n"
|
"\n"
|
||||||
".align 2 \n\t"
|
".align 2 \n\t"
|
||||||
"L_poly1305_aarch64_16_64_done_%=: \n\t"
|
"L_poly1305_aarch64_16_done_%=: \n\t"
|
||||||
: [ctx_h] "+m" (ctx->h[0]),
|
: [bytes] "+r" (bytes), [m] "+r" (m)
|
||||||
[ctx_h_4] "+m" (ctx->h[4]),
|
|
||||||
[bytes] "+r" (bytes),
|
|
||||||
[m] "+r" (m)
|
|
||||||
: [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
|
: [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
|
||||||
[ctx_r] "m" (ctx->r[0]),
|
[ctx_r64] "m" (ctx->r64[0]), [ctx_h] "r" (ctx->h),
|
||||||
[ctx_r_4] "m" (ctx->r[4]),
|
|
||||||
[finished] "r" ((word64)ctx->finished)
|
[finished] "r" ((word64)ctx->finished)
|
||||||
: "memory", "cc",
|
: "memory", "cc",
|
||||||
"w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w15",
|
"x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
|
||||||
"w21", "w22", "w23", "w24", "w25", "x2", "x3", "x4", "x5", "x6",
|
"x15", "x16", "x17", "x19"
|
||||||
"x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x19", "x20",
|
|
||||||
"x21", "x22", "x23", "x24", "x25", "x26"
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -910,151 +885,147 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||||
"LDP x10, x11, [%[key], #16] \n\t"
|
"LDP x10, x11, [%[key], #16] \n\t"
|
||||||
/* Load clamp */
|
/* Load clamp */
|
||||||
"LDP x12, x13, [%[clamp]] \n\t"
|
"LDP x12, x13, [%[clamp]] \n\t"
|
||||||
|
/* Save pad for later */
|
||||||
|
"STP x10, x11, [%[ctx_pad]] \n\t"
|
||||||
/* Apply clamp */
|
/* Apply clamp */
|
||||||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
|
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
|
||||||
"AND x8, x8, x12 \n\t"
|
"AND x8, x8, x12 \n\t"
|
||||||
"AND x9, x9, x13 \n\t"
|
"AND x9, x9, x13 \n\t"
|
||||||
"MOV x19, xzr \n\t"
|
"STP x8, x9, [%[ctx_r64]] \n\t"
|
||||||
"MOV x20, xzr \n\t"
|
/* 128-bits: Base 64 -> Base 26 */
|
||||||
"MOV x21, xzr \n\t"
|
"MOV x20, #0x3ffffff\n\t"
|
||||||
"MOV x22, xzr \n\t"
|
"LSR x15, x9, #40\n\t"
|
||||||
"MOV x23, xzr \n\t"
|
"AND x14, x20, x9, LSR #14\n\t"
|
||||||
"BFI x19, x8, #0, #26 \n\t"
|
"EXTR x13, x9, x8, #52\n\t"
|
||||||
"LSR x8, x8, #26 \n\t"
|
"AND x12, x20, x8, LSR #26\n\t"
|
||||||
"BFI x20, x8, #0, #26 \n\t"
|
"AND x11, x8, x20\n\t"
|
||||||
"LSR x8, x8, #26 \n\t"
|
"AND x13, x13, x20\n\t"
|
||||||
"BFI x21, x8, #0, #12 \n\t"
|
"AND x15, x15, x20\n\t"
|
||||||
"BFI x21, x9, #12, #14 \n\t"
|
"STP w11, w12, [%[ctx_r], #0] \n\t"
|
||||||
"LSR x9, x9, #14 \n\t"
|
"STP w13, w14, [%[ctx_r], #8] \n\t"
|
||||||
"BFI x22, x9, #0, #26 \n\t"
|
"STR w15, [%[ctx_r], #16] \n\t"
|
||||||
"LSR x9, x9, #26 \n\t"
|
|
||||||
"BFI x23, x9, #0, #24 \n\t"
|
|
||||||
/* Compute r^2 */
|
/* Compute r^2 */
|
||||||
/* r*5 */
|
/* r0 * r0 */
|
||||||
"MOV x8, #5 \n\t"
|
"MUL x12, x8, x8\n\t"
|
||||||
"MUL x24, x20, x8 \n\t"
|
"UMULH x13, x8, x8\n\t"
|
||||||
"MUL x25, x21, x8 \n\t"
|
/* 2 * r0 * r1 */
|
||||||
"MUL x26, x22, x8 \n\t"
|
"MUL x15, x8, x9\n\t"
|
||||||
"MUL x27, x23, x8 \n\t"
|
"UMULH x16, x8, x9\n\t"
|
||||||
/* d = r*r */
|
"ADDS x13, x13, x15\n\t"
|
||||||
/* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
|
"ADC x14, xzr, x16\n\t"
|
||||||
/* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
|
"ADDS x13, x13, x15\n\t"
|
||||||
/* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
|
"ADCS x14, x14, x16\n\t"
|
||||||
/* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
|
"ADC x15, xzr, xzr\n\t"
|
||||||
/* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
|
/* r1 * r1 */
|
||||||
"MUL x14, x19, x19 \n\t"
|
"MUL x16, x9, x9\n\t"
|
||||||
"MUL x15, x19, x20 \n\t"
|
"UMULH x17, x9, x9\n\t"
|
||||||
"MUL x16, x19, x21 \n\t"
|
"ADDS x14, x14, x16\n\t"
|
||||||
"MUL x17, x19, x22 \n\t"
|
"ADC x15, x15, x17\n\t"
|
||||||
"MUL x7, x19, x23 \n\t"
|
|
||||||
"MADD x14, x20, x27, x14 \n\t"
|
|
||||||
"MADD x15, x20, x19, x15 \n\t"
|
|
||||||
"MADD x16, x20, x20, x16 \n\t"
|
|
||||||
"MADD x17, x20, x21, x17 \n\t"
|
|
||||||
"MADD x7, x20, x22, x7 \n\t"
|
|
||||||
"MADD x14, x21, x26, x14 \n\t"
|
|
||||||
"MADD x15, x21, x27, x15 \n\t"
|
|
||||||
"MADD x16, x21, x19, x16 \n\t"
|
|
||||||
"MADD x17, x21, x20, x17 \n\t"
|
|
||||||
"MADD x7, x21, x21, x7 \n\t"
|
|
||||||
"MADD x14, x22, x25, x14 \n\t"
|
|
||||||
"MADD x15, x22, x26, x15 \n\t"
|
|
||||||
"MADD x16, x22, x27, x16 \n\t"
|
|
||||||
"MADD x17, x22, x19, x17 \n\t"
|
|
||||||
"MADD x7, x22, x20, x7 \n\t"
|
|
||||||
"MADD x14, x23, x24, x14 \n\t"
|
|
||||||
"MADD x15, x23, x25, x15 \n\t"
|
|
||||||
"MADD x16, x23, x26, x16 \n\t"
|
|
||||||
"MADD x17, x23, x27, x17 \n\t"
|
|
||||||
"MADD x7, x23, x19, x7 \n\t"
|
|
||||||
/* r_2 = r^2 % P */
|
/* r_2 = r^2 % P */
|
||||||
"ADD x15, x15, x14, LSR #26 \n\t"
|
/* Get top two bits from r^2[2]. */
|
||||||
"ADD x7, x7, x17, LSR #26 \n\t"
|
"AND x10, x14, 3\n\t"
|
||||||
"AND x14, x14, #0x3ffffff \n\t"
|
/* Get high bits from r^2[2]. */
|
||||||
"LSR x9, x7, #26 \n\t"
|
"AND x14, x14, -4\n\t"
|
||||||
"AND x17, x17, #0x3ffffff \n\t"
|
/* Add top bits * 4. */
|
||||||
"MADD x14, x9, x8, x14 \n\t"
|
"ADDS x8, x12, x14\n\t"
|
||||||
"ADD x16, x16, x15, LSR #26 \n\t"
|
"ADCS x9, x13, x15\n\t"
|
||||||
"AND x15, x15, #0x3ffffff \n\t"
|
"ADC x10, x10, xzr\n\t"
|
||||||
"AND x7, x7, #0x3ffffff \n\t"
|
/* Move down 2 bits. */
|
||||||
"ADD x17, x17, x16, LSR #26 \n\t"
|
"EXTR x14, x15, x14, 2\n\t"
|
||||||
"AND x16, x16, #0x3ffffff \n\t"
|
"LSR x15, x15, 2\n\t"
|
||||||
"ADD x15, x15, x14, LSR #26 \n\t"
|
/* Add top bits. */
|
||||||
"AND x14, x14, #0x3ffffff \n\t"
|
"ADDS x8, x8, x14\n\t"
|
||||||
"ADD x7, x7, x17, LSR #26 \n\t"
|
"ADCS x9, x9, x15\n\t"
|
||||||
"AND x17, x17, #0x3ffffff \n\t"
|
"ADC x10, x10, xzr\n\t"
|
||||||
/* Store r */
|
/* 130-bits: Base 64 -> Base 26 */
|
||||||
"ORR x19, x19, x20, LSL #32 \n\t"
|
"EXTR x15, x10, x9, #40\n\t"
|
||||||
"ORR x21, x21, x22, LSL #32 \n\t"
|
"AND x14, x20, x9, LSR #14\n\t"
|
||||||
"STP x19, x21, [%[ctx_r]] \n\t"
|
"EXTR x13, x9, x8, #52\n\t"
|
||||||
"STR w23, [%[ctx_r], #16] \n\t"
|
"AND x12, x20, x8, LSR #26\n\t"
|
||||||
"MOV x8, #5 \n\t"
|
"AND x11, x8, x20\n\t"
|
||||||
"MUL x24, x15, x8 \n\t"
|
"AND x13, x13, x20\n\t"
|
||||||
"MUL x25, x16, x8 \n\t"
|
"AND x15, x15, x20\n\t"
|
||||||
"MUL x26, x17, x8 \n\t"
|
|
||||||
"MUL x27, x7, x8 \n\t"
|
|
||||||
/* Compute r^4 */
|
|
||||||
/* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
|
|
||||||
/* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
|
|
||||||
/* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
|
|
||||||
/* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
|
|
||||||
/* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
|
|
||||||
"MUL x19, x14, x14 \n\t"
|
|
||||||
"MUL x20, x14, x15 \n\t"
|
|
||||||
"MUL x21, x14, x16 \n\t"
|
|
||||||
"MUL x22, x14, x17 \n\t"
|
|
||||||
"MUL x23, x14, x7 \n\t"
|
|
||||||
"MADD x19, x15, x27, x19 \n\t"
|
|
||||||
"MADD x20, x15, x14, x20 \n\t"
|
|
||||||
"MADD x21, x15, x15, x21 \n\t"
|
|
||||||
"MADD x22, x15, x16, x22 \n\t"
|
|
||||||
"MADD x23, x15, x17, x23 \n\t"
|
|
||||||
"MADD x19, x16, x26, x19 \n\t"
|
|
||||||
"MADD x20, x16, x27, x20 \n\t"
|
|
||||||
"MADD x21, x16, x14, x21 \n\t"
|
|
||||||
"MADD x22, x16, x15, x22 \n\t"
|
|
||||||
"MADD x23, x16, x16, x23 \n\t"
|
|
||||||
"MADD x19, x17, x25, x19 \n\t"
|
|
||||||
"MADD x20, x17, x26, x20 \n\t"
|
|
||||||
"MADD x21, x17, x27, x21 \n\t"
|
|
||||||
"MADD x22, x17, x14, x22 \n\t"
|
|
||||||
"MADD x23, x17, x15, x23 \n\t"
|
|
||||||
"MADD x19, x7, x24, x19 \n\t"
|
|
||||||
"MADD x20, x7, x25, x20 \n\t"
|
|
||||||
"MADD x21, x7, x26, x21 \n\t"
|
|
||||||
"MADD x22, x7, x27, x22 \n\t"
|
|
||||||
"MADD x23, x7, x14, x23 \n\t"
|
|
||||||
/* r^4 % P */
|
|
||||||
"ADD x20, x20, x19, LSR #26 \n\t"
|
|
||||||
"ADD x23, x23, x22, LSR #26 \n\t"
|
|
||||||
"AND x19, x19, #0x3ffffff \n\t"
|
|
||||||
"LSR x9, x23, #26 \n\t"
|
|
||||||
"AND x22, x22, #0x3ffffff \n\t"
|
|
||||||
"MADD x19, x9, x8, x19 \n\t"
|
|
||||||
"ADD x21, x21, x20, LSR #26 \n\t"
|
|
||||||
"AND x20, x20, #0x3ffffff \n\t"
|
|
||||||
"AND x23, x23, #0x3ffffff \n\t"
|
|
||||||
"ADD x22, x22, x21, LSR #26 \n\t"
|
|
||||||
"AND x21, x21, #0x3ffffff \n\t"
|
|
||||||
"ADD x20, x20, x19, LSR #26 \n\t"
|
|
||||||
"AND x19, x19, #0x3ffffff \n\t"
|
|
||||||
"ADD x23, x23, x22, LSR #26 \n\t"
|
|
||||||
"AND x22, x22, #0x3ffffff \n\t"
|
|
||||||
/* Store r^2 */
|
/* Store r^2 */
|
||||||
"ORR x14, x14, x15, LSL #32 \n\t"
|
"STP w11, w12, [%[ctx_r_2], #0] \n\t"
|
||||||
"ORR x16, x16, x17, LSL #32 \n\t"
|
"STP w13, w14, [%[ctx_r_2], #8] \n\t"
|
||||||
"STP x14, x16, [%[ctx_r_2]] \n\t"
|
"STR w15, [%[ctx_r_2], #16] \n\t"
|
||||||
"STR w7, [%[ctx_r_2], #16] \n\t"
|
|
||||||
|
/* Compute r^4 */
|
||||||
|
/* r0 * r0 */
|
||||||
|
"MUL x12, x8, x8\n\t"
|
||||||
|
"UMULH x13, x8, x8\n\t"
|
||||||
|
/* 2 * r0 * r1 */
|
||||||
|
"MUL x15, x8, x9\n\t"
|
||||||
|
"UMULH x16, x8, x9\n\t"
|
||||||
|
"ADDS x13, x13, x15\n\t"
|
||||||
|
"ADC x14, xzr, x16\n\t"
|
||||||
|
"ADDS x13, x13, x15\n\t"
|
||||||
|
"ADCS x14, x14, x16\n\t"
|
||||||
|
"ADC x15, xzr, xzr\n\t"
|
||||||
|
/* 2 * r0 * r2 */
|
||||||
|
"MUL x16, x8, x10\n\t"
|
||||||
|
"UMULH x17, x8, x10\n\t"
|
||||||
|
"ADDS x14, x14, x16\n\t"
|
||||||
|
"ADC x15, x15, x17\n\t"
|
||||||
|
"ADDS x14, x14, x16\n\t"
|
||||||
|
"ADC x15, x15, x17\n\t"
|
||||||
|
/* r1 * r1 */
|
||||||
|
"MUL x16, x9, x9\n\t"
|
||||||
|
"UMULH x17, x9, x9\n\t"
|
||||||
|
"ADDS x14, x14, x16\n\t"
|
||||||
|
"ADCS x15, x15, x17\n\t"
|
||||||
|
"ADC x16, xzr, xzr\n\t"
|
||||||
|
/* 2 * r1 * r2 */
|
||||||
|
"MUL x17, x9, x10\n\t"
|
||||||
|
"UMULH x19, x9, x10\n\t"
|
||||||
|
"ADDS x15, x15, x17\n\t"
|
||||||
|
"ADC x16, x16, x19\n\t"
|
||||||
|
"ADDS x15, x15, x17\n\t"
|
||||||
|
"ADC x16, x16, x19\n\t"
|
||||||
|
/* r2 * r2 */
|
||||||
|
"MUL x17, x10, x10\n\t"
|
||||||
|
"ADD x16, x16, x17\n\t"
|
||||||
|
/* r_4 = r^4 % P */
|
||||||
|
/* Get top two bits from r^4[2]. */
|
||||||
|
"AND x10, x14, 3\n\t"
|
||||||
|
/* Get high bits from r^4[2]. */
|
||||||
|
"AND x14, x14, -4\n\t"
|
||||||
|
/* Add top bits * 4. */
|
||||||
|
"ADDS x8, x12, x14\n\t"
|
||||||
|
"ADCS x9, x13, x15\n\t"
|
||||||
|
"ADC x10, x10, x16\n\t"
|
||||||
|
/* Move down 2 bits. */
|
||||||
|
"EXTR x14, x15, x14, 2\n\t"
|
||||||
|
"EXTR x15, x16, x15, 2\n\t"
|
||||||
|
"LSR x16, x16, 2\n\t"
|
||||||
|
/* Add top bits. */
|
||||||
|
"ADDS x8, x8, x14\n\t"
|
||||||
|
"ADCS x9, x9, x15\n\t"
|
||||||
|
"ADC x10, x10, x16\n\t"
|
||||||
|
/* Top again as it was 260 bits mod less than 130 bits. */
|
||||||
|
"AND x11, x10, -4\n\t"
|
||||||
|
"AND x10, x10, 3\n\t"
|
||||||
|
"ADD x11, x11, x11, LSR #2\n\t"
|
||||||
|
"ADDS x8, x8, x11\n\t"
|
||||||
|
"ADCS x9, x9, xzr\n\t"
|
||||||
|
"ADC x10, x10, xzr\n\t"
|
||||||
|
/* 130-bits: Base 64 -> Base 26 */
|
||||||
|
"EXTR x15, x10, x9, #40\n\t"
|
||||||
|
"AND x14, x20, x9, LSR #14\n\t"
|
||||||
|
"EXTR x13, x9, x8, #52\n\t"
|
||||||
|
"AND x12, x20, x8, LSR #26\n\t"
|
||||||
|
"AND x11, x8, x20\n\t"
|
||||||
|
"AND x13, x13, x20\n\t"
|
||||||
|
"AND x15, x15, x20\n\t"
|
||||||
/* Store r^4 */
|
/* Store r^4 */
|
||||||
"ORR x19, x19, x20, LSL #32 \n\t"
|
"STP w11, w12, [%[ctx_r_4], #0] \n\t"
|
||||||
"ORR x21, x21, x22, LSL #32 \n\t"
|
"STP w13, w14, [%[ctx_r_4], #8] \n\t"
|
||||||
"STP x19, x21, [%[ctx_r_4]] \n\t"
|
"STR w15, [%[ctx_r_4], #16] \n\t"
|
||||||
"STR w23, [%[ctx_r_4], #16] \n\t"
|
|
||||||
/* h (accumulator) = 0 */
|
/* h (accumulator) = 0 */
|
||||||
"STP xzr, xzr, [%[ctx_h_0]] \n\t"
|
"STP xzr, xzr, [%[ctx_h_0]] \n\t"
|
||||||
"STR wzr, [%[ctx_h_0], #16] \n\t"
|
"STR wzr, [%[ctx_h_0], #16] \n\t"
|
||||||
/* Save pad for later */
|
|
||||||
"STP x10, x11, [%[ctx_pad]] \n\t"
|
|
||||||
/* Zero leftover */
|
/* Zero leftover */
|
||||||
"STR xzr, [%[ctx_leftover]] \n\t"
|
"STR xzr, [%[ctx_leftover]] \n\t"
|
||||||
/* Zero finished */
|
/* Zero finished */
|
||||||
|
@ -1062,6 +1033,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||||
:
|
:
|
||||||
: [clamp] "r" (clamp),
|
: [clamp] "r" (clamp),
|
||||||
[key] "r" (key),
|
[key] "r" (key),
|
||||||
|
[ctx_r64] "r" (ctx->r64),
|
||||||
[ctx_r] "r" (ctx->r),
|
[ctx_r] "r" (ctx->r),
|
||||||
[ctx_r_2] "r" (ctx->r_2),
|
[ctx_r_2] "r" (ctx->r_2),
|
||||||
[ctx_r_4] "r" (ctx->r_4),
|
[ctx_r_4] "r" (ctx->r_4),
|
||||||
|
@ -1070,9 +1042,8 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||||
[ctx_leftover] "r" (&ctx->leftover),
|
[ctx_leftover] "r" (&ctx->leftover),
|
||||||
[ctx_finished] "r" (&ctx->finished)
|
[ctx_finished] "r" (&ctx->finished)
|
||||||
: "memory", "cc",
|
: "memory", "cc",
|
||||||
"w7", "w14", "w15", "w16", "w17", "w19", "w20", "w21", "w22", "w23",
|
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
|
||||||
"x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
|
"x19", "x20"
|
||||||
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
|
|
||||||
);
|
);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1081,7 +1052,6 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||||
|
|
||||||
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
||||||
{
|
{
|
||||||
|
|
||||||
if (ctx == NULL)
|
if (ctx == NULL)
|
||||||
return BAD_FUNC_ARG;
|
return BAD_FUNC_ARG;
|
||||||
|
|
||||||
|
@ -1096,67 +1066,54 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
|
||||||
}
|
}
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
/* Load raw h and zero h registers */
|
"LDP x9, x10, %[ctx_pad] \n\t"
|
||||||
"LDP x2, x3, %[h_addr] \n\t"
|
/* Load h */
|
||||||
"MOV x5, xzr \n\t"
|
"LDP w4, w5, [%[ctx_h], #0] \n\t"
|
||||||
"LDR w4, %[h_4_addr] \n\t"
|
"LDP w6, w7, [%[ctx_h], #8] \n\t"
|
||||||
"MOV x6, xzr \n\t"
|
"LDR w8, [%[ctx_h], #16] \n\t"
|
||||||
"LDP x16, x17, %[pad_addr] \n\t"
|
|
||||||
/* Base 26 -> Base 64 */
|
/* Base 26 -> Base 64 */
|
||||||
"MOV w5, w2 \n\t"
|
"ORR x4, x4, x5, LSL #26\n\t"
|
||||||
"LSR x2, x2, #32 \n\t"
|
"ORR x4, x4, x6, LSL #52\n\t"
|
||||||
"ORR x5, x5, x2, LSL #26 \n\t"
|
"LSR x5, x6, #12\n\t"
|
||||||
"ORR x5, x5, x3, LSL #52 \n\t"
|
"ORR x5, x5, x7, LSL #14\n\t"
|
||||||
"LSR w6, w3, #12 \n\t"
|
"ORR x5, x5, x8, LSL #40\n\t"
|
||||||
"LSR x3, x3, #32 \n\t"
|
"LSR x6, x8, #24\n\t"
|
||||||
"ORR x6, x6, x3, LSL #14 \n\t"
|
|
||||||
"ORR x6, x6, x4, LSL #40 \n\t"
|
|
||||||
"LSR x7, x4, #24 \n\t"
|
|
||||||
/* Check if h is larger than p */
|
/* Check if h is larger than p */
|
||||||
"ADDS x2, x5, #5 \n\t"
|
"ADDS x1, x4, #5 \n\t"
|
||||||
"ADCS x3, x6, xzr \n\t"
|
"ADCS x2, x5, xzr \n\t"
|
||||||
"ADC x4, x7, xzr \n\t"
|
"ADC x3, x6, xzr \n\t"
|
||||||
/* Check if h+5 is larger than 2^130 */
|
/* Check if h+5 is larger than 2^130 */
|
||||||
"CMP x4, #3 \n\t"
|
"CMP x3, #3 \n\t"
|
||||||
|
"CSEL x4, x1, x4, HI \n\t"
|
||||||
"CSEL x5, x2, x5, HI \n\t"
|
"CSEL x5, x2, x5, HI \n\t"
|
||||||
"CSEL x6, x3, x6, HI \n\t"
|
"ADDS x4, x4, x9 \n\t"
|
||||||
"ADDS x5, x5, x16 \n\t"
|
"ADC x5, x5, x10 \n\t"
|
||||||
"ADC x6, x6, x17 \n\t"
|
"STP x4, x5, [%[mac]] \n\t"
|
||||||
"STP x5, x6, [%[mac]] \n\t"
|
|
||||||
: [mac] "+r" (mac)
|
|
||||||
: [pad_addr] "m" (ctx->pad),
|
|
||||||
[h_addr] "m" (ctx->h),
|
|
||||||
[h_4_addr] "m" (ctx->h[4])
|
|
||||||
: "memory", "cc",
|
|
||||||
"w2", "w3", "w4", "w5", "w6", "w7", "x2", "x3", "x4", "x5",
|
|
||||||
"x6", "x7", "x16", "x17"
|
|
||||||
);
|
|
||||||
|
|
||||||
/* zero out the state */
|
/* Zero out h */
|
||||||
ctx->h[0] = 0;
|
"STP xzr, xzr, [%[ctx_h]] \n\t"
|
||||||
ctx->h[1] = 0;
|
"STR wzr, [%[ctx_h], #16] \n\t"
|
||||||
ctx->h[2] = 0;
|
/* Zero out r64 */
|
||||||
ctx->h[3] = 0;
|
"STP xzr, xzr, [%[ctx_r64]] \n\t"
|
||||||
ctx->h[4] = 0;
|
/* Zero out r */
|
||||||
ctx->r[0] = 0;
|
"STP xzr, xzr, [%[ctx_r]] \n\t"
|
||||||
ctx->r[1] = 0;
|
"STR wzr, [%[ctx_r], #16] \n\t"
|
||||||
ctx->r[2] = 0;
|
/* Zero out r_2 */
|
||||||
ctx->r[3] = 0;
|
"STP xzr, xzr, [%[ctx_r_2]] \n\t"
|
||||||
ctx->r[4] = 0;
|
"STR wzr, [%[ctx_r_2], #16] \n\t"
|
||||||
ctx->r_2[0] = 0;
|
/* Zero out r_4 */
|
||||||
ctx->r_2[1] = 0;
|
"STP xzr, xzr, [%[ctx_r_4]] \n\t"
|
||||||
ctx->r_2[2] = 0;
|
"STR wzr, [%[ctx_r_4], #16] \n\t"
|
||||||
ctx->r_2[3] = 0;
|
/* Zero out pad */
|
||||||
ctx->r_2[4] = 0;
|
"STP xzr, xzr, %[ctx_pad] \n\t"
|
||||||
ctx->r_4[0] = 0;
|
:
|
||||||
ctx->r_4[1] = 0;
|
: [ctx_pad] "m" (ctx->pad), [ctx_h] "r" (ctx->h), [mac] "r" (mac),
|
||||||
ctx->r_4[2] = 0;
|
[ctx_r64] "r" (ctx->r64), [ctx_r] "r" (ctx->r),
|
||||||
ctx->r_4[3] = 0;
|
[ctx_r_2] "r" (ctx->r_2), [ctx_r_4] "r" (ctx->r_4)
|
||||||
ctx->r_4[4] = 0;
|
: "memory", "cc",
|
||||||
ctx->pad[0] = 0;
|
"w4", "w5", "w6", "w7", "w8",
|
||||||
ctx->pad[1] = 0;
|
"x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"
|
||||||
ctx->pad[2] = 0;
|
);
|
||||||
ctx->pad[3] = 0;
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -90,6 +90,7 @@ typedef struct Poly1305 {
|
||||||
unsigned char started;
|
unsigned char started;
|
||||||
#else
|
#else
|
||||||
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
|
#if defined(WOLFSSL_ARMASM) && defined(__aarch64__)
|
||||||
|
ALIGN128 word64 r64[2];
|
||||||
ALIGN128 word32 r[5];
|
ALIGN128 word32 r[5];
|
||||||
ALIGN128 word32 r_2[5]; /* r^2 */
|
ALIGN128 word32 r_2[5]; /* r^2 */
|
||||||
ALIGN128 word32 r_4[5]; /* r^4 */
|
ALIGN128 word32 r_4[5]; /* r^4 */
|
||||||
|
|
Loading…
Reference in New Issue