/* poly1305_asm.S */
/*
* Copyright (C) 2006-2023 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/

#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
* user_settings_asm.h is a file generated by the script user_settings_asm.sh.
* The script takes in a user_settings.h and produces user_settings_asm.h, which
* is a stripped down version of user_settings.h containing only preprocessor
* directives. This makes the header safe to include in assembly (.S) files.
*/
#include "user_settings_asm.h"
#else
/*
* Note: if user_settings.h contains any C code (e.g. a typedef or function
* prototype), including it here in an assembly (.S) file will cause an
* assembler failure. See user_settings_asm.h above.
*/
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */

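/*
* Explanatory sketch (not in the original source): this file provides the
* x86_64 AVX/AVX2 Poly1305 primitives used by wolfcrypt/src/poly1305.c.
* Arguments follow the System V AMD64 ABI: %rdi = Poly1305 context,
* %rsi = key/message/mac pointer, %rdx = byte count where one is taken.
* The C-side declarations are assumed to look roughly like:
*
*   void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
*   void poly1305_block_avx(Poly1305* ctx, const unsigned char* m);
*   void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m, size_t bytes);
*   void poly1305_final_avx(Poly1305* ctx, byte* mac);
*   void poly1305_calc_powers_avx2(Poly1305* ctx);
*   void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
*   void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m, size_t bytes);
*   void poly1305_final_avx2(Poly1305* ctx, byte* mac);
*/
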
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 16
poly1305_setkey_avx:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx
.p2align 4
_poly1305_setkey_avx:
#endif /* __APPLE__ */
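/*
* Explanatory sketch (not in the original source): clamps the first 16 key
* bytes into r, zeroes the accumulator h, saves the last 16 key bytes as the
* pad, and precomputes the small multiples i*r[0] (ctx+352) and i*r[1]
* (ctx+408), i = 0..6, so the h[2]*r terms can later be folded in by a table
* lookup. In rough C (load64() is a hypothetical little-endian load):
*
*   r[0] = load64(key +  0) & 0x0ffffffc0fffffffULL;
*   r[1] = load64(key +  8) & 0x0ffffffc0ffffffcULL;
*   pad[0] = load64(key + 16);
*   pad[1] = load64(key + 24);
*
* The leftover count at ctx+608 is cleared and the flag byte at ctx+616 set.
*/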
movabsq $0xffffffc0fffffff, %r10
movabsq $0xffffffc0ffffffc, %r11
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r10, %rdx
andq %r11, %rax
movq %rdx, %r10
movq %rax, %r11
xorq %r9, %r9
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %r9, 24(%rdi)
movq %r9, 32(%rdi)
movq %r9, 40(%rdi)
movq %rcx, 48(%rdi)
movq %r8, 56(%rdi)
movq %r9, 352(%rdi)
movq %r9, 408(%rdi)
movq %rdx, 360(%rdi)
movq %rax, 416(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 368(%rdi)
movq %r11, 424(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 376(%rdi)
movq %r11, 432(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 384(%rdi)
movq %r11, 440(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 392(%rdi)
movq %r11, 448(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 400(%rdi)
movq %r11, 456(%rdi)
movq %r9, 608(%rdi)
movb $0x01, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx,.-poly1305_setkey_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 16
poly1305_block_avx:
#else
.section __TEXT,__text
.globl _poly1305_block_avx
.p2align 4
_poly1305_block_avx:
#endif /* __APPLE__ */
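/*
* Explanatory sketch (not in the original source): one 16-byte block is
* absorbed as h += m (plus 2^128 when the flag byte at ctx+616 is non-zero),
* then h = (h * r) mod (2^130 - 5). h lives in two 64-bit words plus a small
* top word h[2]; the h[2]*r products come from the i*r tables that
* poly1305_setkey_avx filled in at ctx+352 / ctx+408. Roughly:
*
*   h += m + (pad_bit << 128);
*   t  = h * r;
*   h  = (t mod 2^130) + 5 * (t >> 130);   // since 2^130 == 5 (mod p)
*/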
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
xorq %r14, %r14
movb 616(%rdi), %r14b
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq %r14, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 352(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 408(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_block_avx,.-poly1305_block_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 16
poly1305_blocks_avx:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx
.p2align 4
_poly1305_blocks_avx:
#endif /* __APPLE__ */
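/*
* Explanatory sketch (not in the original source): loops over full 16-byte
* blocks (%rdx = byte count) doing the same h = (h + m) * r step as
* poly1305_block_avx. The implicit 2^128 bit of each block appears to be
* folded in by indexing the multiple tables at ctx+360 / ctx+416, i.e. using
* (h[2] + 1) * r instead of first adding the bit to h[2].
*/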
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq %rdx, %rcx
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq $0x00, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 360(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 416(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Next block from message
addq $16, %rsi
subq $16, %rcx
jg L_poly1305_avx_blocks_start
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx,.-poly1305_blocks_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 16
poly1305_final_avx:
#else
.section __TEXT,__text
.globl _poly1305_final_avx
.p2align 4
_poly1305_final_avx:
#endif /* __APPLE__ */
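/*
* Explanatory sketch (not in the original source): any buffered partial block
* at ctx+480 is padded with 0x01 then zeros and run through
* poly1305_block_avx with the high-bit flag (ctx+616) cleared. The tag is
* then h fully reduced mod 2^130-5, added to the pad and truncated, roughly:
*
*   h += 5 * (h >> 130); h &= (1 << 130) - 1;        // partial reduction
*   if (h + 5 >= (1 << 130)) h += 5;                  // cmov fixup, low 128 bits kept
*   mac = (h + pad) mod 2^128;
*
* r, h and the pad in the context are cleared before returning.
*/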
pushq %rbx
pushq %r12
movq %rsi, %rbx
movq 608(%rdi), %rax
testq %rax, %rax
je L_poly1305_avx_final_no_more
movb $0x01, 480(%rdi,%rax,1)
jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
incb %al
cmpq $16, %rax
jl L_poly1305_avx_final_zero_rem
movb $0x00, 616(%rdi)
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_block_avx@plt
#else
callq _poly1305_block_avx
#endif /* __APPLE__ */
L_poly1305_avx_final_no_more:
movq 24(%rdi), %rax
movq 32(%rdi), %rdx
movq 40(%rdi), %rcx
movq 48(%rdi), %r11
movq 56(%rdi), %r12
# h %= p
# h = (h + pad)
# mod 2^130 - 5
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
# Multiply by 5
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
# Fixup when between (1 << 130) - 1 and (1 << 130) - 5
movq %rax, %r8
movq %rdx, %r9
movq %rcx, %r10
addq $5, %r8
adcq $0x00, %r9
adcq $0x00, %r10
cmpq $4, %r10
cmoveq %r8, %rax
cmoveq %r9, %rdx
# h += pad
addq %r11, %rax
adcq %r12, %rdx
movq %rax, (%rbx)
movq %rdx, 8(%rbx)
# Zero out r
movq $0x00, (%rdi)
movq $0x00, 8(%rdi)
# Zero out h
movq $0x00, 24(%rdi)
movq $0x00, 32(%rdi)
movq $0x00, 40(%rdi)
# Zero out pad
movq $0x00, 48(%rdi)
movq $0x00, 56(%rdi)
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 16
poly1305_calc_powers_avx2:
#else
.section __TEXT,__text
.globl _poly1305_calc_powers_avx2
.p2align 4
_poly1305_calc_powers_avx2:
#endif /* __APPLE__ */
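/*
* Explanatory sketch (not in the original source): computes r^2, r^3 and r^4
* mod 2^130-5 and stores r, r^2, r^3, r^4 as five 26-bit limbs in 32-bit
* words at ctx+224, ctx+256, ctx+288 and ctx+320 for the 4-way AVX2 block
* loop. The 64-bit to 26-bit limb split used below is roughly:
*
*   t[0] =  a0        & 0x3ffffff;
*   t[1] = (a0 >> 26) & 0x3ffffff;
*   t[2] = ((a0 >> 52) | (a1 << 12)) & 0x3ffffff;
*   t[3] = (a1 >> 14) & 0x3ffffff;
*   t[4] = ((a1 >> 40) | (a2 << 24)) & 0x3ffffff;
*/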
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
pushq %rbp
movq (%rdi), %rcx
movq 8(%rdi), %r8
xorq %r9, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 224(%rdi)
movl %edx, 228(%rdi)
movl %esi, 232(%rdi)
movl %ebx, 236(%rdi)
movl %ebp, 240(%rdi)
movl $0x00, 244(%rdi)
# Square 128-bit
movq %r8, %rax
mulq %rcx
xorq %r13, %r13
movq %rax, %r11
movq %rdx, %r12
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
movq %rcx, %rax
mulq %rax
movq %rax, %r10
movq %rdx, %r15
movq %r8, %rax
mulq %rax
addq %r15, %r11
adcq %rax, %r12
adcq %rdx, %r13
# Reduce 256-bit to 130-bit
movq %r12, %rax
movq %r13, %rdx
andq $-4, %rax
andq $3, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
shrdq $2, %rdx, %rax
shrq $2, %rdx
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
movq %r12, %rax
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
andq $3, %r12
addq %rax, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Convert to 26 bits in 32
movq %r10, %rax
movq %r10, %rdx
movq %r10, %rsi
movq %r11, %rbx
movq %r11, %rbp
shrq $26, %rdx
shrdq $52, %r11, %rsi
shrq $14, %rbx
shrdq $40, %r12, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 256(%rdi)
movl %edx, 260(%rdi)
movl %esi, 264(%rdi)
movl %ebx, 268(%rdi)
movl %ebp, 272(%rdi)
movl $0x00, 276(%rdi)
# Multiply 128-bit by 130-bit
# r1[0] * r2[0]
movq %rcx, %rax
mulq %r10
movq %rax, %r13
movq %rdx, %r14
# r1[0] * r2[1]
movq %rcx, %rax
mulq %r11
movq $0x00, %r15
addq %rax, %r14
adcq %rdx, %r15
# r1[1] * r2[0]
movq %r8, %rax
mulq %r10
movq $0x00, %rsi
addq %rax, %r14
adcq %rdx, %r15
adcq $0x00, %rsi
# r1[0] * r2[2]
movq %rcx, %rax
mulq %r12
addq %rax, %r15
adcq %rdx, %rsi
# r1[1] * r2[1]
movq %r8, %rax
mulq %r11
movq $0x00, %rbx
addq %rax, %r15
adcq %rdx, %rsi
adcq $0x00, %rbx
# r1[1] * r2[2]
movq %r8, %rax
mulq %r12
addq %rax, %rsi
adcq %rdx, %rbx
# Reduce 260-bit to 130-bit
movq %r15, %rax
movq %rsi, %rdx
movq %rbx, %rbx
andq $-4, %rax
andq $3, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
shrdq $2, %rdx, %rax
shrdq $2, %rbx, %rdx
shrq $2, %rbx
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
movq %r15, %rax
andq $3, %r15
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Convert to 26 bits in 32
movq %r13, %rax
movq %r13, %rdx
movq %r13, %rsi
movq %r14, %rbx
movq %r14, %rbp
shrq $26, %rdx
shrdq $52, %r14, %rsi
shrq $14, %rbx
shrdq $40, %r15, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 288(%rdi)
movl %edx, 292(%rdi)
movl %esi, 296(%rdi)
movl %ebx, 300(%rdi)
movl %ebp, 304(%rdi)
movl $0x00, 308(%rdi)
# Square 130-bit
movq %r11, %rax
mulq %r10
xorq %r13, %r13
movq %rax, %r8
movq %rdx, %r9
addq %rax, %r8
adcq %rdx, %r9
adcq $0x00, %r13
movq %r10, %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
movq %r11, %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq %rdx, %r13
movq %r12, %rax
mulq %rax
movq %rax, %r14
movq %r12, %rax
mulq %r10
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
movq %r12, %rax
mulq %r11
addq %rax, %r13
adcq %rdx, %r14
addq %rax, %r13
adcq %rdx, %r14
# Reduce 260-bit to 130-bit
movq %r9, %rax
movq %r13, %rdx
movq %r14, %r15
andq $-4, %rax
andq $3, %r9
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
shrdq $2, %rdx, %rax
shrdq $2, %r15, %rdx
shrq $2, %r15
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
movq %r9, %rax
andq $3, %r9
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 320(%rdi)
movl %edx, 324(%rdi)
movl %esi, 328(%rdi)
movl %ebx, 332(%rdi)
movl %ebp, 336(%rdi)
movl $0x00, 340(%rdi)
popq %rbp
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 16
poly1305_setkey_avx2:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx2
.p2align 4
_poly1305_setkey_avx2:
#endif /* __APPLE__ */
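/*
* Explanatory sketch (not in the original source): reuses poly1305_setkey_avx
* for r, h, the pad and the i*r tables, then clears the five 32-byte AVX2
* accumulator vectors at ctx+64..ctx+223 and the counter/flag words at
* ctx+608 and ctx+616.
*/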
#ifndef __APPLE__
callq poly1305_setkey_avx@plt
#else
callq _poly1305_setkey_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
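/*
* Explanatory sketch (not in the original source): with h and m held as five
* 26-bit limbs per lane, L_poly1305_avx2_blocks_mask is the per-limb mask
* (2^26 - 1) and L_poly1305_avx2_blocks_hibit is 2^24, the position of the
* implicit 2^128 padding bit inside the top limb (bit 128 - 4*26 = bit 24).
*/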
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 16
poly1305_blocks_avx2:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx2
.p2align 4
_poly1305_blocks_avx2:
#endif /* __APPLE__ */
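/*
* Explanatory sketch (not in the original source): processes 64 bytes (four
* blocks) per iteration. %rcx points at 32-byte aligned stack scratch holding
* limbs 0..4 of the per-lane power of r, and %rbx at limbs 1..4 of the same
* multiplied by 5. In the steady loop every lane uses r^4, so each lane
* evolves as h = h * r^4 + m; on the final call the lanes use r^4, r^3, r^2,
* r^1 and are then summed into the scalar h at ctx+24. The per-limb products
* follow the usual 26-bit schoolbook form, roughly:
*
*   d0 = h0*r0 + h1*(5*r4) + h2*(5*r3) + h3*(5*r2) + h4*(5*r1);
*   d1 = h0*r1 + h1*r0     + h2*(5*r4) + h3*(5*r3) + h4*(5*r2);
*   ...
*   d4 = h0*r4 + h1*r3     + h2*r2     + h3*r1     + h4*r0;
*
* followed by the carry chain that brings every limb back under 26 bits.
*/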
pushq %r12
pushq %rbx
subq $0x140, %rsp
movq %rsp, %rcx
andq $-32, %rcx
addq $32, %rcx
vpxor %ymm15, %ymm15, %ymm15
movq %rcx, %rbx
leaq 64(%rdi), %rax
addq $0xa0, %rbx
cmpw $0x00, 616(%rdi)
jne L_poly1305_avx2_blocks_begin_h
# Load the message data
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vperm2i128 $32, %ymm1, %ymm0, %ymm2
vperm2i128 $49, %ymm1, %ymm0, %ymm0
vpunpckldq %ymm0, %ymm2, %ymm1
vpunpckhdq %ymm0, %ymm2, %ymm3
vpunpckldq %ymm15, %ymm1, %ymm0
vpunpckhdq %ymm15, %ymm1, %ymm1
vpunpckldq %ymm15, %ymm3, %ymm2
vpunpckhdq %ymm15, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
vpsllq $6, %ymm1, %ymm1
vpsllq $12, %ymm2, %ymm2
vpsllq $18, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# Reduce, in place, the message data
vpsrlq $26, %ymm0, %ymm10
vpsrlq $26, %ymm3, %ymm11
vpand %ymm14, %ymm0, %ymm0
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm1, %ymm10, %ymm1
vpaddq %ymm4, %ymm11, %ymm4
vpsrlq $26, %ymm1, %ymm10
vpsrlq $26, %ymm4, %ymm11
vpand %ymm14, %ymm1, %ymm1
vpand %ymm14, %ymm4, %ymm4
vpaddq %ymm2, %ymm10, %ymm2
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm2, %ymm10
vpaddq %ymm0, %ymm12, %ymm0
vpsrlq $26, %ymm0, %ymm11
vpand %ymm14, %ymm2, %ymm2
vpand %ymm14, %ymm0, %ymm0
vpaddq %ymm3, %ymm10, %ymm3
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm3, %ymm10
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jz L_poly1305_avx2_blocks_store
jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
# Load the H values.
vmovdqu (%rax), %ymm0
vmovdqu 32(%rax), %ymm1
vmovdqu 64(%rax), %ymm2
vmovdqu 96(%rax), %ymm3
vmovdqu 128(%rax), %ymm4
# Check if there is a power of r to load - otherwise use r^4.
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_load_r4
# Load the 4 powers of r - r^4, r^3, r^2, r^1.
vmovdqu 224(%rdi), %ymm8
vmovdqu 256(%rdi), %ymm7
vmovdqu 288(%rdi), %ymm6
vmovdqu 320(%rdi), %ymm5
vpermq $0xd8, %ymm5, %ymm5
vpermq $0xd8, %ymm6, %ymm6
vpermq $0xd8, %ymm7, %ymm7
vpermq $0xd8, %ymm8, %ymm8
vpunpcklqdq %ymm6, %ymm5, %ymm10
vpunpckhqdq %ymm6, %ymm5, %ymm11
vpunpcklqdq %ymm8, %ymm7, %ymm12
vpunpckhqdq %ymm8, %ymm7, %ymm13
vperm2i128 $32, %ymm12, %ymm10, %ymm5
vperm2i128 $49, %ymm12, %ymm10, %ymm7
vperm2i128 $32, %ymm13, %ymm11, %ymm9
vpsrlq $32, %ymm5, %ymm6
vpsrlq $32, %ymm7, %ymm8
jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
# Load r^4 into all four positions.
vmovdqu 320(%rdi), %ymm13
vpermq $0x00, %ymm13, %ymm5
vpsrlq $32, %ymm13, %ymm14
vpermq $0x55, %ymm13, %ymm7
vpermq $0xaa, %ymm13, %ymm9
vpermq $0x00, %ymm14, %ymm6
vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
# Multiply top 4 26-bit values of all four H by 5
vpslld $2, %ymm6, %ymm10
vpslld $2, %ymm7, %ymm11
vpslld $2, %ymm8, %ymm12
vpslld $2, %ymm9, %ymm13
vpaddq %ymm10, %ymm6, %ymm10
vpaddq %ymm11, %ymm7, %ymm11
vpaddq %ymm12, %ymm8, %ymm12
vpaddq %ymm13, %ymm9, %ymm13
# Store powers of r and multiple of 5 for use in multiply.
vmovdqa %ymm10, (%rbx)
vmovdqa %ymm11, 32(%rbx)
vmovdqa %ymm12, 64(%rbx)
vmovdqa %ymm13, 96(%rbx)
vmovdqa %ymm5, (%rcx)
vmovdqa %ymm6, 32(%rcx)
vmovdqa %ymm7, 64(%rcx)
vmovdqa %ymm8, 96(%rcx)
vmovdqa %ymm9, 128(%rcx)
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# If not finished then loop over data
cmpb $0x01, 616(%rdi)
jne L_poly1305_avx2_blocks_start
# Do last multiply, reduce, add the four H together and move to
# 32-bit registers
vpmuludq (%rbx), %ymm4, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
vpsrldq $8, %ymm0, %ymm5
vpsrldq $8, %ymm1, %ymm6
vpsrldq $8, %ymm2, %ymm7
vpsrldq $8, %ymm3, %ymm8
vpsrldq $8, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vpermq $2, %ymm0, %ymm5
vpermq $2, %ymm1, %ymm6
vpermq $2, %ymm2, %ymm7
vpermq $2, %ymm3, %ymm8
vpermq $2, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vmovd %xmm0, %r8d
vmovd %xmm1, %r9d
vmovd %xmm2, %r10d
vmovd %xmm3, %r11d
vmovd %xmm4, %r12d
jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
vmovdqu (%rsi), %ymm5
vmovdqu 32(%rsi), %ymm6
vperm2i128 $32, %ymm6, %ymm5, %ymm7
vperm2i128 $49, %ymm6, %ymm5, %ymm5
vpunpckldq %ymm5, %ymm7, %ymm6
vpunpckhdq %ymm5, %ymm7, %ymm8
vpunpckldq %ymm15, %ymm6, %ymm5
vpunpckhdq %ymm15, %ymm6, %ymm6
vpunpckldq %ymm15, %ymm8, %ymm7
vpunpckhdq %ymm15, %ymm8, %ymm8
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
vpsllq $6, %ymm6, %ymm6
vpsllq $12, %ymm7, %ymm7
vpsllq $18, %ymm8, %ymm8
vpmuludq (%rbx), %ymm4, %ymm10
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm11
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm12
vpaddq %ymm7, %ymm12, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm13
vpaddq %ymm8, %ymm13, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm13
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
# Store four H values - state
vmovdqu %ymm0, (%rax)
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_complete
movq %r8, %rax
movq %r10, %rdx
movq %r12, %rcx
shrq $12, %rdx
shrq $24, %rcx
shlq $26, %r9
shlq $52, %r10
shlq $14, %r11
shlq $40, %r12
addq %r9, %rax
adcq %r10, %rax
adcq %r11, %rdx
adcq %r12, %rdx
adcq $0x00, %rcx
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
movq %rax, 24(%rdi)
movq %rdx, 32(%rdi)
movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
movb $0x01, 617(%rdi)
addq $0x140, %rsp
popq %rbx
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 16
poly1305_final_avx2:
#else
.section __TEXT,__text
.globl _poly1305_final_avx2
.p2align 4
_poly1305_final_avx2:
#endif /* __APPLE__ */
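/*
* Explanatory sketch (not in the original source): sets the "final" byte at
* ctx+616 so a last poly1305_blocks_avx2 call folds the four accumulator
* lanes back into the scalar h, runs any remaining full 16-byte blocks in the
* ctx+480 buffer through poly1305_blocks_avx, shifts the leftover bytes to
* the front of that buffer, and hands off to poly1305_final_avx to produce
* the tag. The AVX2 state (H lanes and r powers) is wiped before returning.
*/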
movb $0x01, 616(%rdi)
movb 617(%rdi), %cl
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks_X4
pushq %rsi
movq $0x40, %rdx
xorq %rsi, %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx2@plt
#else
callq _poly1305_blocks_avx2
#endif /* __APPLE__ */
popq %rsi
L_poly1305_avx2_final_done_blocks_X4:
movq 608(%rdi), %rax
movq %rax, %rcx
andq $-16, %rcx
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks
pushq %rcx
pushq %rax
pushq %rsi
movq %rcx, %rdx
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx@plt
#else
callq _poly1305_blocks_avx
#endif /* __APPLE__ */
popq %rsi
popq %rax
popq %rcx
L_poly1305_avx2_final_done_blocks:
subq %rcx, 608(%rdi)
xorq %rdx, %rdx
jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
movb 480(%rdi,%rcx,1), %r8b
movb %r8b, 480(%rdi,%rdx,1)
incb %cl
incb %dl
L_poly1305_avx2_final_cmp_copy:
cmp %rcx, %rax
jne L_poly1305_avx2_final_start_copy
#ifndef __APPLE__
callq poly1305_final_avx@plt
#else
callq _poly1305_final_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
vmovdqu %ymm0, 224(%rdi)
vmovdqu %ymm0, 256(%rdi)
vmovdqu %ymm0, 288(%rdi)
vmovdqu %ymm0, 320(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif