mirror of https://github.com/wolfSSL/wolfssl.git
Intel AVX1/SSE2 ASM: no ymm/zmm regs, no vzeroupper
The vzeroupper instruction does not need to be invoked unless ymm or zmm registers are used.
pull/8479/head
parent
93000e5f14
commit
e90e3aa7c6
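The rationale, illustrated with a hypothetical GNU as (AT&T syntax) sketch that is not part of this commit: VEX-encoded 128-bit (xmm) instructions zero bits 255:128 of the corresponding ymm register, so an AVX1/SSE2 routine that never touches ymm or zmm registers never dirties the upper state and can return without vzeroupper; only code that writes ymm or zmm registers needs it to avoid AVX/SSE transition penalties. The function names below are made up for illustration only.

# Hypothetical example only -- not taken from the wolfSSL sources.
.text
.globl xor32_avx2_example
xor32_avx2_example:
        vmovdqu (%rsi), %ymm0            # 256-bit op dirties upper ymm state
        vpxor (%rdx), %ymm0, %ymm0
        vmovdqu %ymm0, (%rdi)
        vzeroupper                       # required before returning to SSE code
        repz retq

.globl xor16_avx1_example
xor16_avx1_example:
        vmovdqu (%rsi), %xmm0            # VEX 128-bit ops zero bits 255:128
        vpxor (%rdx), %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        repz retq                        # no vzeroupper needed: upper state stays clean

Every vzeroupper removed in the diff below sits in a routine of the second kind, one that operates only on xmm registers.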
@@ -9910,7 +9910,6 @@ L_AES_GCM_init_avx1_iv_done:
 vpaddd L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
 vmovdqa %xmm5, (%r8)
 vmovdqa %xmm4, (%r9)
-vzeroupper
 addq $16, %rsp
 popq %r13
 popq %r12
@@ -9985,7 +9984,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
 cmpl %esi, %ecx
 jl L_AES_GCM_aad_update_avx1_16_loop
 vmovdqa %xmm5, (%rdx)
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1
@@ -9832,7 +9832,6 @@ L_AES_GCM_init_avx1_iv_done:
 vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
 vmovdqa OWORD PTR [rax], xmm5
 vmovdqa OWORD PTR [r8], xmm4
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp+16]
 vmovdqu xmm7, OWORD PTR [rsp+32]
 vmovdqu xmm8, OWORD PTR [rsp+48]
@@ -9905,7 +9904,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
 cmp ecx, edx
 jl L_AES_GCM_aad_update_avx1_16_loop
 vmovdqa OWORD PTR [r8], xmm5
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp]
 vmovdqu xmm7, OWORD PTR [rsp+16]
 add rsp, 32
@@ -1033,7 +1033,6 @@ L_chacha20_avx1_partial_end64:
 subl %r11d, %r8d
 movl %r8d, 76(%rdi)
 L_chacha20_avx1_partial_done:
-vzeroupper
 addq $0x190, %rsp
 popq %r15
 popq %r14
@@ -990,7 +990,6 @@ L_chacha20_avx1_partial_end64:
 sub r10d, r13d
 mov DWORD PTR [rcx+76], r10d
 L_chacha20_avx1_partial_done:
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp+400]
 vmovdqu xmm7, OWORD PTR [rsp+416]
 vmovdqu xmm8, OWORD PTR [rsp+432]
@@ -273,7 +273,6 @@ _Transform_Sha256_SSE2_Sha:
 movhpd %xmm1, 16(%rdi)
 movhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha,.-Transform_Sha256_SSE2_Sha
@@ -476,7 +475,6 @@ L_sha256_sha_len_sse2_start:
 movhpd %xmm1, 16(%rdi)
 movhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha_Len,.-Transform_Sha256_SSE2_Sha_Len
@@ -2920,7 +2918,6 @@ _Transform_Sha256_AVX1:
 addl %r14d, 24(%rdi)
 addl %r15d, 28(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x40, %rsp
 popq %rbp
 popq %r15
@@ -5327,7 +5324,6 @@ L_sha256_len_avx1_start:
 movl %r15d, 28(%rdi)
 jnz L_sha256_len_avx1_start
 xorq %rax, %rax
-vzeroupper
 addq $0x44, %rsp
 popq %rbp
 popq %r15
@@ -7735,7 +7731,6 @@ _Transform_Sha256_AVX1_RORX:
 addl %r14d, 24(%rdi)
 addl %r15d, 28(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x40, %rsp
 popq %rbp
 popq %r15
@@ -10101,7 +10096,6 @@ L_sha256_len_avx1_len_rorx_start:
 movl %r15d, 28(%rdi)
 jnz L_sha256_len_avx1_len_rorx_start
 xorq %rax, %rax
-vzeroupper
 addq $0x44, %rsp
 popq %rbp
 popq %r15
@@ -10312,7 +10306,6 @@ _Transform_Sha256_AVX1_Sha:
 vmovhpd %xmm1, 16(%rdi)
 vmovhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha,.-Transform_Sha256_AVX1_Sha
@@ -10487,7 +10480,6 @@ L_sha256_sha_len_avx1_start:
 vmovhpd %xmm1, 16(%rdi)
 vmovhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha_Len,.-Transform_Sha256_AVX1_Sha_Len
@@ -159,7 +159,7 @@ _Transform_Sha512_AVX1:
 movq %r12, %rax
 xorq %r10, %rbx
 # Start of 16 rounds
-L_sha256_len_avx1_start:
+L_transform_sha512_avx1_start:
 vpaddq (%rsi), %xmm0, %xmm8
 vpaddq 16(%rsi), %xmm1, %xmm9
 vmovdqu %xmm8, (%rsp)
@@ -906,7 +906,7 @@ L_sha256_len_avx1_start:
 vpaddq %xmm7, %xmm8, %xmm7
 # msg_sched done: 14-17
 subl $0x01, 128(%rsp)
-jne L_sha256_len_avx1_start
+jne L_transform_sha512_avx1_start
 vpaddq (%rsi), %xmm0, %xmm8
 vpaddq 16(%rsi), %xmm1, %xmm9
 vmovdqu %xmm8, (%rsp)
@@ -1372,7 +1372,6 @@ L_sha256_len_avx1_start:
 addq %r14, 48(%rdi)
 addq %r15, 56(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x88, %rsp
 popq %r15
 popq %r14
@@ -2664,7 +2663,6 @@ L_sha512_len_avx1_start:
 movq %r15, 56(%rdi)
 jnz L_sha512_len_avx1_begin
 xorq %rax, %rax
-vzeroupper
 addq $0x90, %rsp
 popq %rbp
 popq %r15
@@ -2805,7 +2803,7 @@ _Transform_Sha512_AVX1_RORX:
 vmovdqu %xmm8, 96(%rsp)
 vmovdqu %xmm9, 112(%rsp)
 # Start of 16 rounds
-L_sha256_len_avx1_rorx_start:
+L_transform_sha512_avx1_rorx_start:
 addq $0x80, %rsi
 # msg_sched: 0-1
 # rnd_0: 0 - 0
@@ -3512,7 +3510,7 @@ L_sha256_len_avx1_rorx_start:
 vmovdqu %xmm8, 96(%rsp)
 vmovdqu %xmm9, 112(%rsp)
 subl $0x01, 128(%rsp)
-jne L_sha256_len_avx1_rorx_start
+jne L_transform_sha512_avx1_rorx_start
 # rnd_all_2: 0-1
 # rnd_0: 0 - 7
 rorxq $14, %r12, %rax
@@ -3931,7 +3929,6 @@ L_sha256_len_avx1_rorx_start:
 addq %r14, 48(%rdi)
 addq %r15, 56(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x88, %rsp
 popq %r15
 popq %r14
@@ -5168,7 +5165,6 @@ L_sha512_len_avx1_rorx_start:
 movq %r15, 56(%rdi)
 jnz L_sha512_len_avx1_rorx_begin
 xorq %rax, %rax
-vzeroupper
 addq $0x90, %rsp
 popq %rbp
 popq %r15