Intel AVX1/SSE2 ASM: no ymm/zmm regs no vzeroupper

The vzeroupper instruction does not need to be invoked unless ymm or zmm registers have been used. The routines changed here only use xmm registers, so their vzeroupper instructions are removed; a short sketch of the distinction follows the change summary below.
pull/8479/head
Sean Parkinson 2025-02-20 22:28:40 +10:00
parent 93000e5f14
commit e90e3aa7c6
6 changed files with 4 additions and 22 deletions
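For context, a minimal sketch of the rule being applied, not taken from the commit: the function names xmm_only_xor and ymm_xor are illustrative, the System V AMD64 calling convention is assumed, and the syntax matches the AT&T-style files in the diff. A routine that writes only xmm registers never dirties the upper halves of the ymm registers and can return without vzeroupper; a routine that writes a ymm register should execute vzeroupper before returning so that later legacy-SSE code does not pay the AVX-SSE transition penalty.

    .text
    .globl xmm_only_xor              # illustrative name, not a wolfSSL symbol
xmm_only_xor:
    # Only 128-bit xmm registers are written here, so the upper ymm halves
    # stay clean and no vzeroupper is needed before returning.
    vmovdqu (%rdi), %xmm0
    vxorps  (%rsi), %xmm0, %xmm0
    vmovdqu %xmm0, (%rdi)
    repz retq

    .globl ymm_xor                   # illustrative name, not a wolfSSL symbol
ymm_xor:
    # A 256-bit ymm register is written, leaving its upper 128 bits dirty;
    # vzeroupper must run before returning to code that may use legacy SSE.
    vmovdqu (%rdi), %ymm0
    vxorps  (%rsi), %ymm0, %ymm0
    vmovdqu %ymm0, (%rdi)
    vzeroupper
    repz retq

Every routine touched in this commit falls into the first category, which is why the deletions below are safe.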

@@ -9910,7 +9910,6 @@ L_AES_GCM_init_avx1_iv_done:
 vpaddd L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
 vmovdqa %xmm5, (%r8)
 vmovdqa %xmm4, (%r9)
-vzeroupper
 addq $16, %rsp
 popq %r13
 popq %r12
@@ -9985,7 +9984,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
 cmpl %esi, %ecx
 jl L_AES_GCM_aad_update_avx1_16_loop
 vmovdqa %xmm5, (%rdx)
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1

@@ -9832,7 +9832,6 @@ L_AES_GCM_init_avx1_iv_done:
 vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
 vmovdqa OWORD PTR [rax], xmm5
 vmovdqa OWORD PTR [r8], xmm4
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp+16]
 vmovdqu xmm7, OWORD PTR [rsp+32]
 vmovdqu xmm8, OWORD PTR [rsp+48]
@@ -9905,7 +9904,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
 cmp ecx, edx
 jl L_AES_GCM_aad_update_avx1_16_loop
 vmovdqa OWORD PTR [r8], xmm5
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp]
 vmovdqu xmm7, OWORD PTR [rsp+16]
 add rsp, 32

@@ -1033,7 +1033,6 @@ L_chacha20_avx1_partial_end64:
 subl %r11d, %r8d
 movl %r8d, 76(%rdi)
 L_chacha20_avx1_partial_done:
-vzeroupper
 addq $0x190, %rsp
 popq %r15
 popq %r14

@@ -990,7 +990,6 @@ L_chacha20_avx1_partial_end64:
 sub r10d, r13d
 mov DWORD PTR [rcx+76], r10d
 L_chacha20_avx1_partial_done:
-vzeroupper
 vmovdqu xmm6, OWORD PTR [rsp+400]
 vmovdqu xmm7, OWORD PTR [rsp+416]
 vmovdqu xmm8, OWORD PTR [rsp+432]

@@ -273,7 +273,6 @@ _Transform_Sha256_SSE2_Sha:
 movhpd %xmm1, 16(%rdi)
 movhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha,.-Transform_Sha256_SSE2_Sha
@@ -476,7 +475,6 @@ L_sha256_sha_len_sse2_start:
 movhpd %xmm1, 16(%rdi)
 movhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha_Len,.-Transform_Sha256_SSE2_Sha_Len
@@ -2920,7 +2918,6 @@ _Transform_Sha256_AVX1:
 addl %r14d, 24(%rdi)
 addl %r15d, 28(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x40, %rsp
 popq %rbp
 popq %r15
@@ -5327,7 +5324,6 @@ L_sha256_len_avx1_start:
 movl %r15d, 28(%rdi)
 jnz L_sha256_len_avx1_start
 xorq %rax, %rax
-vzeroupper
 addq $0x44, %rsp
 popq %rbp
 popq %r15
@@ -7735,7 +7731,6 @@ _Transform_Sha256_AVX1_RORX:
 addl %r14d, 24(%rdi)
 addl %r15d, 28(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x40, %rsp
 popq %rbp
 popq %r15
@@ -10101,7 +10096,6 @@ L_sha256_len_avx1_len_rorx_start:
 movl %r15d, 28(%rdi)
 jnz L_sha256_len_avx1_len_rorx_start
 xorq %rax, %rax
-vzeroupper
 addq $0x44, %rsp
 popq %rbp
 popq %r15
@@ -10312,7 +10306,6 @@ _Transform_Sha256_AVX1_Sha:
 vmovhpd %xmm1, 16(%rdi)
 vmovhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha,.-Transform_Sha256_AVX1_Sha
@@ -10487,7 +10480,6 @@ L_sha256_sha_len_avx1_start:
 vmovhpd %xmm1, 16(%rdi)
 vmovhpd %xmm2, 24(%rdi)
 xorq %rax, %rax
-vzeroupper
 repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha_Len,.-Transform_Sha256_AVX1_Sha_Len

@@ -159,7 +159,7 @@ _Transform_Sha512_AVX1:
 movq %r12, %rax
 xorq %r10, %rbx
 # Start of 16 rounds
-L_sha256_len_avx1_start:
+L_transform_sha512_avx1_start:
 vpaddq (%rsi), %xmm0, %xmm8
 vpaddq 16(%rsi), %xmm1, %xmm9
 vmovdqu %xmm8, (%rsp)
@@ -906,7 +906,7 @@ L_sha256_len_avx1_start:
 vpaddq %xmm7, %xmm8, %xmm7
 # msg_sched done: 14-17
 subl $0x01, 128(%rsp)
-jne L_sha256_len_avx1_start
+jne L_transform_sha512_avx1_start
 vpaddq (%rsi), %xmm0, %xmm8
 vpaddq 16(%rsi), %xmm1, %xmm9
 vmovdqu %xmm8, (%rsp)
@@ -1372,7 +1372,6 @@ L_sha256_len_avx1_start:
 addq %r14, 48(%rdi)
 addq %r15, 56(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x88, %rsp
 popq %r15
 popq %r14
@@ -2664,7 +2663,6 @@ L_sha512_len_avx1_start:
 movq %r15, 56(%rdi)
 jnz L_sha512_len_avx1_begin
 xorq %rax, %rax
-vzeroupper
 addq $0x90, %rsp
 popq %rbp
 popq %r15
@@ -2805,7 +2803,7 @@ _Transform_Sha512_AVX1_RORX:
 vmovdqu %xmm8, 96(%rsp)
 vmovdqu %xmm9, 112(%rsp)
 # Start of 16 rounds
-L_sha256_len_avx1_rorx_start:
+L_transform_sha512_avx1_rorx_start:
 addq $0x80, %rsi
 # msg_sched: 0-1
 # rnd_0: 0 - 0
@@ -3512,7 +3510,7 @@ L_sha256_len_avx1_rorx_start:
 vmovdqu %xmm8, 96(%rsp)
 vmovdqu %xmm9, 112(%rsp)
 subl $0x01, 128(%rsp)
-jne L_sha256_len_avx1_rorx_start
+jne L_transform_sha512_avx1_rorx_start
 # rnd_all_2: 0-1
 # rnd_0: 0 - 7
 rorxq $14, %r12, %rax
@@ -3931,7 +3929,6 @@ L_sha256_len_avx1_rorx_start:
 addq %r14, 48(%rdi)
 addq %r15, 56(%rdi)
 xorq %rax, %rax
-vzeroupper
 addq $0x88, %rsp
 popq %r15
 popq %r14
@@ -5168,7 +5165,6 @@ L_sha512_len_avx1_rorx_start:
 movq %r15, 56(%rdi)
 jnz L_sha512_len_avx1_rorx_begin
 xorq %rax, %rax
-vzeroupper
 addq $0x90, %rsp
 popq %rbp
 popq %r15