diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S
index b14620be0..95ac60ae2 100644
--- a/wolfcrypt/src/aes_gcm_asm.S
+++ b/wolfcrypt/src/aes_gcm_asm.S
@@ -9910,7 +9910,6 @@ L_AES_GCM_init_avx1_iv_done:
         vpaddd L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
         vmovdqa %xmm5, (%r8)
         vmovdqa %xmm4, (%r9)
-        vzeroupper
         addq $16, %rsp
         popq %r13
         popq %r12
@@ -9985,7 +9984,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
         cmpl %esi, %ecx
         jl L_AES_GCM_aad_update_avx1_16_loop
         vmovdqa %xmm5, (%rdx)
-        vzeroupper
         repz retq
 #ifndef __APPLE__
 .size AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index 2e4683cdd..a818e8658 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -9832,7 +9832,6 @@ L_AES_GCM_init_avx1_iv_done:
         vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
         vmovdqa OWORD PTR [rax], xmm5
         vmovdqa OWORD PTR [r8], xmm4
-        vzeroupper
         vmovdqu xmm6, OWORD PTR [rsp+16]
         vmovdqu xmm7, OWORD PTR [rsp+32]
         vmovdqu xmm8, OWORD PTR [rsp+48]
@@ -9905,7 +9904,6 @@ L_AES_GCM_aad_update_avx1_16_loop:
         cmp ecx, edx
         jl L_AES_GCM_aad_update_avx1_16_loop
         vmovdqa OWORD PTR [r8], xmm5
-        vzeroupper
         vmovdqu xmm6, OWORD PTR [rsp]
         vmovdqu xmm7, OWORD PTR [rsp+16]
         add rsp, 32
diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S
index 6616e5b3d..37e2a5930 100644
--- a/wolfcrypt/src/chacha_asm.S
+++ b/wolfcrypt/src/chacha_asm.S
@@ -1033,7 +1033,6 @@ L_chacha20_avx1_partial_end64:
         subl %r11d, %r8d
         movl %r8d, 76(%rdi)
 L_chacha20_avx1_partial_done:
-        vzeroupper
         addq $0x190, %rsp
         popq %r15
         popq %r14
diff --git a/wolfcrypt/src/chacha_asm.asm b/wolfcrypt/src/chacha_asm.asm
index 334b0555f..e9988945b 100644
--- a/wolfcrypt/src/chacha_asm.asm
+++ b/wolfcrypt/src/chacha_asm.asm
@@ -990,7 +990,6 @@ L_chacha20_avx1_partial_end64:
         sub r10d, r13d
         mov DWORD PTR [rcx+76], r10d
 L_chacha20_avx1_partial_done:
-        vzeroupper
         vmovdqu xmm6, OWORD PTR [rsp+400]
         vmovdqu xmm7, OWORD PTR [rsp+416]
         vmovdqu xmm8, OWORD PTR [rsp+432]
diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S
index e180a5fc3..5d2d60049 100644
--- a/wolfcrypt/src/sha256_asm.S
+++ b/wolfcrypt/src/sha256_asm.S
@@ -273,7 +273,6 @@ _Transform_Sha256_SSE2_Sha:
         movhpd %xmm1, 16(%rdi)
         movhpd %xmm2, 24(%rdi)
         xorq %rax, %rax
-        vzeroupper
         repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha,.-Transform_Sha256_SSE2_Sha
@@ -476,7 +475,6 @@ L_sha256_sha_len_sse2_start:
         movhpd %xmm1, 16(%rdi)
         movhpd %xmm2, 24(%rdi)
         xorq %rax, %rax
-        vzeroupper
         repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_SSE2_Sha_Len,.-Transform_Sha256_SSE2_Sha_Len
@@ -2920,7 +2918,6 @@ _Transform_Sha256_AVX1:
         addl %r14d, 24(%rdi)
         addl %r15d, 28(%rdi)
         xorq %rax, %rax
-        vzeroupper
         addq $0x40, %rsp
         popq %rbp
         popq %r15
@@ -5327,7 +5324,6 @@ L_sha256_len_avx1_start:
         movl %r15d, 28(%rdi)
         jnz L_sha256_len_avx1_start
         xorq %rax, %rax
-        vzeroupper
         addq $0x44, %rsp
         popq %rbp
         popq %r15
@@ -7735,7 +7731,6 @@ _Transform_Sha256_AVX1_RORX:
         addl %r14d, 24(%rdi)
         addl %r15d, 28(%rdi)
         xorq %rax, %rax
-        vzeroupper
         addq $0x40, %rsp
         popq %rbp
         popq %r15
@@ -10101,7 +10096,6 @@ L_sha256_len_avx1_len_rorx_start:
         movl %r15d, 28(%rdi)
         jnz L_sha256_len_avx1_len_rorx_start
         xorq %rax, %rax
-        vzeroupper
         addq $0x44, %rsp
         popq %rbp
         popq %r15
@@ -10312,7 +10306,6 @@ _Transform_Sha256_AVX1_Sha:
         vmovhpd %xmm1, 16(%rdi)
         vmovhpd %xmm2, 24(%rdi)
         xorq %rax, %rax
-        vzeroupper
         repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha,.-Transform_Sha256_AVX1_Sha
@@ -10487,7 +10480,6 @@ L_sha256_sha_len_avx1_start:
         vmovhpd %xmm1, 16(%rdi)
         vmovhpd %xmm2, 24(%rdi)
         xorq %rax, %rax
-        vzeroupper
         repz retq
 #ifndef __APPLE__
 .size Transform_Sha256_AVX1_Sha_Len,.-Transform_Sha256_AVX1_Sha_Len
diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S
index 84cb7c826..fe7278541 100644
--- a/wolfcrypt/src/sha512_asm.S
+++ b/wolfcrypt/src/sha512_asm.S
@@ -159,7 +159,7 @@ _Transform_Sha512_AVX1:
         movq %r12, %rax
         xorq %r10, %rbx
         # Start of 16 rounds
-L_sha256_len_avx1_start:
+L_transform_sha512_avx1_start:
         vpaddq (%rsi), %xmm0, %xmm8
         vpaddq 16(%rsi), %xmm1, %xmm9
         vmovdqu %xmm8, (%rsp)
@@ -906,7 +906,7 @@ L_sha256_len_avx1_start:
         vpaddq %xmm7, %xmm8, %xmm7
         # msg_sched done: 14-17
         subl $0x01, 128(%rsp)
-        jne L_sha256_len_avx1_start
+        jne L_transform_sha512_avx1_start
         vpaddq (%rsi), %xmm0, %xmm8
         vpaddq 16(%rsi), %xmm1, %xmm9
         vmovdqu %xmm8, (%rsp)
@@ -1372,7 +1372,6 @@ L_sha256_len_avx1_start:
         addq %r14, 48(%rdi)
         addq %r15, 56(%rdi)
         xorq %rax, %rax
-        vzeroupper
         addq $0x88, %rsp
         popq %r15
         popq %r14
@@ -2664,7 +2663,6 @@ L_sha512_len_avx1_start:
         movq %r15, 56(%rdi)
         jnz L_sha512_len_avx1_begin
         xorq %rax, %rax
-        vzeroupper
         addq $0x90, %rsp
         popq %rbp
         popq %r15
@@ -2805,7 +2803,7 @@ _Transform_Sha512_AVX1_RORX:
         vmovdqu %xmm8, 96(%rsp)
         vmovdqu %xmm9, 112(%rsp)
         # Start of 16 rounds
-L_sha256_len_avx1_rorx_start:
+L_transform_sha512_avx1_rorx_start:
         addq $0x80, %rsi
         # msg_sched: 0-1
         # rnd_0: 0 - 0
@@ -3512,7 +3510,7 @@ L_sha256_len_avx1_rorx_start:
         vmovdqu %xmm8, 96(%rsp)
         vmovdqu %xmm9, 112(%rsp)
         subl $0x01, 128(%rsp)
-        jne L_sha256_len_avx1_rorx_start
+        jne L_transform_sha512_avx1_rorx_start
         # rnd_all_2: 0-1
         # rnd_0: 0 - 7
         rorxq $14, %r12, %rax
@@ -3931,7 +3929,6 @@ L_sha256_len_avx1_rorx_start:
         addq %r14, 48(%rdi)
         addq %r15, 56(%rdi)
         xorq %rax, %rax
-        vzeroupper
         addq $0x88, %rsp
         popq %r15
         popq %r14
@@ -5168,7 +5165,6 @@ L_sha512_len_avx1_rorx_start:
         movq %r15, 56(%rdi)
         jnz L_sha512_len_avx1_rorx_begin
         xorq %rax, %rax
-        vzeroupper
         addq $0x90, %rsp
         popq %rbp
         popq %r15
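
The vzeroupper removals above all follow the same pattern: each of these routines appears to touch only the 128-bit XMM registers (SSE or VEX-encoded 128-bit operations), so it never dirties the upper halves of the YMM registers and has nothing for vzeroupper to clean up. The sha512_asm.S hunks additionally rename loop labels that had carried over SHA-256 names into the SHA-512 transforms. A minimal sketch of the convention, with hypothetical routine names not taken from the patch:

# Hypothetical sketch, not part of the patch: vzeroupper is only needed
# once 256-bit YMM state has been written; XMM-only code leaves the
# upper halves untouched (VEX-encoded 128-bit writes zero them), so it
# can return without a vzeroupper.

.text
.globl xmm_only_example          # assumed name, for illustration only
xmm_only_example:
        vmovdqu (%rdi), %xmm0    # 128-bit VEX load: upper YMM bits stay clean
        vmovdqu 16(%rdi), %xmm1
        vpxor %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        repz retq                # no vzeroupper required

.globl ymm_example               # assumed name, for illustration only
ymm_example:
        vmovdqu (%rdi), %ymm0    # 256-bit op dirties the upper halves
        vmovdqu 32(%rdi), %ymm1
        vxorps %ymm1, %ymm0, %ymm0
        vmovdqu %ymm0, (%rdi)
        vzeroupper               # still required to avoid AVX/SSE transition penalties
        repz retq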