From 75c14e4c8eb6f6d3a2ffdb2bea29968443b872ab Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 20 Apr 2020 09:09:45 +1000 Subject: [PATCH] Only use Intel instruction movbe when available --- wolfcrypt/src/cpuid.c | 1 + wolfcrypt/src/sp_x86_64.c | 215 +++++- wolfcrypt/src/sp_x86_64_asm.S | 1317 +++++++++++++++++++++++++++++---- wolfssl/wolfcrypt/cpuid.h | 2 + 4 files changed, 1385 insertions(+), 150 deletions(-) diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 85c4bf2d6..cc360a3c1 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -97,6 +97,7 @@ if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; } if (cpuid_flag(1, 0, ECX, 25)) { cpuid_flags |= CPUID_AESNI ; } if (cpuid_flag(7, 0, EBX, 19)) { cpuid_flags |= CPUID_ADX ; } + if (cpuid_flag(1, 0, ECX, 22)) { cpuid_flags |= CPUID_MOVBE ; } cpuid_check = 1; } } diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 3e49d2022..f94612136 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -49,7 +49,27 @@ #ifdef WOLFSSL_SP_X86_64_ASM #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 -extern void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n); +extern void sp_2048_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); +extern void sp_2048_from_bin_movbe(sp_digit* r, int size, const byte* a, int n); +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_2048_from_bin_movbe(r, size, a, n); + } + else { + sp_2048_from_bin_bswap(r, size, a, n); + } +} + /* Convert an mp_int to an array of sp_digit. * * r A single precision integer. @@ -132,7 +152,26 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_2048_to_bin(sp_digit* r, byte* a); +extern void sp_2048_to_bin_bswap(sp_digit* r, byte* a); +extern void sp_2048_to_bin_movbe(sp_digit* r, byte* a); +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 256 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_2048_to_bin(sp_digit* r, byte* a) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_2048_to_bin_movbe(r, a); + } + else { + sp_2048_to_bin_bswap(r, a); + } +} + extern void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_2048_sqr_16(sp_digit* r, const sp_digit* a); extern void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -2184,7 +2223,27 @@ int sp_ModExp_1024(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) #endif /* !WOLFSSL_SP_NO_2048 */ #ifndef WOLFSSL_SP_NO_3072 -extern void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n); +extern void sp_3072_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); +extern void sp_3072_from_bin_movbe(sp_digit* r, int size, const byte* a, int n); +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
+ */ +static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_3072_from_bin_movbe(r, size, a, n); + } + else { + sp_3072_from_bin_bswap(r, size, a, n); + } +} + /* Convert an mp_int to an array of sp_digit. * * r A single precision integer. @@ -2267,7 +2326,26 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_3072_to_bin(sp_digit* r, byte* a); +extern void sp_3072_to_bin_bswap(sp_digit* r, byte* a); +extern void sp_3072_to_bin_movbe(sp_digit* r, byte* a); +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 384 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_3072_to_bin(sp_digit* r, byte* a) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_3072_to_bin_movbe(r, a); + } + else { + sp_3072_to_bin_bswap(r, a); + } +} + extern void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_3072_sqr_12(sp_digit* r, const sp_digit* a); extern void sp_3072_mul_avx2_12(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -4333,7 +4411,27 @@ int sp_ModExp_1536(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res) #endif /* !WOLFSSL_SP_NO_3072 */ #ifdef WOLFSSL_SP_4096 -extern void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n); +extern void sp_4096_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); +extern void sp_4096_from_bin_movbe(sp_digit* r, int size, const byte* a, int n); +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_4096_from_bin_movbe(r, size, a, n); + } + else { + sp_4096_from_bin_bswap(r, size, a, n); + } +} + /* Convert an mp_int to an array of sp_digit. * * r A single precision integer. @@ -4416,7 +4514,26 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_4096_to_bin(sp_digit* r, byte* a); +extern void sp_4096_to_bin_bswap(sp_digit* r, byte* a); +extern void sp_4096_to_bin_movbe(sp_digit* r, byte* a); +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 512 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_4096_to_bin(sp_digit* r, byte* a) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_4096_to_bin_movbe(r, a); + } + else { + sp_4096_to_bin_bswap(r, a); + } +} + extern sp_digit sp_4096_sub_in_place_64(sp_digit* a, const sp_digit* b); extern sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_4096_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -22065,7 +22182,27 @@ static int sp_256_iszero_4(const sp_digit* a) #endif /* WOLFSSL_VALIDATE_ECC_KEYGEN || HAVE_ECC_SIGN || HAVE_ECC_VERIFY */ extern void sp_256_add_one_4(sp_digit* a); -extern void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n); +extern void sp_256_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); +extern void sp_256_from_bin_movbe(sp_digit* r, int size, const byte* a, int n); +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. 
+ * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_256_from_bin_movbe(r, size, a, n); + } + else { + sp_256_from_bin_bswap(r, size, a, n); + } +} + /* Generates a scalar that is in the range 1..order-1. * * rng Random number generator. @@ -22192,7 +22329,26 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) } #ifdef HAVE_ECC_DHE -extern void sp_256_to_bin(sp_digit* r, byte* a); +extern void sp_256_to_bin_bswap(sp_digit* r, byte* a); +extern void sp_256_to_bin_movbe(sp_digit* r, byte* a); +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 32 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_256_to_bin(sp_digit* r, byte* a) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_256_to_bin_movbe(r, a); + } + else { + sp_256_to_bin_bswap(r, a); + } +} + /* Multiply the point by the scalar and serialize the X ordinate. * The number is 0 padded to maximum size on output. * @@ -27886,7 +28042,27 @@ static int sp_384_iszero_6(const sp_digit* a) #endif /* WOLFSSL_VALIDATE_ECC_KEYGEN || HAVE_ECC_SIGN || HAVE_ECC_VERIFY */ extern void sp_384_add_one_6(sp_digit* a); -extern void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n); +extern void sp_384_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); +extern void sp_384_from_bin_movbe(sp_digit* r, int size, const byte* a, int n); +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_384_from_bin_movbe(r, size, a, n); + } + else { + sp_384_from_bin_bswap(r, size, a, n); + } +} + /* Generates a scalar that is in the range 1..order-1. * * rng Random number generator. @@ -28013,7 +28189,26 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) } #ifdef HAVE_ECC_DHE -extern void sp_384_to_bin(sp_digit* r, byte* a); +extern void sp_384_to_bin_bswap(sp_digit* r, byte* a); +extern void sp_384_to_bin_movbe(sp_digit* r, byte* a); +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 48 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_384_to_bin(sp_digit* r, byte* a) +{ + word32 cpuid_flags = cpuid_get_flags(); + + if (IS_INTEL_MOVBE(cpuid_flags)) { + sp_384_to_bin_movbe(r, a); + } + else { + sp_384_to_bin_bswap(r, a); + } +} + /* Multiply the point by the scalar and serialize the X ordinate. * The number is 0 padded to maximum size on output. * diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index c6941f1f0..58ae2271a 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -23,6 +23,7 @@ #ifndef WOLFSSL_SP_NO_2048 #ifndef WOLFSSL_SP_NO_2048 /* Read big endian unsigned byte array into r. + * Uses the bswap instruction. * * r A single precision integer. * size Maximum number of bytes to convert @@ -30,22 +31,114 @@ * n Number of bytes in array to read. 
*/ #ifndef __APPLE__ -.globl sp_2048_from_bin -.type sp_2048_from_bin,@function +.globl sp_2048_from_bin_bswap +.type sp_2048_from_bin_bswap,@function .align 16 -sp_2048_from_bin: +sp_2048_from_bin_bswap: #else -.globl _sp_2048_from_bin +.globl _sp_2048_from_bin_bswap .p2align 4 -_sp_2048_from_bin: +_sp_2048_from_bin_bswap: #endif /* __APPLE__ */ movq %rdx, %r9 movq %rdi, %r10 addq %rcx, %r9 addq $256, %r10 xorq %r11, %r11 - jmp L_2048_from_bin_64_end -L_2048_from_bin_64_start: + jmp L_2048_from_bin_bswap_64_end +L_2048_from_bin_bswap_64_start: + subq $64, %r9 + movq 56(%r9), %rax + movq 48(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq 40(%r9), %rax + movq 32(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 16(%rdi) + movq %r8, 24(%rdi) + movq 24(%r9), %rax + movq 16(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 32(%rdi) + movq %r8, 40(%rdi) + movq 8(%r9), %rax + movq (%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 48(%rdi) + movq %r8, 56(%rdi) + addq $64, %rdi + subq $64, %rcx +L_2048_from_bin_bswap_64_end: + cmpq $63, %rcx + jg L_2048_from_bin_bswap_64_start + jmp L_2048_from_bin_bswap_8_end +L_2048_from_bin_bswap_8_start: + subq $8, %r9 + movq (%r9), %rax + bswapq %rax + movq %rax, (%rdi) + addq $8, %rdi + subq $8, %rcx +L_2048_from_bin_bswap_8_end: + cmpq $7, %rcx + jg L_2048_from_bin_bswap_8_start + cmpq %r11, %rcx + je L_2048_from_bin_bswap_hi_end + movq %r11, %r8 + movq %r11, %rax +L_2048_from_bin_bswap_hi_start: + movb (%rdx), %al + shlq $8, %r8 + incq %rdx + addq %rax, %r8 + decq %rcx + jg L_2048_from_bin_bswap_hi_start + movq %r8, (%rdi) + addq $8, %rdi +L_2048_from_bin_bswap_hi_end: + cmpq %r10, %rdi + je L_2048_from_bin_bswap_zero_end +L_2048_from_bin_bswap_zero_start: + movq %r11, (%rdi) + addq $8, %rdi + cmpq %r10, %rdi + jl L_2048_from_bin_bswap_zero_start +L_2048_from_bin_bswap_zero_end: + repz retq +#ifndef __APPLE__ +.size sp_2048_from_bin_bswap,.-sp_2048_from_bin_bswap +#endif /* __APPLE__ */ +/* Read big endian unsigned byte array into r. + * Uses the movbe instruction which is an optional instruction. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
+ */ +#ifndef __APPLE__ +.globl sp_2048_from_bin_movbe +.type sp_2048_from_bin_movbe,@function +.align 16 +sp_2048_from_bin_movbe: +#else +.globl _sp_2048_from_bin_movbe +.p2align 4 +_sp_2048_from_bin_movbe: +#endif /* __APPLE__ */ + movq %rdx, %r9 + movq %rdi, %r10 + addq %rcx, %r9 + addq $256, %r10 + xorq %r11, %r11 + jmp L_2048_from_bin_movbe_64_end +L_2048_from_bin_movbe_64_start: subq $64, %r9 movbeq 56(%r9), %rax movbeq 48(%r9), %r8 @@ -65,60 +158,178 @@ L_2048_from_bin_64_start: movq %r8, 56(%rdi) addq $64, %rdi subq $64, %rcx -L_2048_from_bin_64_end: +L_2048_from_bin_movbe_64_end: cmpq $63, %rcx - jg L_2048_from_bin_64_start - jmp L_2048_from_bin_8_end -L_2048_from_bin_8_start: + jg L_2048_from_bin_movbe_64_start + jmp L_2048_from_bin_movbe_8_end +L_2048_from_bin_movbe_8_start: subq $8, %r9 movbeq (%r9), %rax movq %rax, (%rdi) addq $8, %rdi subq $8, %rcx -L_2048_from_bin_8_end: +L_2048_from_bin_movbe_8_end: cmpq $7, %rcx - jg L_2048_from_bin_8_start + jg L_2048_from_bin_movbe_8_start cmpq %r11, %rcx - je L_2048_from_bin_hi_end + je L_2048_from_bin_movbe_hi_end movq %r11, %r8 movq %r11, %rax -L_2048_from_bin_hi_start: +L_2048_from_bin_movbe_hi_start: movb (%rdx), %al shlq $8, %r8 incq %rdx addq %rax, %r8 decq %rcx - jg L_2048_from_bin_hi_start + jg L_2048_from_bin_movbe_hi_start movq %r8, (%rdi) addq $8, %rdi -L_2048_from_bin_hi_end: +L_2048_from_bin_movbe_hi_end: cmpq %r10, %rdi - je L_2048_from_bin_zero_end -L_2048_from_bin_zero_start: + je L_2048_from_bin_movbe_zero_end +L_2048_from_bin_movbe_zero_start: movq %r11, (%rdi) addq $8, %rdi cmpq %r10, %rdi - jl L_2048_from_bin_zero_start -L_2048_from_bin_zero_end: + jl L_2048_from_bin_movbe_zero_start +L_2048_from_bin_movbe_zero_end: repz retq #ifndef __APPLE__ -.size sp_2048_from_bin,.-sp_2048_from_bin +.size sp_2048_from_bin_movbe,.-sp_2048_from_bin_movbe #endif /* __APPLE__ */ /* Write r as big endian to byte array. * Fixed length number of bytes written: 256 + * Uses the bswap instruction. * * r A single precision integer. * a Byte array. 
*/ #ifndef __APPLE__ -.globl sp_2048_to_bin -.type sp_2048_to_bin,@function +.globl sp_2048_to_bin_bswap +.type sp_2048_to_bin_bswap,@function .align 16 -sp_2048_to_bin: +sp_2048_to_bin_bswap: #else -.globl _sp_2048_to_bin +.globl _sp_2048_to_bin_bswap .p2align 4 -_sp_2048_to_bin: +_sp_2048_to_bin_bswap: +#endif /* __APPLE__ */ + movq 248(%rdi), %rdx + movq 240(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, (%rsi) + movq %rax, 8(%rsi) + movq 232(%rdi), %rdx + movq 224(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 16(%rsi) + movq %rax, 24(%rsi) + movq 216(%rdi), %rdx + movq 208(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 32(%rsi) + movq %rax, 40(%rsi) + movq 200(%rdi), %rdx + movq 192(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 48(%rsi) + movq %rax, 56(%rsi) + movq 184(%rdi), %rdx + movq 176(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 64(%rsi) + movq %rax, 72(%rsi) + movq 168(%rdi), %rdx + movq 160(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 80(%rsi) + movq %rax, 88(%rsi) + movq 152(%rdi), %rdx + movq 144(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 96(%rsi) + movq %rax, 104(%rsi) + movq 136(%rdi), %rdx + movq 128(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 112(%rsi) + movq %rax, 120(%rsi) + movq 120(%rdi), %rdx + movq 112(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 128(%rsi) + movq %rax, 136(%rsi) + movq 104(%rdi), %rdx + movq 96(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 144(%rsi) + movq %rax, 152(%rsi) + movq 88(%rdi), %rdx + movq 80(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 160(%rsi) + movq %rax, 168(%rsi) + movq 72(%rdi), %rdx + movq 64(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 176(%rsi) + movq %rax, 184(%rsi) + movq 56(%rdi), %rdx + movq 48(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 192(%rsi) + movq %rax, 200(%rsi) + movq 40(%rdi), %rdx + movq 32(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 208(%rsi) + movq %rax, 216(%rsi) + movq 24(%rdi), %rdx + movq 16(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 224(%rsi) + movq %rax, 232(%rsi) + movq 8(%rdi), %rdx + movq (%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 240(%rsi) + movq %rax, 248(%rsi) + repz retq +#ifndef __APPLE__ +.size sp_2048_to_bin_bswap,.-sp_2048_to_bin_bswap +#endif /* __APPLE__ */ +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 256 + * Uses the movbe instruction which is optional. + * + * r A single precision integer. + * a Byte array. + */ +#ifndef __APPLE__ +.globl sp_2048_to_bin_movbe +.type sp_2048_to_bin_movbe,@function +.align 16 +sp_2048_to_bin_movbe: +#else +.globl _sp_2048_to_bin_movbe +.p2align 4 +_sp_2048_to_bin_movbe: #endif /* __APPLE__ */ movbeq 248(%rdi), %rdx movbeq 240(%rdi), %rax @@ -186,7 +397,7 @@ _sp_2048_to_bin: movq %rax, 248(%rsi) repz retq #ifndef __APPLE__ -.size sp_2048_to_bin,.-sp_2048_to_bin +.size sp_2048_to_bin_movbe,.-sp_2048_to_bin_movbe #endif /* __APPLE__ */ /* Multiply a and b into r. (r = a * b) * @@ -12288,6 +12499,7 @@ _sp_2048_lshift_32: #ifndef WOLFSSL_SP_NO_3072 #ifndef WOLFSSL_SP_NO_3072 /* Read big endian unsigned byte array into r. + * Uses the bswap instruction. * * r A single precision integer. * size Maximum number of bytes to convert @@ -12295,22 +12507,114 @@ _sp_2048_lshift_32: * n Number of bytes in array to read. 
*/ #ifndef __APPLE__ -.globl sp_3072_from_bin -.type sp_3072_from_bin,@function +.globl sp_3072_from_bin_bswap +.type sp_3072_from_bin_bswap,@function .align 16 -sp_3072_from_bin: +sp_3072_from_bin_bswap: #else -.globl _sp_3072_from_bin +.globl _sp_3072_from_bin_bswap .p2align 4 -_sp_3072_from_bin: +_sp_3072_from_bin_bswap: #endif /* __APPLE__ */ movq %rdx, %r9 movq %rdi, %r10 addq %rcx, %r9 addq $384, %r10 xorq %r11, %r11 - jmp L_3072_from_bin_64_end -L_3072_from_bin_64_start: + jmp L_3072_from_bin_bswap_64_end +L_3072_from_bin_bswap_64_start: + subq $64, %r9 + movq 56(%r9), %rax + movq 48(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq 40(%r9), %rax + movq 32(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 16(%rdi) + movq %r8, 24(%rdi) + movq 24(%r9), %rax + movq 16(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 32(%rdi) + movq %r8, 40(%rdi) + movq 8(%r9), %rax + movq (%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 48(%rdi) + movq %r8, 56(%rdi) + addq $64, %rdi + subq $64, %rcx +L_3072_from_bin_bswap_64_end: + cmpq $63, %rcx + jg L_3072_from_bin_bswap_64_start + jmp L_3072_from_bin_bswap_8_end +L_3072_from_bin_bswap_8_start: + subq $8, %r9 + movq (%r9), %rax + bswapq %rax + movq %rax, (%rdi) + addq $8, %rdi + subq $8, %rcx +L_3072_from_bin_bswap_8_end: + cmpq $7, %rcx + jg L_3072_from_bin_bswap_8_start + cmpq %r11, %rcx + je L_3072_from_bin_bswap_hi_end + movq %r11, %r8 + movq %r11, %rax +L_3072_from_bin_bswap_hi_start: + movb (%rdx), %al + shlq $8, %r8 + incq %rdx + addq %rax, %r8 + decq %rcx + jg L_3072_from_bin_bswap_hi_start + movq %r8, (%rdi) + addq $8, %rdi +L_3072_from_bin_bswap_hi_end: + cmpq %r10, %rdi + je L_3072_from_bin_bswap_zero_end +L_3072_from_bin_bswap_zero_start: + movq %r11, (%rdi) + addq $8, %rdi + cmpq %r10, %rdi + jl L_3072_from_bin_bswap_zero_start +L_3072_from_bin_bswap_zero_end: + repz retq +#ifndef __APPLE__ +.size sp_3072_from_bin_bswap,.-sp_3072_from_bin_bswap +#endif /* __APPLE__ */ +/* Read big endian unsigned byte array into r. + * Uses the movbe instruction which is an optional instruction. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
+ */ +#ifndef __APPLE__ +.globl sp_3072_from_bin_movbe +.type sp_3072_from_bin_movbe,@function +.align 16 +sp_3072_from_bin_movbe: +#else +.globl _sp_3072_from_bin_movbe +.p2align 4 +_sp_3072_from_bin_movbe: +#endif /* __APPLE__ */ + movq %rdx, %r9 + movq %rdi, %r10 + addq %rcx, %r9 + addq $384, %r10 + xorq %r11, %r11 + jmp L_3072_from_bin_movbe_64_end +L_3072_from_bin_movbe_64_start: subq $64, %r9 movbeq 56(%r9), %rax movbeq 48(%r9), %r8 @@ -12330,60 +12634,226 @@ L_3072_from_bin_64_start: movq %r8, 56(%rdi) addq $64, %rdi subq $64, %rcx -L_3072_from_bin_64_end: +L_3072_from_bin_movbe_64_end: cmpq $63, %rcx - jg L_3072_from_bin_64_start - jmp L_3072_from_bin_8_end -L_3072_from_bin_8_start: + jg L_3072_from_bin_movbe_64_start + jmp L_3072_from_bin_movbe_8_end +L_3072_from_bin_movbe_8_start: subq $8, %r9 movbeq (%r9), %rax movq %rax, (%rdi) addq $8, %rdi subq $8, %rcx -L_3072_from_bin_8_end: +L_3072_from_bin_movbe_8_end: cmpq $7, %rcx - jg L_3072_from_bin_8_start + jg L_3072_from_bin_movbe_8_start cmpq %r11, %rcx - je L_3072_from_bin_hi_end + je L_3072_from_bin_movbe_hi_end movq %r11, %r8 movq %r11, %rax -L_3072_from_bin_hi_start: +L_3072_from_bin_movbe_hi_start: movb (%rdx), %al shlq $8, %r8 incq %rdx addq %rax, %r8 decq %rcx - jg L_3072_from_bin_hi_start + jg L_3072_from_bin_movbe_hi_start movq %r8, (%rdi) addq $8, %rdi -L_3072_from_bin_hi_end: +L_3072_from_bin_movbe_hi_end: cmpq %r10, %rdi - je L_3072_from_bin_zero_end -L_3072_from_bin_zero_start: + je L_3072_from_bin_movbe_zero_end +L_3072_from_bin_movbe_zero_start: movq %r11, (%rdi) addq $8, %rdi cmpq %r10, %rdi - jl L_3072_from_bin_zero_start -L_3072_from_bin_zero_end: + jl L_3072_from_bin_movbe_zero_start +L_3072_from_bin_movbe_zero_end: repz retq #ifndef __APPLE__ -.size sp_3072_from_bin,.-sp_3072_from_bin +.size sp_3072_from_bin_movbe,.-sp_3072_from_bin_movbe #endif /* __APPLE__ */ /* Write r as big endian to byte array. * Fixed length number of bytes written: 384 + * Uses the bswap instruction. * * r A single precision integer. * a Byte array. 
*/ #ifndef __APPLE__ -.globl sp_3072_to_bin -.type sp_3072_to_bin,@function +.globl sp_3072_to_bin_bswap +.type sp_3072_to_bin_bswap,@function .align 16 -sp_3072_to_bin: +sp_3072_to_bin_bswap: #else -.globl _sp_3072_to_bin +.globl _sp_3072_to_bin_bswap .p2align 4 -_sp_3072_to_bin: +_sp_3072_to_bin_bswap: +#endif /* __APPLE__ */ + movq 376(%rdi), %rdx + movq 368(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, (%rsi) + movq %rax, 8(%rsi) + movq 360(%rdi), %rdx + movq 352(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 16(%rsi) + movq %rax, 24(%rsi) + movq 344(%rdi), %rdx + movq 336(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 32(%rsi) + movq %rax, 40(%rsi) + movq 328(%rdi), %rdx + movq 320(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 48(%rsi) + movq %rax, 56(%rsi) + movq 312(%rdi), %rdx + movq 304(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 64(%rsi) + movq %rax, 72(%rsi) + movq 296(%rdi), %rdx + movq 288(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 80(%rsi) + movq %rax, 88(%rsi) + movq 280(%rdi), %rdx + movq 272(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 96(%rsi) + movq %rax, 104(%rsi) + movq 264(%rdi), %rdx + movq 256(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 112(%rsi) + movq %rax, 120(%rsi) + movq 248(%rdi), %rdx + movq 240(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 128(%rsi) + movq %rax, 136(%rsi) + movq 232(%rdi), %rdx + movq 224(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 144(%rsi) + movq %rax, 152(%rsi) + movq 216(%rdi), %rdx + movq 208(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 160(%rsi) + movq %rax, 168(%rsi) + movq 200(%rdi), %rdx + movq 192(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 176(%rsi) + movq %rax, 184(%rsi) + movq 184(%rdi), %rdx + movq 176(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 192(%rsi) + movq %rax, 200(%rsi) + movq 168(%rdi), %rdx + movq 160(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 208(%rsi) + movq %rax, 216(%rsi) + movq 152(%rdi), %rdx + movq 144(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 224(%rsi) + movq %rax, 232(%rsi) + movq 136(%rdi), %rdx + movq 128(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 240(%rsi) + movq %rax, 248(%rsi) + movq 120(%rdi), %rdx + movq 112(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 256(%rsi) + movq %rax, 264(%rsi) + movq 104(%rdi), %rdx + movq 96(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 272(%rsi) + movq %rax, 280(%rsi) + movq 88(%rdi), %rdx + movq 80(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 288(%rsi) + movq %rax, 296(%rsi) + movq 72(%rdi), %rdx + movq 64(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 304(%rsi) + movq %rax, 312(%rsi) + movq 56(%rdi), %rdx + movq 48(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 320(%rsi) + movq %rax, 328(%rsi) + movq 40(%rdi), %rdx + movq 32(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 336(%rsi) + movq %rax, 344(%rsi) + movq 24(%rdi), %rdx + movq 16(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 352(%rsi) + movq %rax, 360(%rsi) + movq 8(%rdi), %rdx + movq (%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 368(%rsi) + movq %rax, 376(%rsi) + repz retq +#ifndef __APPLE__ +.size sp_3072_to_bin_bswap,.-sp_3072_to_bin_bswap +#endif /* __APPLE__ */ +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 384 + * Uses the movbe instruction which is optional. + * + * r A single precision integer. + * a Byte array. 
+ */ +#ifndef __APPLE__ +.globl sp_3072_to_bin_movbe +.type sp_3072_to_bin_movbe,@function +.align 16 +sp_3072_to_bin_movbe: +#else +.globl _sp_3072_to_bin_movbe +.p2align 4 +_sp_3072_to_bin_movbe: #endif /* __APPLE__ */ movbeq 376(%rdi), %rdx movbeq 368(%rdi), %rax @@ -12483,7 +12953,7 @@ _sp_3072_to_bin: movq %rax, 376(%rsi) repz retq #ifndef __APPLE__ -.size sp_3072_to_bin,.-sp_3072_to_bin +.size sp_3072_to_bin_movbe,.-sp_3072_to_bin_movbe #endif /* __APPLE__ */ /* Multiply a and b into r. (r = a * b) * @@ -27167,6 +27637,7 @@ _sp_3072_lshift_48: #ifdef WOLFSSL_SP_4096 #ifdef WOLFSSL_SP_4096 /* Read big endian unsigned byte array into r. + * Uses the bswap instruction. * * r A single precision integer. * size Maximum number of bytes to convert @@ -27174,22 +27645,114 @@ _sp_3072_lshift_48: * n Number of bytes in array to read. */ #ifndef __APPLE__ -.globl sp_4096_from_bin -.type sp_4096_from_bin,@function +.globl sp_4096_from_bin_bswap +.type sp_4096_from_bin_bswap,@function .align 16 -sp_4096_from_bin: +sp_4096_from_bin_bswap: #else -.globl _sp_4096_from_bin +.globl _sp_4096_from_bin_bswap .p2align 4 -_sp_4096_from_bin: +_sp_4096_from_bin_bswap: #endif /* __APPLE__ */ movq %rdx, %r9 movq %rdi, %r10 addq %rcx, %r9 addq $512, %r10 xorq %r11, %r11 - jmp L_4096_from_bin_64_end -L_4096_from_bin_64_start: + jmp L_4096_from_bin_bswap_64_end +L_4096_from_bin_bswap_64_start: + subq $64, %r9 + movq 56(%r9), %rax + movq 48(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq 40(%r9), %rax + movq 32(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 16(%rdi) + movq %r8, 24(%rdi) + movq 24(%r9), %rax + movq 16(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 32(%rdi) + movq %r8, 40(%rdi) + movq 8(%r9), %rax + movq (%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 48(%rdi) + movq %r8, 56(%rdi) + addq $64, %rdi + subq $64, %rcx +L_4096_from_bin_bswap_64_end: + cmpq $63, %rcx + jg L_4096_from_bin_bswap_64_start + jmp L_4096_from_bin_bswap_8_end +L_4096_from_bin_bswap_8_start: + subq $8, %r9 + movq (%r9), %rax + bswapq %rax + movq %rax, (%rdi) + addq $8, %rdi + subq $8, %rcx +L_4096_from_bin_bswap_8_end: + cmpq $7, %rcx + jg L_4096_from_bin_bswap_8_start + cmpq %r11, %rcx + je L_4096_from_bin_bswap_hi_end + movq %r11, %r8 + movq %r11, %rax +L_4096_from_bin_bswap_hi_start: + movb (%rdx), %al + shlq $8, %r8 + incq %rdx + addq %rax, %r8 + decq %rcx + jg L_4096_from_bin_bswap_hi_start + movq %r8, (%rdi) + addq $8, %rdi +L_4096_from_bin_bswap_hi_end: + cmpq %r10, %rdi + je L_4096_from_bin_bswap_zero_end +L_4096_from_bin_bswap_zero_start: + movq %r11, (%rdi) + addq $8, %rdi + cmpq %r10, %rdi + jl L_4096_from_bin_bswap_zero_start +L_4096_from_bin_bswap_zero_end: + repz retq +#ifndef __APPLE__ +.size sp_4096_from_bin_bswap,.-sp_4096_from_bin_bswap +#endif /* __APPLE__ */ +/* Read big endian unsigned byte array into r. + * Uses the movbe instruction which is an optional instruction. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
+ */ +#ifndef __APPLE__ +.globl sp_4096_from_bin_movbe +.type sp_4096_from_bin_movbe,@function +.align 16 +sp_4096_from_bin_movbe: +#else +.globl _sp_4096_from_bin_movbe +.p2align 4 +_sp_4096_from_bin_movbe: +#endif /* __APPLE__ */ + movq %rdx, %r9 + movq %rdi, %r10 + addq %rcx, %r9 + addq $512, %r10 + xorq %r11, %r11 + jmp L_4096_from_bin_movbe_64_end +L_4096_from_bin_movbe_64_start: subq $64, %r9 movbeq 56(%r9), %rax movbeq 48(%r9), %r8 @@ -27209,60 +27772,274 @@ L_4096_from_bin_64_start: movq %r8, 56(%rdi) addq $64, %rdi subq $64, %rcx -L_4096_from_bin_64_end: +L_4096_from_bin_movbe_64_end: cmpq $63, %rcx - jg L_4096_from_bin_64_start - jmp L_4096_from_bin_8_end -L_4096_from_bin_8_start: + jg L_4096_from_bin_movbe_64_start + jmp L_4096_from_bin_movbe_8_end +L_4096_from_bin_movbe_8_start: subq $8, %r9 movbeq (%r9), %rax movq %rax, (%rdi) addq $8, %rdi subq $8, %rcx -L_4096_from_bin_8_end: +L_4096_from_bin_movbe_8_end: cmpq $7, %rcx - jg L_4096_from_bin_8_start + jg L_4096_from_bin_movbe_8_start cmpq %r11, %rcx - je L_4096_from_bin_hi_end + je L_4096_from_bin_movbe_hi_end movq %r11, %r8 movq %r11, %rax -L_4096_from_bin_hi_start: +L_4096_from_bin_movbe_hi_start: movb (%rdx), %al shlq $8, %r8 incq %rdx addq %rax, %r8 decq %rcx - jg L_4096_from_bin_hi_start + jg L_4096_from_bin_movbe_hi_start movq %r8, (%rdi) addq $8, %rdi -L_4096_from_bin_hi_end: +L_4096_from_bin_movbe_hi_end: cmpq %r10, %rdi - je L_4096_from_bin_zero_end -L_4096_from_bin_zero_start: + je L_4096_from_bin_movbe_zero_end +L_4096_from_bin_movbe_zero_start: movq %r11, (%rdi) addq $8, %rdi cmpq %r10, %rdi - jl L_4096_from_bin_zero_start -L_4096_from_bin_zero_end: + jl L_4096_from_bin_movbe_zero_start +L_4096_from_bin_movbe_zero_end: repz retq #ifndef __APPLE__ -.size sp_4096_from_bin,.-sp_4096_from_bin +.size sp_4096_from_bin_movbe,.-sp_4096_from_bin_movbe #endif /* __APPLE__ */ /* Write r as big endian to byte array. * Fixed length number of bytes written: 512 + * Uses the bswap instruction. * * r A single precision integer. * a Byte array. 
*/ #ifndef __APPLE__ -.globl sp_4096_to_bin -.type sp_4096_to_bin,@function +.globl sp_4096_to_bin_bswap +.type sp_4096_to_bin_bswap,@function .align 16 -sp_4096_to_bin: +sp_4096_to_bin_bswap: #else -.globl _sp_4096_to_bin +.globl _sp_4096_to_bin_bswap .p2align 4 -_sp_4096_to_bin: +_sp_4096_to_bin_bswap: +#endif /* __APPLE__ */ + movq 504(%rdi), %rdx + movq 496(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, (%rsi) + movq %rax, 8(%rsi) + movq 488(%rdi), %rdx + movq 480(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 16(%rsi) + movq %rax, 24(%rsi) + movq 472(%rdi), %rdx + movq 464(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 32(%rsi) + movq %rax, 40(%rsi) + movq 456(%rdi), %rdx + movq 448(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 48(%rsi) + movq %rax, 56(%rsi) + movq 440(%rdi), %rdx + movq 432(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 64(%rsi) + movq %rax, 72(%rsi) + movq 424(%rdi), %rdx + movq 416(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 80(%rsi) + movq %rax, 88(%rsi) + movq 408(%rdi), %rdx + movq 400(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 96(%rsi) + movq %rax, 104(%rsi) + movq 392(%rdi), %rdx + movq 384(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 112(%rsi) + movq %rax, 120(%rsi) + movq 376(%rdi), %rdx + movq 368(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 128(%rsi) + movq %rax, 136(%rsi) + movq 360(%rdi), %rdx + movq 352(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 144(%rsi) + movq %rax, 152(%rsi) + movq 344(%rdi), %rdx + movq 336(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 160(%rsi) + movq %rax, 168(%rsi) + movq 328(%rdi), %rdx + movq 320(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 176(%rsi) + movq %rax, 184(%rsi) + movq 312(%rdi), %rdx + movq 304(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 192(%rsi) + movq %rax, 200(%rsi) + movq 296(%rdi), %rdx + movq 288(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 208(%rsi) + movq %rax, 216(%rsi) + movq 280(%rdi), %rdx + movq 272(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 224(%rsi) + movq %rax, 232(%rsi) + movq 264(%rdi), %rdx + movq 256(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 240(%rsi) + movq %rax, 248(%rsi) + movq 248(%rdi), %rdx + movq 240(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 256(%rsi) + movq %rax, 264(%rsi) + movq 232(%rdi), %rdx + movq 224(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 272(%rsi) + movq %rax, 280(%rsi) + movq 216(%rdi), %rdx + movq 208(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 288(%rsi) + movq %rax, 296(%rsi) + movq 200(%rdi), %rdx + movq 192(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 304(%rsi) + movq %rax, 312(%rsi) + movq 184(%rdi), %rdx + movq 176(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 320(%rsi) + movq %rax, 328(%rsi) + movq 168(%rdi), %rdx + movq 160(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 336(%rsi) + movq %rax, 344(%rsi) + movq 152(%rdi), %rdx + movq 144(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 352(%rsi) + movq %rax, 360(%rsi) + movq 136(%rdi), %rdx + movq 128(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 368(%rsi) + movq %rax, 376(%rsi) + movq 120(%rdi), %rdx + movq 112(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 384(%rsi) + movq %rax, 392(%rsi) + movq 104(%rdi), %rdx + movq 96(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 400(%rsi) + movq %rax, 408(%rsi) + movq 88(%rdi), %rdx + movq 80(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 416(%rsi) + movq %rax, 424(%rsi) + 
movq 72(%rdi), %rdx + movq 64(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 432(%rsi) + movq %rax, 440(%rsi) + movq 56(%rdi), %rdx + movq 48(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 448(%rsi) + movq %rax, 456(%rsi) + movq 40(%rdi), %rdx + movq 32(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 464(%rsi) + movq %rax, 472(%rsi) + movq 24(%rdi), %rdx + movq 16(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 480(%rsi) + movq %rax, 488(%rsi) + movq 8(%rdi), %rdx + movq (%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 496(%rsi) + movq %rax, 504(%rsi) + repz retq +#ifndef __APPLE__ +.size sp_4096_to_bin_bswap,.-sp_4096_to_bin_bswap +#endif /* __APPLE__ */ +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 512 + * Uses the movbe instruction which is optional. + * + * r A single precision integer. + * a Byte array. + */ +#ifndef __APPLE__ +.globl sp_4096_to_bin_movbe +.type sp_4096_to_bin_movbe,@function +.align 16 +sp_4096_to_bin_movbe: +#else +.globl _sp_4096_to_bin_movbe +.p2align 4 +_sp_4096_to_bin_movbe: #endif /* __APPLE__ */ movbeq 504(%rdi), %rdx movbeq 496(%rdi), %rax @@ -27394,7 +28171,7 @@ _sp_4096_to_bin: movq %rax, 504(%rsi) repz retq #ifndef __APPLE__ -.size sp_4096_to_bin,.-sp_4096_to_bin +.size sp_4096_to_bin_movbe,.-sp_4096_to_bin_movbe #endif /* __APPLE__ */ /* Sub b from a into a. (a -= b) * @@ -38579,6 +39356,7 @@ _sp_256_add_one_4: .size sp_256_add_one_4,.-sp_256_add_one_4 #endif /* __APPLE__ */ /* Read big endian unsigned byte array into r. + * Uses the bswap instruction. * * r A single precision integer. * size Maximum number of bytes to convert @@ -38586,22 +39364,114 @@ _sp_256_add_one_4: * n Number of bytes in array to read. */ #ifndef __APPLE__ -.globl sp_256_from_bin -.type sp_256_from_bin,@function +.globl sp_256_from_bin_bswap +.type sp_256_from_bin_bswap,@function .align 16 -sp_256_from_bin: +sp_256_from_bin_bswap: #else -.globl _sp_256_from_bin +.globl _sp_256_from_bin_bswap .p2align 4 -_sp_256_from_bin: +_sp_256_from_bin_bswap: #endif /* __APPLE__ */ movq %rdx, %r9 movq %rdi, %r10 addq %rcx, %r9 addq $32, %r10 xorq %r11, %r11 - jmp L_256_from_bin_64_end -L_256_from_bin_64_start: + jmp L_256_from_bin_bswap_64_end +L_256_from_bin_bswap_64_start: + subq $64, %r9 + movq 56(%r9), %rax + movq 48(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq 40(%r9), %rax + movq 32(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 16(%rdi) + movq %r8, 24(%rdi) + movq 24(%r9), %rax + movq 16(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 32(%rdi) + movq %r8, 40(%rdi) + movq 8(%r9), %rax + movq (%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 48(%rdi) + movq %r8, 56(%rdi) + addq $64, %rdi + subq $64, %rcx +L_256_from_bin_bswap_64_end: + cmpq $63, %rcx + jg L_256_from_bin_bswap_64_start + jmp L_256_from_bin_bswap_8_end +L_256_from_bin_bswap_8_start: + subq $8, %r9 + movq (%r9), %rax + bswapq %rax + movq %rax, (%rdi) + addq $8, %rdi + subq $8, %rcx +L_256_from_bin_bswap_8_end: + cmpq $7, %rcx + jg L_256_from_bin_bswap_8_start + cmpq %r11, %rcx + je L_256_from_bin_bswap_hi_end + movq %r11, %r8 + movq %r11, %rax +L_256_from_bin_bswap_hi_start: + movb (%rdx), %al + shlq $8, %r8 + incq %rdx + addq %rax, %r8 + decq %rcx + jg L_256_from_bin_bswap_hi_start + movq %r8, (%rdi) + addq $8, %rdi +L_256_from_bin_bswap_hi_end: + cmpq %r10, %rdi + je L_256_from_bin_bswap_zero_end +L_256_from_bin_bswap_zero_start: + movq %r11, (%rdi) + addq $8, %rdi + cmpq %r10, %rdi + jl 
L_256_from_bin_bswap_zero_start +L_256_from_bin_bswap_zero_end: + repz retq +#ifndef __APPLE__ +.size sp_256_from_bin_bswap,.-sp_256_from_bin_bswap +#endif /* __APPLE__ */ +/* Read big endian unsigned byte array into r. + * Uses the movbe instruction which is an optional instruction. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +#ifndef __APPLE__ +.globl sp_256_from_bin_movbe +.type sp_256_from_bin_movbe,@function +.align 16 +sp_256_from_bin_movbe: +#else +.globl _sp_256_from_bin_movbe +.p2align 4 +_sp_256_from_bin_movbe: +#endif /* __APPLE__ */ + movq %rdx, %r9 + movq %rdi, %r10 + addq %rcx, %r9 + addq $32, %r10 + xorq %r11, %r11 + jmp L_256_from_bin_movbe_64_end +L_256_from_bin_movbe_64_start: subq $64, %r9 movbeq 56(%r9), %rax movbeq 48(%r9), %r8 @@ -38621,60 +39491,94 @@ L_256_from_bin_64_start: movq %r8, 56(%rdi) addq $64, %rdi subq $64, %rcx -L_256_from_bin_64_end: +L_256_from_bin_movbe_64_end: cmpq $63, %rcx - jg L_256_from_bin_64_start - jmp L_256_from_bin_8_end -L_256_from_bin_8_start: + jg L_256_from_bin_movbe_64_start + jmp L_256_from_bin_movbe_8_end +L_256_from_bin_movbe_8_start: subq $8, %r9 movbeq (%r9), %rax movq %rax, (%rdi) addq $8, %rdi subq $8, %rcx -L_256_from_bin_8_end: +L_256_from_bin_movbe_8_end: cmpq $7, %rcx - jg L_256_from_bin_8_start + jg L_256_from_bin_movbe_8_start cmpq %r11, %rcx - je L_256_from_bin_hi_end + je L_256_from_bin_movbe_hi_end movq %r11, %r8 movq %r11, %rax -L_256_from_bin_hi_start: +L_256_from_bin_movbe_hi_start: movb (%rdx), %al shlq $8, %r8 incq %rdx addq %rax, %r8 decq %rcx - jg L_256_from_bin_hi_start + jg L_256_from_bin_movbe_hi_start movq %r8, (%rdi) addq $8, %rdi -L_256_from_bin_hi_end: +L_256_from_bin_movbe_hi_end: cmpq %r10, %rdi - je L_256_from_bin_zero_end -L_256_from_bin_zero_start: + je L_256_from_bin_movbe_zero_end +L_256_from_bin_movbe_zero_start: movq %r11, (%rdi) addq $8, %rdi cmpq %r10, %rdi - jl L_256_from_bin_zero_start -L_256_from_bin_zero_end: + jl L_256_from_bin_movbe_zero_start +L_256_from_bin_movbe_zero_end: repz retq #ifndef __APPLE__ -.size sp_256_from_bin,.-sp_256_from_bin +.size sp_256_from_bin_movbe,.-sp_256_from_bin_movbe #endif /* __APPLE__ */ /* Write r as big endian to byte array. * Fixed length number of bytes written: 32 + * Uses the bswap instruction. * * r A single precision integer. * a Byte array. */ #ifndef __APPLE__ -.globl sp_256_to_bin -.type sp_256_to_bin,@function +.globl sp_256_to_bin_bswap +.type sp_256_to_bin_bswap,@function .align 16 -sp_256_to_bin: +sp_256_to_bin_bswap: #else -.globl _sp_256_to_bin +.globl _sp_256_to_bin_bswap .p2align 4 -_sp_256_to_bin: +_sp_256_to_bin_bswap: +#endif /* __APPLE__ */ + movq 24(%rdi), %rdx + movq 16(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, (%rsi) + movq %rax, 8(%rsi) + movq 8(%rdi), %rdx + movq (%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 16(%rsi) + movq %rax, 24(%rsi) + repz retq +#ifndef __APPLE__ +.size sp_256_to_bin_bswap,.-sp_256_to_bin_bswap +#endif /* __APPLE__ */ +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 32 + * Uses the movbe instruction which is optional. + * + * r A single precision integer. + * a Byte array. 
+ */ +#ifndef __APPLE__ +.globl sp_256_to_bin_movbe +.type sp_256_to_bin_movbe,@function +.align 16 +sp_256_to_bin_movbe: +#else +.globl _sp_256_to_bin_movbe +.p2align 4 +_sp_256_to_bin_movbe: #endif /* __APPLE__ */ movbeq 24(%rdi), %rdx movbeq 16(%rdi), %rax @@ -38686,7 +39590,7 @@ _sp_256_to_bin: movq %rax, 24(%rsi) repz retq #ifndef __APPLE__ -.size sp_256_to_bin,.-sp_256_to_bin +.size sp_256_to_bin_movbe,.-sp_256_to_bin_movbe #endif /* __APPLE__ */ /* Add b to a into r. (r = a + b) * @@ -41500,6 +42404,7 @@ _sp_384_add_one_6: .size sp_384_add_one_6,.-sp_384_add_one_6 #endif /* __APPLE__ */ /* Read big endian unsigned byte array into r. + * Uses the bswap instruction. * * r A single precision integer. * size Maximum number of bytes to convert @@ -41507,22 +42412,114 @@ _sp_384_add_one_6: * n Number of bytes in array to read. */ #ifndef __APPLE__ -.globl sp_384_from_bin -.type sp_384_from_bin,@function +.globl sp_384_from_bin_bswap +.type sp_384_from_bin_bswap,@function .align 16 -sp_384_from_bin: +sp_384_from_bin_bswap: #else -.globl _sp_384_from_bin +.globl _sp_384_from_bin_bswap .p2align 4 -_sp_384_from_bin: +_sp_384_from_bin_bswap: #endif /* __APPLE__ */ movq %rdx, %r9 movq %rdi, %r10 addq %rcx, %r9 addq $48, %r10 xorq %r11, %r11 - jmp L_384_from_bin_64_end -L_384_from_bin_64_start: + jmp L_384_from_bin_bswap_64_end +L_384_from_bin_bswap_64_start: + subq $64, %r9 + movq 56(%r9), %rax + movq 48(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq 40(%r9), %rax + movq 32(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 16(%rdi) + movq %r8, 24(%rdi) + movq 24(%r9), %rax + movq 16(%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 32(%rdi) + movq %r8, 40(%rdi) + movq 8(%r9), %rax + movq (%r9), %r8 + bswapq %rax + bswapq %r8 + movq %rax, 48(%rdi) + movq %r8, 56(%rdi) + addq $64, %rdi + subq $64, %rcx +L_384_from_bin_bswap_64_end: + cmpq $63, %rcx + jg L_384_from_bin_bswap_64_start + jmp L_384_from_bin_bswap_8_end +L_384_from_bin_bswap_8_start: + subq $8, %r9 + movq (%r9), %rax + bswapq %rax + movq %rax, (%rdi) + addq $8, %rdi + subq $8, %rcx +L_384_from_bin_bswap_8_end: + cmpq $7, %rcx + jg L_384_from_bin_bswap_8_start + cmpq %r11, %rcx + je L_384_from_bin_bswap_hi_end + movq %r11, %r8 + movq %r11, %rax +L_384_from_bin_bswap_hi_start: + movb (%rdx), %al + shlq $8, %r8 + incq %rdx + addq %rax, %r8 + decq %rcx + jg L_384_from_bin_bswap_hi_start + movq %r8, (%rdi) + addq $8, %rdi +L_384_from_bin_bswap_hi_end: + cmpq %r10, %rdi + je L_384_from_bin_bswap_zero_end +L_384_from_bin_bswap_zero_start: + movq %r11, (%rdi) + addq $8, %rdi + cmpq %r10, %rdi + jl L_384_from_bin_bswap_zero_start +L_384_from_bin_bswap_zero_end: + repz retq +#ifndef __APPLE__ +.size sp_384_from_bin_bswap,.-sp_384_from_bin_bswap +#endif /* __APPLE__ */ +/* Read big endian unsigned byte array into r. + * Uses the movbe instruction which is an optional instruction. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
+ */ +#ifndef __APPLE__ +.globl sp_384_from_bin_movbe +.type sp_384_from_bin_movbe,@function +.align 16 +sp_384_from_bin_movbe: +#else +.globl _sp_384_from_bin_movbe +.p2align 4 +_sp_384_from_bin_movbe: +#endif /* __APPLE__ */ + movq %rdx, %r9 + movq %rdi, %r10 + addq %rcx, %r9 + addq $48, %r10 + xorq %r11, %r11 + jmp L_384_from_bin_movbe_64_end +L_384_from_bin_movbe_64_start: subq $64, %r9 movbeq 56(%r9), %rax movbeq 48(%r9), %r8 @@ -41542,60 +42539,100 @@ L_384_from_bin_64_start: movq %r8, 56(%rdi) addq $64, %rdi subq $64, %rcx -L_384_from_bin_64_end: +L_384_from_bin_movbe_64_end: cmpq $63, %rcx - jg L_384_from_bin_64_start - jmp L_384_from_bin_8_end -L_384_from_bin_8_start: + jg L_384_from_bin_movbe_64_start + jmp L_384_from_bin_movbe_8_end +L_384_from_bin_movbe_8_start: subq $8, %r9 movbeq (%r9), %rax movq %rax, (%rdi) addq $8, %rdi subq $8, %rcx -L_384_from_bin_8_end: +L_384_from_bin_movbe_8_end: cmpq $7, %rcx - jg L_384_from_bin_8_start + jg L_384_from_bin_movbe_8_start cmpq %r11, %rcx - je L_384_from_bin_hi_end + je L_384_from_bin_movbe_hi_end movq %r11, %r8 movq %r11, %rax -L_384_from_bin_hi_start: +L_384_from_bin_movbe_hi_start: movb (%rdx), %al shlq $8, %r8 incq %rdx addq %rax, %r8 decq %rcx - jg L_384_from_bin_hi_start + jg L_384_from_bin_movbe_hi_start movq %r8, (%rdi) addq $8, %rdi -L_384_from_bin_hi_end: +L_384_from_bin_movbe_hi_end: cmpq %r10, %rdi - je L_384_from_bin_zero_end -L_384_from_bin_zero_start: + je L_384_from_bin_movbe_zero_end +L_384_from_bin_movbe_zero_start: movq %r11, (%rdi) addq $8, %rdi cmpq %r10, %rdi - jl L_384_from_bin_zero_start -L_384_from_bin_zero_end: + jl L_384_from_bin_movbe_zero_start +L_384_from_bin_movbe_zero_end: repz retq #ifndef __APPLE__ -.size sp_384_from_bin,.-sp_384_from_bin +.size sp_384_from_bin_movbe,.-sp_384_from_bin_movbe #endif /* __APPLE__ */ /* Write r as big endian to byte array. * Fixed length number of bytes written: 48 + * Uses the bswap instruction. * * r A single precision integer. * a Byte array. */ #ifndef __APPLE__ -.globl sp_384_to_bin -.type sp_384_to_bin,@function +.globl sp_384_to_bin_bswap +.type sp_384_to_bin_bswap,@function .align 16 -sp_384_to_bin: +sp_384_to_bin_bswap: #else -.globl _sp_384_to_bin +.globl _sp_384_to_bin_bswap .p2align 4 -_sp_384_to_bin: +_sp_384_to_bin_bswap: +#endif /* __APPLE__ */ + movq 40(%rdi), %rdx + movq 32(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, (%rsi) + movq %rax, 8(%rsi) + movq 24(%rdi), %rdx + movq 16(%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 16(%rsi) + movq %rax, 24(%rsi) + movq 8(%rdi), %rdx + movq (%rdi), %rax + bswapq %rdx + bswapq %rax + movq %rdx, 32(%rsi) + movq %rax, 40(%rsi) + repz retq +#ifndef __APPLE__ +.size sp_384_to_bin_bswap,.-sp_384_to_bin_bswap +#endif /* __APPLE__ */ +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 48 + * Uses the movbe instruction which is optional. + * + * r A single precision integer. + * a Byte array. + */ +#ifndef __APPLE__ +.globl sp_384_to_bin_movbe +.type sp_384_to_bin_movbe,@function +.align 16 +sp_384_to_bin_movbe: +#else +.globl _sp_384_to_bin_movbe +.p2align 4 +_sp_384_to_bin_movbe: #endif /* __APPLE__ */ movbeq 40(%rdi), %rdx movbeq 32(%rdi), %rax @@ -41611,7 +42648,7 @@ _sp_384_to_bin: movq %rax, 40(%rsi) repz retq #ifndef __APPLE__ -.size sp_384_to_bin,.-sp_384_to_bin +.size sp_384_to_bin_movbe,.-sp_384_to_bin_movbe #endif /* __APPLE__ */ /* Sub b from a into a. 
(a -= b) * diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index 3c3d1c294..912a01085 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -41,6 +41,7 @@ #define CPUID_BMI2 0x0010 /* MULX, RORX */ #define CPUID_AESNI 0x0020 #define CPUID_ADX 0x0040 /* ADCX, ADOX */ + #define CPUID_MOVBE 0x0080 /* Move and byte swap */ #define IS_INTEL_AVX1(f) ((f) & CPUID_AVX1) #define IS_INTEL_AVX2(f) ((f) & CPUID_AVX2) @@ -49,6 +50,7 @@ #define IS_INTEL_BMI2(f) ((f) & CPUID_BMI2) #define IS_INTEL_AESNI(f) ((f) & CPUID_AESNI) #define IS_INTEL_ADX(f) ((f) & CPUID_ADX) + #define IS_INTEL_MOVBE(f) ((f) & CPUID_MOVBE) void cpuid_set_flags(void); word32 cpuid_get_flags(void);
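
Note on the dispatch pattern this patch introduces: MOVBE support is detected once via CPUID leaf 1, ECX bit 22 (cpuid_flag(1, 0, ECX, 22)), cached in the CPUID_MOVBE flag, and each sp_*_from_bin()/sp_*_to_bin() wrapper then calls the movbe assembly routine only when IS_INTEL_MOVBE() is true, falling back to the bswap routine everywhere else. The standalone C sketch below illustrates that pattern outside of wolfSSL. It is not the wolfSSL code itself: it assumes GCC/Clang's <cpuid.h> helper is available, and the to_bin_* functions are hypothetical stand-ins for the sp_256_to_bin_bswap()/sp_256_to_bin_movbe() assembly pair (both written in plain C here, since the point is the runtime selection, not the instruction choice).

#include <cpuid.h>      /* GCC/Clang helper for the CPUID instruction (assumed available) */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* MOVBE support is reported by CPUID leaf 1 in ECX bit 22 -- the same bit the
 * patch tests with cpuid_flag(1, 0, ECX, 22). */
static int cpu_has_movbe(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    return (ecx >> 22) & 1;
}

/* Hypothetical stand-ins for the sp_256_to_bin_bswap()/sp_256_to_bin_movbe()
 * assembly: both write the 4-word little-endian number out as 32 big-endian
 * bytes; the real routines differ only in using bswapq+movq versus movbeq. */
static void to_bin_bswap(const uint64_t r[4], unsigned char out[32])
{
    for (int i = 0; i < 4; i++) {
        uint64_t be = __builtin_bswap64(r[3 - i]);   /* most significant word first */
        memcpy(out + 8 * i, &be, sizeof(be));
    }
}

static void to_bin_movbe(const uint64_t r[4], unsigned char out[32])
{
    to_bin_bswap(r, out);   /* same result; the real routine uses movbeq */
}

/* Runtime dispatch mirroring the sp_256_to_bin() wrapper added by the patch. */
static void to_bin(const uint64_t r[4], unsigned char out[32])
{
    if (cpu_has_movbe())
        to_bin_movbe(r, out);
    else
        to_bin_bswap(r, out);
}

int main(void)
{
    uint64_t r[4] = { 0x1122334455667788ULL, 0, 0, 0 };  /* least significant word first */
    unsigned char out[32];

    to_bin(r, out);
    printf("movbe supported: %d, first output byte: %02x\n",
           cpu_has_movbe(), out[0]);   /* prints 00: big-endian output, high word first */
    return 0;
}

Compiled with, for example, `cc -O2 movbe_dispatch.c`, this prints whether the host CPU advertises MOVBE and the most significant output byte; wolfSSL itself caches the flag once in cpuid_set_flags() rather than re-issuing CPUID on every call.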