From 0cc21a42f38f9e03e98ce3bef84bbaea3d4c50bd Mon Sep 17 00:00:00 2001
From: Sean Parkinson <sean@wolfssl.com>
Date: Tue, 26 Sep 2023 09:34:25 +1000
Subject: [PATCH] SP updates for SM2

Allow wolfSSL to build with the SP implementations of SM2.
Also update the existing SP implementations of the other algorithms.
---
 configure.ac                      |   18 +-
 examples/server/server.c          |   12 +
 src/include.am                    |   29 +
 tests/api.c                       |    4 +-
 wolfcrypt/src/ecc.c               |  206 ++-
 wolfcrypt/src/eccsi.c             |   17 +-
 wolfcrypt/src/sm2.c               |   20 +
 wolfcrypt/src/sm3.c               |   20 +
 wolfcrypt/src/sm3_asm.S           |   20 +
 wolfcrypt/src/sm4.c               |   20 +
 wolfcrypt/src/sp_arm32.c          | 1086 ++++++++--------
 wolfcrypt/src/sp_arm64.c          |  298 ++---
 wolfcrypt/src/sp_armthumb.c       | 1980 ++++++++++++-----------------
 wolfcrypt/src/sp_c32.c            |  127 +-
 wolfcrypt/src/sp_c64.c            |  127 +-
 wolfcrypt/src/sp_cortexm.c        |  925 ++++++--------
 wolfcrypt/src/sp_int.c            |    4 +-
 wolfcrypt/src/sp_sm2_arm32.c      |   33 +
 wolfcrypt/src/sp_sm2_arm64.c      |   33 +
 wolfcrypt/src/sp_sm2_armthumb.c   |   33 +
 wolfcrypt/src/sp_sm2_c32.c        |   33 +
 wolfcrypt/src/sp_sm2_c64.c        |   33 +
 wolfcrypt/src/sp_sm2_cortexm.c    |   33 +
 wolfcrypt/src/sp_sm2_x86_64.c     |   33 +
 wolfcrypt/src/sp_sm2_x86_64_asm.S |   33 +
 wolfcrypt/src/sp_x86_64.c         |  189 +--
 wolfcrypt/src/sp_x86_64_asm.S     |  185 +--
 wolfcrypt/src/sp_x86_64_asm.asm   |  102 +-
 wolfcrypt/test/test.c             |    2 +-
 wolfssl/wolfcrypt/ecc.h           |    2 +-
 wolfssl/wolfcrypt/settings.h      |    3 +
 wolfssl/wolfcrypt/sm2.h           |   20 +
 wolfssl/wolfcrypt/sm3.h           |   20 +
 wolfssl/wolfcrypt/sm4.h           |   20 +
 wolfssl/wolfcrypt/sp.h            |   30 +
 35 files changed, 2768 insertions(+), 2982 deletions(-)
 create mode 100644 wolfcrypt/src/sp_sm2_arm32.c
 create mode 100644 wolfcrypt/src/sp_sm2_arm64.c
 create mode 100644 wolfcrypt/src/sp_sm2_armthumb.c
 create mode 100644 wolfcrypt/src/sp_sm2_c32.c
 create mode 100644 wolfcrypt/src/sp_sm2_c64.c
 create mode 100644 wolfcrypt/src/sp_sm2_cortexm.c
 create mode 100644 wolfcrypt/src/sp_sm2_x86_64.c
 create mode 100644 wolfcrypt/src/sp_sm2_x86_64_asm.S

diff --git a/configure.ac b/configure.ac
index 4aba828b3..ddd63d3e8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3515,10 +3515,6 @@ AC_ARG_ENABLE([sm2],
     [ ENABLED_SM2=no ]
     )
 
-if test "$ENABLED_SP_MATH" = "yes"
-then
-    ENABLED_SM2="no"
-fi
 if test "$ENABLED_SM2" = "yes"
 then
     if test "$ENABLED_ECC" = "no"
@@ -7115,6 +7111,7 @@ ENABLED_SP_ECC=no
 ENABLED_SP_EC_256=no
 ENABLED_SP_EC_384=no
 ENABLED_SP_EC_521=no
+ENABLED_SP_SM2=$ENABLED_SM2
 ENABLED_SP_SAKKE_1024=$ENABLED_SAKKE
 ENABLED_SP_NO_MALLOC=no
 ENABLED_SP_NONBLOCK=no
@@ -7206,6 +7203,15 @@ do
     ENABLED_SP_ECC=yes
     ENABLED_SP_SAKKE_1024=yes
     ;;
+  smallsm2)
+    ENABLED_SP_SMALL=yes
+    ENABLED_SP_ECC=yes
+    ENABLED_SP_SM2=yes
+    ;;
+  sm2)
+    ENABLED_SP_ECC=yes
+    ENABLED_SP_SM2=yes
+    ;;
 
   small2048)
     ENABLED_SP_SMALL=yes
@@ -7353,6 +7359,10 @@ if test "$ENABLED_ECC" != "no" && test "$ENABLED_SP_ECC" = "yes"; then
         AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_1024"
         AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_1024"
     fi
+    if test "$ENABLED_SP_SM2" = "yes"; then
+        AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SM2"
+        AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SM2"
+    fi
 fi
 if test "$ENABLED_SP_SMALL" = "yes"; then
     AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
diff --git a/examples/server/server.c b/examples/server/server.c
index e01cf3151..a06088dd8 100644
--- a/examples/server/server.c
+++ b/examples/server/server.c
@@ -752,6 +752,18 @@ static void SetKeyShare(WOLFSSL* ssl, int onlyKeyShare, int useX25519,
                 else
                     err_sys("unable to use curve secp256r1");
             } while (ret == WC_PENDING_E);
+        #elif defined(WOLFSSL_SM2)
+            do {
+                ret = wolfSSL_UseKeyShare(ssl, WOLFSSL_ECC_SM2P256V1);
+                if (ret == WOLFSSL_SUCCESS)
+                    groups[count++] = WOLFSSL_ECC_SM2P256V1;
+            #ifdef WOLFSSL_ASYNC_CRYPT
+                else if (ret == WC_PENDING_E)
+                    wolfSSL_AsyncPoll(ssl, WOLF_POLL_FLAG_CHECK_HW);
+            #endif
+                else
+                    err_sys("unable to use curve sm2p256v1");
+            } while (ret == WC_PENDING_E);
         #endif
     #endif
         }
diff --git a/src/include.am b/src/include.am
index c222d5c67..56e92a850 100644
--- a/src/include.am
+++ b/src/include.am
@@ -517,6 +517,35 @@ endif !BUILD_FIPS_CURRENT
 if !BUILD_FIPS_CURRENT
 if BUILD_SM2
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sm2.c
+if BUILD_SP
+if BUILD_SP_C32
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_c32.c
+endif
+if BUILD_SP_C64
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_c64.c
+endif
+
+if BUILD_SP_X86_64
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_x86_64.c
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_x86_64_asm.S
+endif
+if !BUILD_FIPS_V2
+if BUILD_SP_ARM32
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_arm32.c
+endif
+endif
+if BUILD_SP_ARM_THUMB
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_armthumb.c
+endif
+if !BUILD_FIPS_V2
+if BUILD_SP_ARM64
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_arm64.c
+endif
+endif
+if BUILD_SP_ARM_CORTEX
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sp_sm2_cortexm.c
+endif
+endif BUILD_SP
 endif BUILD_SM2
 endif !BUILD_FIPS_CURRENT
 
diff --git a/tests/api.c b/tests/api.c
index c6d520313..704d4bc31 100644
--- a/tests/api.c
+++ b/tests/api.c
@@ -23770,8 +23770,8 @@ static int test_wc_ecc_sm2_sign_hash_ex(void)
         mp_int smallR[1];
         sp_init_size(smallR, 1);
         /* Force failure in _ecc_sm2_calc_r_s by r being too small. */
-        ExpectIntEQ(wc_ecc_sm2_sign_hash_ex(hash, sizeof(hash), rng, key,
-            smallR, s), MP_VAL);
+        ExpectIntLT(wc_ecc_sm2_sign_hash_ex(hash, sizeof(hash), rng, key,
+            smallR, s), 0);
     }
 #endif
 
diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index eaff8ae8c..42685dd50 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -666,7 +666,8 @@ enum {
         #endif
         #define ecc_oid_brainpoolp256r1_sz CODED_BRAINPOOLP256R1_SZ
     #endif /* HAVE_ECC_BRAINPOOL */
-    #if defined(WOLFSSL_SM2) && !defined(WOLFSSL_SP_MATH)
+#endif /* ECC256 */
+    #if defined(WOLFSSL_SM2)
         #ifdef HAVE_OID_ENCODING
             #define CODED_SM2P256V1    {1,2,156,10197,1,301}
             #define CODED_SM2P256V1_SZ 6
@@ -680,8 +681,7 @@ enum {
             #define ecc_oid_sm2p256v1 CODED_SM2P256V1
         #endif
         #define ecc_oid_sm2p256v1_sz CODED_SM2P256V1_SZ
-    #endif /* WOLFSSL_SM2 && !WOLFSSL_SP_MATH */
-#endif /* ECC256 */
+    #endif /* WOLFSSL_SM2 */
 #ifdef ECC320
     #ifdef HAVE_ECC_BRAINPOOL
         #ifdef HAVE_OID_ENCODING
@@ -1161,7 +1161,8 @@ const ecc_set_type ecc_sets[] = {
         1,                                                                  /* cofactor   */
     },
     #endif /* HAVE_ECC_BRAINPOOL */
-    #if defined(WOLFSSL_SM2) && !defined(WOLFSSL_SP_MATH)
+#endif /* ECC256 */
+    #if defined(WOLFSSL_SM2)
     {
         32,                                                     /* size/bytes */
         ECC_SM2P256V1,                                          /* ID         */
@@ -1179,8 +1180,7 @@ const ecc_set_type ecc_sets[] = {
         ECC_SM2P256V1_OID,                                      /* oid sum    */
         1,                                                      /* cofactor   */
     },
-    #endif /* WOLFSSL_SM2 && !WOLFSSL_SP_MATH */
-#endif /* ECC256 */
+    #endif /* WOLFSSL_SM2 */
 #ifdef ECC320
     #ifdef HAVE_ECC_BRAINPOOL
     {
@@ -2157,16 +2157,14 @@ done:
     (void)a;
     (void)mp;
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+    if ((modBits == 256) && (!mp_is_bit_set(modulus, 224))) {
+       return sp_ecc_proj_add_point_sm2_256(P->x, P->y, P->z, Q->x, Q->y, Q->z,
+                                            R->x, R->y, R->z);
+    }
+#endif
 #ifndef WOLFSSL_SP_NO_256
     if (modBits == 256) {
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
-        if (!mp_is_bit_set(modulus, 224)) {
-           return sp_ecc_proj_add_point_sm2_256(P->x, P->y, P->z, Q->x, Q->y,
-                                                Q->z, R->x, R->y, R->z);
-        }
-    #endif
-#endif
         return sp_ecc_proj_add_point_256(P->x, P->y, P->z, Q->x, Q->y, Q->z,
                                          R->x, R->y, R->z);
     }
@@ -2529,16 +2527,13 @@ static int _ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* a,
     (void)a;
     (void)mp;
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+    if ((modBits == 256) && (!mp_is_bit_set(modulus, 224))) {
+       return sp_ecc_proj_dbl_point_sm2_256(P->x, P->y, P->z, R->x, R->y, R->z);
+    }
+#endif
 #ifndef WOLFSSL_SP_NO_256
     if (modBits == 256) {
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
-        if (!mp_is_bit_set(modulus, 224)) {
-           return sp_ecc_proj_dbl_point_sm2_256(P->x, P->y, P->z, R->x, R->y,
-                                                R->z);
-        }
-    #endif
-#endif
         return sp_ecc_proj_dbl_point_256(P->x, P->y, P->z, R->x, R->y, R->z);
     }
 #endif
@@ -2789,15 +2784,13 @@ done:
    (void)mp;
    (void)ct;
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+   if ((mp_count_bits(modulus) == 256) && (!mp_is_bit_set(modulus, 224))) {
+       return sp_ecc_map_sm2_256(P->x, P->y, P->z);
+   }
+#endif
 #ifndef WOLFSSL_SP_NO_256
    if (mp_count_bits(modulus) == 256) {
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
-        if (!mp_is_bit_set(modulus, 224)) {
-           return sp_ecc_map_sm2_256(P->x, P->y, P->z);
-        }
-    #endif
-#endif
        return sp_ecc_map_256(P->x, P->y, P->z);
    }
 #endif
@@ -3698,15 +3691,13 @@ exit:
    }
 
 #ifdef WOLFSSL_HAVE_SP_ECC
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+   if ((mp_count_bits(modulus) == 256) && (!mp_is_bit_set(modulus, 224))) {
+       return sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
+   }
+#endif
 #ifndef WOLFSSL_SP_NO_256
    if (mp_count_bits(modulus) == 256) {
-#ifdef SM2_SP_IMPL_AVAILABLE
-   #ifdef WOLFSSL_SM2
-       if (!mp_is_bit_set(modulus, 224)) {
-           return sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
-       }
-   #endif
-#endif
        return sp_ecc_mulmod_256(k, G, R, map, heap);
    }
 #endif
@@ -3905,6 +3896,11 @@ exit:
    (void)rng;
 
 #ifdef WOLFSSL_HAVE_SP_ECC
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+   if ((mp_count_bits(modulus) == 256) && (!mp_is_bit_set(modulus, 224))) {
+       return sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
+   }
+#endif
 #ifndef WOLFSSL_SP_NO_256
    if (mp_count_bits(modulus) == 256) {
        return sp_ecc_mulmod_256(k, G, R, map, heap);
@@ -4695,8 +4691,8 @@ int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point,
     #endif /* !WC_ECC_NONBLOCK */
     }
     else
-#ifdef SM2_SP_IMPL_AVAILABLE
-#ifdef WOLFSSL_SM2
+#endif /* ! WOLFSSL_SP_NO_256 */
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
     if (private_key->idx != ECC_CUSTOM_IDX &&
                                ecc_sets[private_key->idx].id == ECC_SM2P256V1) {
         err = sp_ecc_secret_gen_sm2_256(k, point, out, outlen,
@@ -4704,8 +4700,6 @@ int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point,
     }
     else
 #endif
-#endif
-#endif /* ! WOLFSSL_SP_NO_256 */
 #ifdef WOLFSSL_SP_384
     if (private_key->idx != ECC_CUSTOM_IDX &&
         ecc_sets[private_key->idx].id == ECC_SECP384R1) {
@@ -5289,15 +5283,13 @@ static int ecc_make_pub_ex(ecc_key* key, ecc_curve_spec* curve,
         err = sp_ecc_mulmod_base_256(key->k, pub, 1, key->heap);
     }
     else
-#ifdef SM2_SP_IMPL_AVAILABLE
-#ifdef WOLFSSL_SM2
+#endif /* WOLFSSL_SP_NO_256 */
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SM2P256V1) {
-        err = sp_ecc_mulmod_base_sm2_256(&key->k, pub, 1, key->heap);
+        err = sp_ecc_mulmod_base_sm2_256(key->k, pub, 1, key->heap);
     }
     else
 #endif
-#endif
-#endif /* WOLFSSL_SP_NO_256 */
 #ifdef WOLFSSL_SP_384
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SECP384R1) {
         err = sp_ecc_mulmod_base_384(key->k, pub, 1, key->heap);
@@ -5673,18 +5665,16 @@ static int _ecc_make_key_ex(WC_RNG* rng, int keysize, ecc_key* key,
         }
     }
     else
-#ifdef SM2_SP_IMPL_AVAILABLE
-#ifdef WOLFSSL_SM2
+#endif /* !WOLFSSL_SP_NO_256 */
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SM2P256V1) {
-        err = sp_ecc_make_key_sm2_256(rng, &key->k, &key->pubkey, key->heap);
+        err = sp_ecc_make_key_sm2_256(rng, key->k, &key->pubkey, key->heap);
         if (err == MP_OKAY) {
             key->type = ECC_PRIVATEKEY;
         }
     }
     else
 #endif
-#endif
-#endif /* !WOLFSSL_SP_NO_256 */
 #ifdef WOLFSSL_SP_384
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SECP384R1) {
     #ifndef WC_ECC_NONBLOCK
@@ -6892,14 +6882,16 @@ static int ecc_sign_hash_sp(const byte* in, word32 inlen, WC_RNG* rng,
             }
         #endif
         }
-#ifdef SM2_SP_IMPL_AVAILABLE
-        #ifdef WOLFSSL_SM2
+    #endif
+    #if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
         if (ecc_sets[key->idx].id == ECC_SM2P256V1) {
-            return sp_ecc_sign_sm2_256(in, inlen, rng, &key->k, r, s, sign_k,
+            int ret;
+            SAVE_VECTOR_REGISTERS(return _svr_ret;);
+            ret = sp_ecc_sign_sm2_256(in, inlen, rng, key->k, r, s, sign_k,
                 key->heap);
+            RESTORE_VECTOR_REGISTERS();
+            return ret;
         }
-        #endif
-#endif
     #endif
     #ifdef WOLFSSL_SP_384
         if (ecc_sets[key->idx].id == ECC_SECP384R1) {
@@ -7018,6 +7010,9 @@ int wc_ecc_sign_hash_ex(const byte* in, word32 inlen, WC_RNG* rng,
     #ifndef WOLFSSL_SP_NO_256
          && ecc_sets[key->idx].id != ECC_SECP256R1
     #endif
+    #ifdef WOLFSSL_SP_SM2
+         && ecc_sets[key->idx].id != ECC_SM2P256V1
+    #endif
     #ifdef WOLFSSL_SP_384
          && ecc_sets[key->idx].id != ECC_SECP384R1
     #endif
@@ -8422,6 +8417,9 @@ static int ecc_verify_hash_sp(mp_int *r, mp_int *s, const byte* hash,
     #ifndef WOLFSSL_SP_NO_256
          && ecc_sets[key->idx].id != ECC_SECP256R1
     #endif
+    #ifdef WOLFSSL_SP_SM2
+         && ecc_sets[key->idx].id != ECC_SM2P256V1
+    #endif
     #ifdef WOLFSSL_SP_384
          && ecc_sets[key->idx].id != ECC_SECP384R1
     #endif
@@ -8470,8 +8468,8 @@ static int ecc_verify_hash_sp(mp_int *r, mp_int *s, const byte* hash,
             }
         #endif
         }
-#ifdef SM2_SP_IMPL_AVAILABLE
-        #ifdef WOLFSSL_SM2
+    #endif
+    #if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
         if (ecc_sets[key->idx].id == ECC_SM2P256V1) {
             #if defined(FP_ECC_CONTROL) && !defined(WOLFSSL_DSP_BUILD)
             return sp_ecc_cache_verify_sm2_256(hash, hashlen, key->pubkey.x,
@@ -8485,9 +8483,7 @@ static int ecc_verify_hash_sp(mp_int *r, mp_int *s, const byte* hash,
                 key->pubkey.y, key->pubkey.z, r, s, res, key->heap);
             #endif
         }
-        #endif
     #endif
-#endif
     #ifdef WOLFSSL_SP_384
         if (ecc_sets[key->idx].id == ECC_SECP384R1) {
         #ifdef WC_ECC_NONBLOCK
@@ -9108,16 +9104,14 @@ int wc_ecc_import_point_der_ex(const byte* in, word32 inLen,
             err = sp_ecc_uncompress_256(point->x, pointType, point->y);
         }
         else
-#ifdef SM2_SP_IMPL_AVAILABLE
-        #ifdef WOLFSSL_SM2
+        #endif
+        #if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
         if (curve_idx != ECC_CUSTOM_IDX &&
-                                 ecc_sets[curve_idx->idx].id == ECC_SM2P256V1) {
+                                      ecc_sets[curve_idx].id == ECC_SM2P256V1) {
             sp_ecc_uncompress_sm2_256(point->x, pointType, point->y);
         }
         else
         #endif
-#endif
-        #endif
         #ifdef WOLFSSL_SP_384
         if (curve_idx != ECC_CUSTOM_IDX &&
                                       ecc_sets[curve_idx].id == ECC_SECP384R1) {
@@ -9663,15 +9657,13 @@ static int _ecc_is_point(ecc_point* ecp, mp_int* a, mp_int* b, mp_int* prime)
    (void)b;
 
 #ifdef WOLFSSL_HAVE_SP_ECC
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+   if ((mp_count_bits(prime) == 256) && (!mp_is_bit_set(prime, 224))) {
+       return sp_ecc_is_point_sm2_256(ecp->x, ecp->y);
+   }
+#endif
 #ifndef WOLFSSL_SP_NO_256
    if (mp_count_bits(prime) == 256) {
-#ifdef SM2_SP_IMPL_AVAILABLE
-   #ifdef WOLFSSL_SM2
-       if (!mp_is_bit_set(prime, 224)) {
-           return sp_ecc_is_point_sm2_256(ecp->x, ecp->y);
-       }
-   #endif
-#endif
        return sp_ecc_is_point_256(ecp->x, ecp->y);
    }
 #endif
@@ -9764,16 +9756,14 @@ static int ecc_check_privkey_gen(ecc_key* key, mp_int* a, mp_int* prime)
         }
     }
     else
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
+#endif
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SM2P256V1) {
         if (err == MP_OKAY) {
-            err = sp_ecc_mulmod_base_sm2_256(&key->k, res, 1, key->heap);
+            err = sp_ecc_mulmod_base_sm2_256(key->k, res, 1, key->heap);
         }
     }
     else
-    #endif
-#endif
 #endif
 #ifdef WOLFSSL_SP_384
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SECP384R1) {
@@ -10007,15 +9997,13 @@ static int ecc_check_pubkey_order(ecc_key* key, ecc_point* pubkey, mp_int* a,
             err = sp_ecc_mulmod_256(order, pubkey, inf, 1, key->heap);
         }
         else
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
+#endif
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
         if (key->idx != ECC_CUSTOM_IDX &&
                                        ecc_sets[key->idx].id == ECC_SM2P256V1) {
             err = sp_ecc_mulmod_sm2_256(order, pubkey, inf, 1, key->heap);
         }
         else
-    #endif
-#endif
 #endif
 #ifdef WOLFSSL_SP_384
         if (key->idx != ECC_CUSTOM_IDX &&
@@ -10121,15 +10109,13 @@ static int _ecc_validate_public_key(ecc_key* key, int partial, int priv)
         return sp_ecc_check_key_256(key->pubkey.x, key->pubkey.y,
             key->type == ECC_PRIVATEKEY ? key->k : NULL, key->heap);
     }
-#ifdef SM2_SP_IMPL_AVAILABLE
-#ifdef WOLFSSL_SM2
+#endif
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SM2P256V1) {
-        return sp_ecc_check_key_sm2_256(key->pubkey.x, key->pubkey.y
-            key->type == ECC_PRIVATEKEY ? &key->k : NULL, key->heap);
+        return sp_ecc_check_key_sm2_256(key->pubkey.x, key->pubkey.y,
+            key->type == ECC_PRIVATEKEY ? key->k : NULL, key->heap);
     }
 #endif
-#endif
-#endif
 #ifdef WOLFSSL_SP_384
     if (key->idx != ECC_CUSTOM_IDX && ecc_sets[key->idx].id == ECC_SECP384R1) {
         return sp_ecc_check_key_384(key->pubkey.x, key->pubkey.y,
@@ -10506,14 +10492,12 @@ int wc_ecc_import_x963_ex(const byte* in, word32 inLen, ecc_key* key,
                 key->pubkey.y);
         }
         else
-#ifdef SM2_SP_IMPL_AVAILABLE
-        #ifdef WOLFSSL_SM2
+    #endif
+    #if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
         if (key->dp->id == ECC_SM2P256V1) {
             sp_ecc_uncompress_sm2_256(key->pubkey.x, pointType, key->pubkey.y);
         }
         else
-        #endif
-#endif
     #endif
     #ifdef WOLFSSL_SP_384
         if (key->dp->id == ECC_SECP384R1) {
@@ -13059,21 +13043,20 @@ int wc_ecc_mulmod_ex(const mp_int* k, ecc_point *G, ecc_point *R, mp_int* a,
         return IS_POINT_E;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+    if ((mp_count_bits(modulus) == 256) && (!mp_is_bit_set(modulus, 224))) {
+        int ret;
+        SAVE_VECTOR_REGISTERS(return _svr_ret);
+        ret = sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
+        RESTORE_VECTOR_REGISTERS();
+        return ret;
+    }
+#endif
 #ifndef WOLFSSL_SP_NO_256
     if (mp_count_bits(modulus) == 256) {
         int ret;
         SAVE_VECTOR_REGISTERS(return _svr_ret);
-#ifdef SM2_SP_IMPL_AVAILABLE
-     #ifdef WOLFSSL_SM2
-        if (!mp_is_bit_set(modulus, 224)) {
-            ret = sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
-        }
-        else
-     #endif
-#endif
-        {
-            ret = sp_ecc_mulmod_256(k, G, R, map, heap);
-        }
+        ret = sp_ecc_mulmod_256(k, G, R, map, heap);
         RESTORE_VECTOR_REGISTERS();
         return ret;
     }
@@ -13238,21 +13221,20 @@ int wc_ecc_mulmod_ex2(const mp_int* k, ecc_point *G, ecc_point *R, mp_int* a,
         return IS_POINT_E;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SP_SM2)
+    if ((mp_count_bits(modulus) == 256) && (!mp_is_bit_set(modulus, 224))) {
+        int ret;
+        SAVE_VECTOR_REGISTERS(return _svr_ret);
+        ret = sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
+        RESTORE_VECTOR_REGISTERS();
+        return ret;
+    }
+#endif
 #ifndef WOLFSSL_SP_NO_256
     if (mp_count_bits(modulus) == 256) {
         int ret;
         SAVE_VECTOR_REGISTERS(return _svr_ret);
-#ifdef SM2_SP_IMPL_AVAILABLE
-    #ifdef WOLFSSL_SM2
-        if (!mp_is_bit_set(modulus, 224)) {
-            ret = sp_ecc_mulmod_sm2_256(k, G, R, map, heap);
-        }
-        else
-    #endif
-#endif
-        {
-            ret = sp_ecc_mulmod_256(k, G, R, map, heap);
-        }
+        ret = sp_ecc_mulmod_256(k, G, R, map, heap);
         RESTORE_VECTOR_REGISTERS();
         return ret;
     }
diff --git a/wolfcrypt/src/eccsi.c b/wolfcrypt/src/eccsi.c
index ed2e2b8bd..1725aba4b 100644
--- a/wolfcrypt/src/eccsi.c
+++ b/wolfcrypt/src/eccsi.c
@@ -1350,15 +1350,13 @@ static int eccsi_mulmod_base_add(EccsiKey* key, const mp_int* n,
 {
     int err = 0;
 
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
             (ecc_sets[key->ecc.idx].id == ECC_SECP256R1)) {
         err = sp_ecc_mulmod_base_add_256(n, a, 1, res, map, key->heap);
     }
     else
 #endif
-#endif
 #ifndef WOLFSSL_SP_MATH
     {
         EccsiKeyParams* params = &key->params;
@@ -1377,7 +1375,12 @@ static int eccsi_mulmod_base_add(EccsiKey* key, const mp_int* n,
     {
         err = NOT_COMPILED_IN;
     }
+    (void)key;
+    (void)h;
+    (void)a;
+    (void)res;
     (void)mp;
+    (void)map;
 #endif
 
     return err;
@@ -1401,14 +1404,12 @@ static int eccsi_mulmod_point(EccsiKey* key, const mp_int* n, ecc_point* point,
 {
     int err;
 
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
             (ecc_sets[key->ecc.idx].id == ECC_SECP256R1)) {
         err = sp_ecc_mulmod_256(n, point, res, map, key->heap);
     }
     else
-#endif
 #endif
     {
         EccsiKeyParams* params = &key->params;
@@ -1437,8 +1438,7 @@ static int eccsi_mulmod_point(EccsiKey* key, const mp_int* n, ecc_point* point,
 static int eccsi_mulmod_point_add(EccsiKey* key, const mp_int* n,
         ecc_point* point, ecc_point* a, ecc_point* res, mp_digit mp, int map)
 {
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     int err = NOT_COMPILED_IN;
 
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
@@ -1449,7 +1449,6 @@ static int eccsi_mulmod_point_add(EccsiKey* key, const mp_int* n,
     (void)mp;
 
     return err;
-#endif
 #else
     int err;
     EccsiKeyParams* params = &key->params;
diff --git a/wolfcrypt/src/sm2.c b/wolfcrypt/src/sm2.c
index ffdb8de7e..829d5e5b2 100644
--- a/wolfcrypt/src/sm2.c
+++ b/wolfcrypt/src/sm2.c
@@ -1,3 +1,23 @@
+/* sm2.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
 
 #ifdef HAVE_CONFIG_H
     #include <config.h>
diff --git a/wolfcrypt/src/sm3.c b/wolfcrypt/src/sm3.c
index ab6324225..1339037b7 100644
--- a/wolfcrypt/src/sm3.c
+++ b/wolfcrypt/src/sm3.c
@@ -1,3 +1,23 @@
+/* sm3.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
 
 #ifdef HAVE_CONFIG_H
     #include <config.h>
diff --git a/wolfcrypt/src/sm3_asm.S b/wolfcrypt/src/sm3_asm.S
index ab6324225..2c368f1ff 100644
--- a/wolfcrypt/src/sm3_asm.S
+++ b/wolfcrypt/src/sm3_asm.S
@@ -1,3 +1,23 @@
+/* sm3_asm.S
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
 
 #ifdef HAVE_CONFIG_H
     #include <config.h>
diff --git a/wolfcrypt/src/sm4.c b/wolfcrypt/src/sm4.c
index d3da2b9df..1e4f31760 100644
--- a/wolfcrypt/src/sm4.c
+++ b/wolfcrypt/src/sm4.c
@@ -1,3 +1,23 @@
+/* sm4.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
 
 #ifdef HAVE_CONFIG_H
     #include <config.h>
diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index 876fa887a..a1ae275de 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -11581,8 +11581,8 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -11622,7 +11622,8 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_2048_div_32(a, m, NULL, r);
 }
@@ -15493,8 +15494,8 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -15550,7 +15551,8 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_2048_div_64_cond(a, m, NULL, r);
 }
@@ -16346,8 +16348,8 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -16387,7 +16389,8 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_2048_div_64(a, m, NULL, r);
 }
@@ -36703,8 +36706,8 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[96], t2[49];
     sp_digit div, r1;
@@ -36744,7 +36747,8 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_3072_div_48(a, m, NULL, r);
 }
@@ -42127,8 +42131,8 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -42184,7 +42188,8 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_3072_div_96_cond(a, m, NULL, r);
 }
@@ -43338,8 +43343,8 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -43379,7 +43384,8 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_3072_div_96(a, m, NULL, r);
 }
@@ -57066,8 +57072,8 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -57123,7 +57129,8 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_4096_div_128_cond(a, m, NULL, r);
 }
@@ -58629,8 +58636,8 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -58670,7 +58677,8 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_4096_div_128(a, m, NULL, r);
 }
@@ -65169,80 +65177,6 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "mov	r12, #0\n\t"
-        "add	lr, %[a], #32\n\t"
-        "\n"
-    "L_sp_256_sub_8_word_%=: \n\t"
-        "rsbs	r12, r12, #0\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	r12, r3, r3\n\t"
-        "cmp	%[a], lr\n\t"
-        "bne	L_sp_256_sub_8_word_%=\n\t"
-        "mov	%[r], r12\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "subs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -67665,7 +67599,6 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "str	%[r], [sp, #64]\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -67707,7 +67640,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -67783,7 +67716,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #64]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -68143,7 +68076,6 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "stm	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -68185,7 +68117,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -68261,7 +68193,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #64]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -68399,7 +68331,6 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "stm	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -68441,7 +68372,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -68517,7 +68448,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #68]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -69598,7 +69529,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "str	%[r], [sp, #64]\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -69640,7 +69570,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -69716,7 +69646,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #64]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -69965,7 +69895,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "stm	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -70007,7 +69936,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -70083,7 +70012,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #64]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -70206,7 +70135,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "stm	lr!, {r7}\n\t"
         /* Start Reduction */
         "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
-        "str	%[r], [sp]\n\t"
         "mov	r3, r11\n\t"
         "mov	r4, r12\n\t"
         /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
@@ -70248,7 +70176,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "adcs	r4, r4, r6\n\t"
         "adc	lr, lr, #0\n\t"
         "str	r4, [sp, #28]\n\t"
-        /* a[8]  +=  t[0] + t[2] + t[5] */
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
         /* a[9]  +=  t[1] + t[3] + t[6] */
         /* a[10] +=  t[2] + t[4] + t[7] */
         "add	r0, sp, #32\n\t"
@@ -70324,7 +70252,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "sbcs	r4, r4, #0\n\t"
         "sbcs	r5, r5, #0\n\t"
         "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, lr, LSR #31\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
         "sbc	r8, r8, lr\n\t"
         "ldr	%[r], [sp, #64]\n\t"
         "stm	%[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
@@ -71157,95 +71085,140 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
 
     __asm__ __volatile__ (
-        "mov	r1, #0\n\t"
-        /* i = 0 */
-        "mov	r8, #0\n\t"
-        "\n"
-    "L_sp_256_mont_reduce_8_word_%=: \n\t"
-        "mov	r4, #0\n\t"
-        /* mu = a[i] * 1 (mp) = a[i] */
-        "ldr	r2, [%[a]]\n\t"
-        /* a[i+0] += -1 * mu */
-        "mov	r5, r2\n\t"
-        "str	r4, [%[a]]\n\t"
-        /* a[i+1] += -1 * mu */
-        "ldr	r6, [%[a], #4]\n\t"
-        "mov	r4, r2\n\t"
-        "subs	r5, r5, r2\n\t"
-        "sbc	r4, r4, #0\n\t"
-        "adds	r5, r5, r6\n\t"
-        "adc	r4, r4, #0\n\t"
-        "str	r5, [%[a], #4]\n\t"
-        /* a[i+2] += -1 * mu */
-        "ldr	r6, [%[a], #8]\n\t"
-        "mov	r5, r2\n\t"
-        "subs	r4, r4, r2\n\t"
-        "sbc	r5, r5, #0\n\t"
-        "adds	r4, r4, r6\n\t"
-        "adc	r5, r5, #0\n\t"
-        "str	r4, [%[a], #8]\n\t"
-        /* a[i+3] += 0 * mu */
-        "ldr	r6, [%[a], #12]\n\t"
-        "mov	r4, #0\n\t"
-        "adds	r5, r5, r6\n\t"
-        "adc	r4, r4, #0\n\t"
-        "str	r5, [%[a], #12]\n\t"
-        /* a[i+4] += 0 * mu */
-        "ldr	r6, [%[a], #16]\n\t"
-        "mov	r5, #0\n\t"
-        "adds	r4, r4, r6\n\t"
-        "adc	r5, r5, #0\n\t"
-        "str	r4, [%[a], #16]\n\t"
-        /* a[i+5] += 0 * mu */
-        "ldr	r6, [%[a], #20]\n\t"
-        "mov	r4, #0\n\t"
-        "adds	r5, r5, r6\n\t"
-        "adc	r4, r4, #0\n\t"
-        "str	r5, [%[a], #20]\n\t"
-        /* a[i+6] += 1 * mu */
-        "ldr	r6, [%[a], #24]\n\t"
-        "mov	r5, #0\n\t"
-        "adds	r4, r4, r2\n\t"
-        "adc	r5, r5, #0\n\t"
-        "adds	r4, r4, r6\n\t"
-        "adc	r5, r5, #0\n\t"
-        "str	r4, [%[a], #24]\n\t"
-        /* a[i+7] += -1 * mu */
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[a], #32]\n\t"
-        "adds	r4, r1, r2\n\t"
-        "mov	r1, #0\n\t"
-        "adc	r1, r1, r1\n\t"
-        "subs	r5, r5, r2\n\t"
-        "sbcs	r4, r4, #0\n\t"
-        "sbc	r1, r1, #0\n\t"
-        "adds	r5, r5, r6\n\t"
+        "sub	sp, sp, #0x44\n\t"
+        "str	%[a], [sp, #64]\n\t"
+        "mov	lr, sp\n\t"
+        "ldm	%[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+        "stm	lr!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+        "ldm	%[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+        "stm	lr, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+        /* Start Reduction */
+        "ldm	sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
+        "mov	r3, r11\n\t"
+        "mov	r4, r12\n\t"
+        /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */
+        /*    - a[0] << 224 */
+        /*   + (a[0]-a[1] * 2) << (6 * 32) */
+        "adds	r11, r11, r5\n\t"
+        "adc	r12, r12, r6\n\t"
+        "adds	r11, r11, r5\n\t"
+        "adc	r12, r12, r6\n\t"
+        /*   - a[0] << (7 * 32) */
+        "sub	r12, r12, r5\n\t"
+        /*   + a[0]-a[4] << (3 * 32) */
+        "mov	r0, r8\n\t"
+        "mov	r1, r9\n\t"
+        "mov	r2, r10\n\t"
+        "adds	r8, r8, r5\n\t"
+        "adcs	r9, r9, r6\n\t"
+        "adcs	r10, r10, r7\n\t"
+        "adcs	r11, r11, r0\n\t"
+        "adc	r12, r12, r1\n\t"
+        /* a += mu * m */
+        /*   += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */
+        /* a[0]   =                     = t[0] */
+        /* a[1]   =                     = t[1] */
+        /* a[2]   =                     = t[2] */
+        /* a[3]  +=                t[0] = t[3] */
+        /* a[4]  +=                t[1] = t[4] */
+        /* a[5]  +=                t[2] = t[5] */
+        /* a[6]  +=         t[0] + t[3] = t[6] */
+        /* a[7]  +=         t[1] + t[4] = t[7] + t[0] */
+        "adds	r0, r0, r5\n\t"
+        "adcs	r1, r1, r6\n\t"
+        "adcs	r2, r2, r7\n\t"
+        "adcs	r3, r3, r8\n\t"
+        "adcs	r4, r4, r9\n\t"
+        "mov	lr, #0\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r3, r3, r5\n\t"
+        "adcs	r4, r4, r6\n\t"
+        "adc	lr, lr, #0\n\t"
+        "str	r4, [sp, #28]\n\t"
+        /* a[8]  +=  t[0] + t[2] + t[5] + carry */
+        /* a[9]  +=  t[1] + t[3] + t[6] */
+        /* a[10] +=  t[2] + t[4] + t[7] */
+        "add	r0, sp, #32\n\t"
+        "ldm	r0, {r2, r3, r4}\n\t"
+        "adds	r2, r2, lr\n\t"
+        "adcs	r3, r3, #0\n\t"
+        "adcs	r4, r4, #0\n\t"
+        "mov	lr, #0\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r2, r2, r5\n\t"
+        "adcs	r3, r3, r6\n\t"
         "adcs	r4, r4, r7\n\t"
-        "adc	r1, r1, #0\n\t"
-        "str	r5, [%[a], #28]\n\t"
-        "str	r4, [%[a], #32]\n\t"
-        /* i += 1 */
-        "add	r8, r8, #4\n\t"
-        "add	%[a], %[a], #4\n\t"
-        "cmp	r8, #32\n\t"
-        "blt	L_sp_256_mont_reduce_8_word_%=\n\t"
-        "mov	r2, r1\n\t"
-        "sub	r1, r1, #1\n\t"
-        "mvn	r1, r1\n\t"
-        "ldm	%[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
-        "sub	%[a], %[a], #32\n\t"
-        "subs	r4, r4, r1\n\t"
-        "sbcs	r5, r5, r1\n\t"
-        "sbcs	r6, r6, r1\n\t"
-        "sbcs	r7, r7, #0\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r2, r2, r7\n\t"
+        "adcs	r3, r3, r8\n\t"
+        "adcs	r4, r4, r9\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r2, r2, r10\n\t"
+        "adcs	r3, r3, r11\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	lr, lr, #0\n\t"
+        "stm	r0!, {r2, r3, r4}\n\t"
+        /* a[11] +=  t[3] + t[5] + carry */
+        /* a[12] +=  t[4] + t[6] */
+        /* a[13] +=  t[5] + t[7] */
+        /* a[14] +=  t[6] */
+        /* a[15] +=  t[7] */
+        "ldm	r0, {r0, r1, r2, r3, r4}\n\t"
+        "adds	r0, r0, lr\n\t"
+        "adcs	r1, r1, #0\n\t"
+        "adcs	r2, r2, #0\n\t"
+        "adcs	r3, r3, #0\n\t"
+        "adcs	r4, r4, #0\n\t"
+        "mov	lr, #0\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r0, r0, r8\n\t"
+        "adcs	r1, r1, r9\n\t"
+        "adcs	r2, r2, r10\n\t"
+        "adcs	r3, r3, r11\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	lr, lr, #0\n\t"
+        "adds	r0, r0, r10\n\t"
+        "adcs	r1, r1, r11\n\t"
+        "adcs	r2, r2, r12\n\t"
+        "adcs	r3, r3, #0\n\t"
+        "adcs	r4, r4, #0\n\t"
+        "adc	lr, lr, #0\n\t"
+        "str	r0, [sp, #44]\n\t"
+        "str	r1, [sp, #48]\n\t"
+        "str	r2, [sp, #52]\n\t"
+        "str	r3, [sp, #56]\n\t"
+        /* a[7..15] - t[0..7] */
+        "add	r0, sp, #28\n\t"
+        "ldm	r0, {r0, r1, r2, r3}\n\t"
+        "subs	r0, r0, r5\n\t"
+        "sbcs	r1, r1, r6\n\t"
+        "sbcs	r2, r2, r7\n\t"
+        "sbcs	r3, r3, r8\n\t"
+        "add	r0, sp, #44\n\t"
+        "mov	r8, r4\n\t"
+        "ldm	r0, {r4, r5, r6, r7}\n\t"
+        "sbcs	r4, r4, r9\n\t"
+        "sbcs	r5, r5, r10\n\t"
+        "sbcs	r6, r6, r11\n\t"
+        "sbcs	r7, r7, r12\n\t"
         "sbcs	r8, r8, #0\n\t"
-        "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r2\n\t"
-        "sbc	r11, r11, r1\n\t"
-        "stm	%[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
+        "sbc	lr, lr, #0\n\t"
+        /* mask m and sub from result if overflow */
+        "rsb	lr, lr, #0\n\t"
+        "subs	r1, r1, lr\n\t"
+        "sbcs	r2, r2, lr\n\t"
+        "sbcs	r3, r3, lr\n\t"
+        "sbcs	r4, r4, #0\n\t"
+        "sbcs	r5, r5, #0\n\t"
+        "sbcs	r6, r6, #0\n\t"
+        "sbcs	r7, r7, lr, lsr #31\n\t"
+        "sbc	r8, r8, lr\n\t"
+        "ldr	%[a], [sp, #64]\n\t"
+        "stm	%[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
+        "add	sp, sp, #0x44\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+        : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -71801,7 +71774,7 @@ static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
         "sbcs	r10, r10, #0\n\t"
-        "sbcs	r11, r11, lr, LSR #31\n\t"
+        "sbcs	r11, r11, lr, lsr #31\n\t"
         "sbcs	r12, r12, lr\n\t"
         "sbc	%[b], %[b], %[b]\n\t"
         "sub	lr, lr, %[b]\n\t"
@@ -71811,7 +71784,7 @@ static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
         "sbcs	r10, r10, #0\n\t"
-        "sbcs	r11, r11, lr, LSR #31\n\t"
+        "sbcs	r11, r11, lr, lsr #31\n\t"
         "sbc	r12, r12, lr\n\t"
         "stm	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
@@ -71851,7 +71824,7 @@ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r2, LSR #31\n\t"
+        "sbcs	r10, r10, r2, lsr #31\n\t"
         "sbcs	r11, r11, r2\n\t"
         "sbc	%[a], %[a], %[a]\n\t"
         "sub	r2, r2, %[a]\n\t"
@@ -71861,7 +71834,7 @@ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r2, LSR #31\n\t"
+        "sbcs	r10, r10, r2, lsr #31\n\t"
         "sbc	r11, r11, r2\n\t"
         "stm	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
@@ -71901,7 +71874,7 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r12, LSR #31\n\t"
+        "sbcs	r10, r10, r12, lsr #31\n\t"
         "sbcs	r11, r11, r12\n\t"
         "sbc	r2, r2, r2\n\t"
         "sub	r12, r12, r2\n\t"
@@ -71911,7 +71884,7 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r12, LSR #31\n\t"
+        "sbcs	r10, r10, r12, lsr #31\n\t"
         "sbc	r11, r11, r12\n\t"
         "ldm	%[a]!, {r2, r3}\n\t"
         "adds	r4, r4, r2\n\t"
@@ -71933,7 +71906,7 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r12, LSR #31\n\t"
+        "sbcs	r10, r10, r12, lsr #31\n\t"
         "sbcs	r11, r11, r12\n\t"
         "sbc	r2, r2, r2\n\t"
         "sub	r12, r12, r2\n\t"
@@ -71943,7 +71916,7 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs	r7, r7, #0\n\t"
         "sbcs	r8, r8, #0\n\t"
         "sbcs	r9, r9, #0\n\t"
-        "sbcs	r10, r10, r12, LSR #31\n\t"
+        "sbcs	r10, r10, r12, lsr #31\n\t"
         "sbc	r11, r11, r12\n\t"
         "stm	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
@@ -71988,7 +71961,7 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adcs	r8, r8, #0\n\t"
         "adcs	r9, r9, #0\n\t"
         "adcs	r10, r10, #0\n\t"
-        "adcs	r11, r11, lr, LSR #31\n\t"
+        "adcs	r11, r11, lr, lsr #31\n\t"
         "adcs	r12, r12, lr\n\t"
         "adc	lr, lr, #0\n\t"
         "adds	r5, r5, lr\n\t"
@@ -71997,7 +71970,7 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adcs	r8, r8, #0\n\t"
         "adcs	r9, r9, #0\n\t"
         "adcs	r10, r10, #0\n\t"
-        "adcs	r11, r11, lr, LSR #31\n\t"
+        "adcs	r11, r11, lr, lsr #31\n\t"
         "adc	r12, r12, lr\n\t"
         "stm	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
@@ -72013,7 +71986,7 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
+static void sp_256_mont_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -72023,7 +71996,6 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
         "ldm	%[a], {r4, r5, r6, r7}\n\t"
         "and	r3, r4, #1\n\t"
         "rsb	r8, r3, #0\n\t"
-        "and	r9, r8, #1\n\t"
         "adds	r4, r4, r8\n\t"
         "adcs	r5, r5, r8\n\t"
         "adcs	r6, r6, r8\n\t"
@@ -72043,7 +72015,7 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
 #endif
         "adcs	r4, r4, #0\n\t"
         "adcs	r5, r5, #0\n\t"
-        "adcs	r6, r6, r9\n\t"
+        "adcs	r6, r6, r8, lsr #31\n\t"
         "adcs	r7, r7, r8\n\t"
         "mov	r3, #0\n\t"
         "adc	r3, r3, #0\n\t"
@@ -72128,7 +72100,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
+    sp_256_mont_div2_8(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -72161,7 +72133,8 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_8_ctx* ctx = (sp_256_proj_point_dbl_8_ctx*)sp_ctx->data;
@@ -72235,7 +72208,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_8(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_8(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -72945,7 +72918,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sub_8(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_8(y, y, p256_mod);
+    sp_256_mont_div2_8(y, y, p256_mod);
 }
 
 /* Convert the projective point to affine.
@@ -73423,8 +73396,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -73843,8 +73816,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -76658,8 +76631,8 @@ static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[16], t2[9];
     sp_digit div, r1;
@@ -76699,7 +76672,8 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_256_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_256_mod_8(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_256_div_8(a, m, NULL, r);
 }
@@ -77268,6 +77242,80 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
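+/* Sub b from a into r. (r = a - b)
+ * r  Result of the subtraction (8 words).
+ * a  Number to subtract from (8 words).
+ * b  Number to subtract (8 words).
+ * returns 0 when there is no borrow, otherwise a word with all bits set.
+ * NOTE(review): sits inside #ifndef WOLFSSL_SP_SMALL - looks unreachable. */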
+static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "mov	r12, #0\n\t"
+        "add	lr, %[a], #32\n\t"
+        "\n"
+    "L_sp_256_sub_8_word_%=: \n\t"
+        "rsbs	r12, r12, #0\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "sbc	r12, r3, r3\n\t"
+        "cmp	%[a], lr\n\t"
+        "bne	L_sp_256_sub_8_word_%=\n\t"
+        "mov	%[r], r12\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r. (r = a - b)
+ *
+ * r  Result of the subtraction (8 words).
+ * a  Number to subtract from (8 words).
+ * b  Number to subtract (8 words).
+ * returns 0 when there is no borrow, otherwise a word with all bits set. */
+static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "subs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "sbc	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
@@ -77923,7 +77971,7 @@ static int sp_256_mod_inv_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_256_cmp_8(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_256_cmp_8(u, v) >= 0))) {
             sp_256_sub_8(u, u, v);
             o = sp_256_sub_8(b, b, d);
             if (o != 0)
@@ -78352,19 +78400,21 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 8;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_8(t1, point->y);
         (void)sp_256_mod_8(t1, t1, p256_mod);
         sp_256_sqr_8(t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
         sp_256_mul_8(t2, t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
-        (void)sp_256_sub_8(t2, p256_mod, t2);
-        sp_256_mont_add_8(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_8(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_8(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -87839,87 +87889,6 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "mov	r12, #0\n\t"
-        "add	lr, %[a], #48\n\t"
-        "\n"
-    "L_sp_384_sub_12_word_%=: \n\t"
-        "rsbs	r12, r12, #0\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	r12, r3, r3\n\t"
-        "cmp	%[a], lr\n\t"
-        "bne	L_sp_384_sub_12_word_%=\n\t"
-        "mov	%[r], r12\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "subs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -89394,6 +89363,87 @@ static void sp_384_mont_tpl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi
     sp_384_cond_sub_12(r, r, m, 0 - o);
 }
 
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r (r = a - b). Returns 0, or all bits set on borrow.
+ *
+ * r  Result of the subtraction.
+ * a  Number to subtract from.
+ * b  Number to subtract.
+ */
+static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "mov	r12, #0\n\t"
+        "add	lr, %[a], #48\n\t"
+        "\n"
+    "L_sp_384_sub_12_word_%=: \n\t"
+        "rsbs	r12, r12, #0\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "sbc	r12, r3, r3\n\t"
+        "cmp	%[a], lr\n\t"
+        "bne	L_sp_384_sub_12_word_%=\n\t"
+        "mov	%[r], r12\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r (r = a - b). Returns 0, or all bits set on borrow.
+ *
+ * r  Result of the subtraction.
+ * a  Number to subtract from.
+ * b  Number to subtract.
+ */
+static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "subs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "sbc	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 #ifdef WOLFSSL_SP_SMALL
 /* Conditionally add a and b using the mask m.
  * m is -1 to add and 0 when not.
@@ -89590,7 +89640,7 @@ static void sp_384_rshift1_12(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_384_mont_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -89643,7 +89693,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
+    sp_384_mont_div2_12(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -89676,7 +89726,8 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_12_ctx* ctx = (sp_384_proj_point_dbl_12_ctx*)sp_ctx->data;
@@ -89750,7 +89801,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_12(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_12(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -90486,7 +90537,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sub_12(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_12(y, y, p384_mod);
+    sp_384_mont_div2_12(y, y, p384_mod);
 }
 
 /* Convert the projective point to affine.
@@ -90980,8 +91031,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -91416,8 +91467,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -94376,8 +94427,8 @@ static void sp_384_mask_12(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[24], t2[13];
     sp_digit div, r1;
@@ -94417,7 +94468,8 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_384_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_384_mod_12(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_384_div_12(a, m, NULL, r);
 }
@@ -95865,7 +95917,7 @@ static int sp_384_mod_inv_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_384_cmp_12(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_384_cmp_12(u, v) >= 0))) {
             sp_384_sub_12(u, u, v);
             o = sp_384_sub_12(b, b, d);
             if (o != 0)
@@ -96298,19 +96350,21 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 12;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_12(t1, point->y);
         (void)sp_384_mod_12(t1, t1, p384_mod);
         sp_384_sqr_12(t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
         sp_384_mul_12(t2, t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
-        (void)sp_384_sub_12(t2, p384_mod, t2);
-        sp_384_mont_add_12(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_12(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_12(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -114178,103 +114232,6 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "mov	r12, #0\n\t"
-        "add	lr, %[a], #0x40\n\t"
-        "\n"
-    "L_sp_521_sub_17_word_%=: \n\t"
-        "rsbs	r12, r12, #0\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	r12, r3, r3\n\t"
-        "cmp	%[a], lr\n\t"
-        "bne	L_sp_521_sub_17_word_%=\n\t"
-        "rsbs	r12, r12, #0\n\t"
-        "ldm	%[a]!, {r3}\n\t"
-        "ldm	%[b]!, {r7}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "stm	%[r]!, {r3}\n\t"
-        "sbc	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "subs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3}\n\t"
-        "ldm	%[b]!, {r7}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "stm	%[r]!, {r3}\n\t"
-        "sbc	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -116694,7 +116651,7 @@ static void sp_521_rshift1_17(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_521_mont_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o = a[0] & 1;
 
@@ -116748,7 +116705,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
+    sp_521_mont_div2_17(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -116781,7 +116738,8 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_17_ctx* ctx = (sp_521_proj_point_dbl_17_ctx*)sp_ctx->data;
@@ -116855,7 +116813,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_17(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_17(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -117628,7 +117586,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sub_17(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_17(y, y, p521_mod);
+    sp_521_mont_div2_17(y, y, p521_mod);
 }
 
 /* Convert the projective point to affine.
@@ -118142,8 +118100,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -118598,8 +118556,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -122730,8 +122688,8 @@ static void sp_521_mask_17(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[35];
     sp_digit t2[18];
@@ -122777,7 +122735,8 @@ static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digi
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_521_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_521_mod_17(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_521_div_17(a, m, NULL, r);
 }
@@ -123336,6 +123295,103 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r (r = a - b). Returns 0, or all bits set on borrow.
+ *
+ * r  Result of the subtraction.
+ * a  Number to subtract from.
+ * b  Number to subtract.
+ */
+static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "mov	r12, #0\n\t"
+        "add	lr, %[a], #0x40\n\t"
+        "\n"
+    "L_sp_521_sub_17_word_%=: \n\t"
+        "rsbs	r12, r12, #0\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "sbc	r12, r3, r3\n\t"
+        "cmp	%[a], lr\n\t"
+        "bne	L_sp_521_sub_17_word_%=\n\t"
+        "rsbs	r12, r12, #0\n\t"
+        "ldm	%[a]!, {r3}\n\t"
+        "ldm	%[b]!, {r7}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "stm	%[r]!, {r3}\n\t"
+        "sbc	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r (r = a - b). Returns 0, or all bits set on borrow.
+ *
+ * r  Result of the subtraction.
+ * a  Number to subtract from.
+ * b  Number to subtract.
+ */
+static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "subs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "sbcs	r4, r4, r8\n\t"
+        "sbcs	r5, r5, r9\n\t"
+        "sbcs	r6, r6, r10\n\t"
+        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
+        "ldm	%[a]!, {r3}\n\t"
+        "ldm	%[b]!, {r7}\n\t"
+        "sbcs	r3, r3, r7\n\t"
+        "stm	%[r]!, {r3}\n\t"
+        "sbc	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 /* Divide the number by 2 mod the modulus. (r = a / 2 % m)
  *
  * r  Result of division by 2.
@@ -124673,7 +124729,7 @@ static int sp_521_mod_inv_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_521_cmp_17(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_521_cmp_17(u, v) >= 0))) {
             sp_521_sub_17(u, u, v);
             o = sp_521_sub_17(b, b, d);
             if (o != 0)
@@ -125118,19 +125174,21 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 17;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_17(t1, point->y);
         (void)sp_521_mod_17(t1, t1, p521_mod);
         sp_521_sqr_17(t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
         sp_521_mul_17(t2, t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
-        (void)sp_521_sub_17(t2, p521_mod, t2);
-        sp_521_mont_add_17(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_17(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_17(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -143336,8 +143394,8 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -143377,7 +143435,8 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig
  * m  A single precision number that is the modulus to reduce with.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_1024_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static WC_INLINE int sp_1024_mod_32(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     return sp_1024_div_32(a, m, NULL, r);
 }
@@ -146416,7 +146475,7 @@ static void sp_1024_rshift1_32(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_1024_mont_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -146469,7 +146528,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
+    sp_1024_mont_div2_32(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -146502,7 +146561,8 @@ typedef struct sp_1024_proj_point_dbl_32_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_32_ctx* ctx = (sp_1024_proj_point_dbl_32_ctx*)sp_ctx->data;
@@ -146576,7 +146636,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_32(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_32(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -146626,122 +146686,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "mov	r12, #0\n\t"
-        "add	lr, %[a], #0x80\n\t"
-        "\n"
-    "L_sp_1024_sub_32_word_%=: \n\t"
-        "rsbs	r12, r12, #0\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	r12, r3, r3\n\t"
-        "cmp	%[a], lr\n\t"
-        "bne	L_sp_1024_sub_32_word_%=\n\t"
-        "mov	%[r], r12\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "subs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
-        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
-        "sbcs	r3, r3, r7\n\t"
-        "sbcs	r4, r4, r8\n\t"
-        "sbcs	r5, r5, r9\n\t"
-        "sbcs	r6, r6, r10\n\t"
-        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
-        "sbc	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#endif /* WOLFSSL_SP_SMALL */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -147301,7 +147245,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_32(y, y, p1024_mod);
+    sp_1024_mont_div2_32(y, y, p1024_mod);
 }
 
 /* Convert the projective point to affine.
@@ -147714,8 +147658,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -148069,8 +148013,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -154224,7 +154168,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_32(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_32(t1, ty, p1024_mod);
+    sp_1024_mont_div2_32(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -154244,7 +154188,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_32(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_32(t1, t1, p1024_mod);
+    sp_1024_mont_div2_32(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_32(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -154662,7 +154606,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_32(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_32(t1, ty, p1024_mod);
+        sp_1024_mont_div2_32(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -154700,7 +154644,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_32(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_32(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -155540,19 +155484,21 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 32;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_32(t1, point->y);
         (void)sp_1024_mod_32(t1, t1, p1024_mod);
         sp_1024_sqr_32(t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
         sp_1024_mul_32(t2, t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
-        (void)sp_1024_sub_32(t2, p1024_mod, t2);
-        sp_1024_mont_add_32(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_32(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_32(t1, p1024_mod);
         sp_1024_cond_sub_32(t1, t1, p1024_mod, ~(n >> 31));
         sp_1024_norm_32(t1);
diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c
index ed7935443..cbacbfe88 100644
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -3976,8 +3976,8 @@ static sp_int64 sp_2048_cmp_16(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_16(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_16(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[32], t2[17];
     sp_digit div, r1;
@@ -5016,8 +5016,8 @@ static sp_digit div_2048_word_32_cond(sp_digit d1, sp_digit d0, sp_digit div)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -5583,8 +5583,8 @@ static sp_int64 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -13167,8 +13167,8 @@ static sp_int64 sp_3072_cmp_24(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_24(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_24(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[48], t2[25];
     sp_digit div, r1;
@@ -14447,8 +14447,8 @@ static sp_digit div_3072_word_48_cond(sp_digit d1, sp_digit d0, sp_digit div)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[96], t2[49];
     sp_digit div, r1;
@@ -15166,8 +15166,8 @@ static sp_int64 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[96], t2[49];
     sp_digit div, r1;
@@ -19406,8 +19406,8 @@ static sp_digit div_4096_word_64_cond(sp_digit d1, sp_digit d0, sp_digit div)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -20277,8 +20277,8 @@ static sp_int64 sp_4096_cmp_64(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -23157,9 +23157,9 @@ static void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, const sp_digit* m)
         "sbcs	x4, x4, x7\n\t"
         "sub	x8, xzr, x7\n\t"
         "sbcs	x5, x5, xzr\n\t"
-        "stp	x3, x4, [%[r], 0]\n\t"
+        "stp	x3, x4, [%[r],0]\n\t"
         "sbc	x6, x6, x8\n\t"
-        "stp	x5, x6, [%[r], 16]\n\t"
+        "stp	x5, x6, [%[r],16]\n\t"
         :
         : [r] "r" (r), [a] "r" (a)
         : "memory", "x9", "x10", "x11", "x12", "x3", "x4", "x5", "x6", "x7", "x8", "x13", "cc"
@@ -23217,25 +23217,25 @@ static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b,
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_mont_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     __asm__ __volatile__ (
-        "ldp	x3, x4, [%[a], 0]\n\t"
-        "ldp	x5, x6, [%[a], 16]\n\t"
-        "sbfx	x8, x3, 0, 1\n\t"
-        "adds	x3, x3, x8\n\t"
-        "lsr	x7, x8, 32\n\t"
-        "adcs	x4, x4, x7\n\t"
-        "sub	x8, xzr, x7\n\t"
-        "adcs	x5, x5, xzr\n\t"
-        "extr	x3, x4, x3, 1\n\t"
-        "adcs	x6, x6, x8\n\t"
-        "extr	x4, x5, x4, 1\n\t"
-        "adc	x9, xzr, xzr\n\t"
-        "extr	x5, x6, x5, 1\n\t"
-        "extr	x6, x9, x6, 1\n\t"
-        "stp	x3, x4, [%[r], 0]\n\t"
-        "stp	x5, x6, [%[r], 16]\n\t"
+        "ldp   x3, x4, [%[a], 0]\n\t"
+        "ldp   x5, x6, [%[a], 16]\n\t"
+        "sbfx  x8, x3, 0, 1\n\t"
+        "adds      x3, x3, x8\n\t"
+        "lsr       x7, x8, 32\n\t"
+        "adcs      x4, x4, x7\n\t"
+        "sub       x8, xzr, x7\n\t"
+        "adcs      x5, x5, xzr\n\t"
+        "extr      x3, x4, x3, 1\n\t"
+        "adcs      x6, x6, x8\n\t"
+        "extr  x4, x5, x4, 1\n\t"
+        "adc   x9, xzr, xzr\n\t"
+        "extr  x5, x6, x5, 1\n\t"
+        "extr  x6, x9, x6, 1\n\t"
+        "stp   x3, x4, [%[r], 0]\n\t"
+        "stp   x5, x6, [%[r], 16]\n\t"
         :
         : [r] "r" (r), [a] "r" (a), [m] "r" (m)
         : "memory", "x3", "x4", "x5", "x6", "x7", "x9", "x8", "cc"
@@ -23425,7 +23425,7 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_4(t2, t2, p256_mod);
+    sp_256_mont_div2_4(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -23455,7 +23455,8 @@ typedef struct sp_256_proj_point_dbl_4_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_4_ctx* ctx = (sp_256_proj_point_dbl_4_ctx*)sp_ctx->data;
@@ -23527,7 +23528,7 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_4(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_4(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -23657,7 +23658,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     sp_256_mont_sub_4(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_4(y, y, p256_mod);
+    sp_256_mont_div2_4(y, y, p256_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -24120,7 +24121,7 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, y, t1, p256_mod);
         /* Y = Y/2 */
-        sp_256_div2_4(r[j].y, y, p256_mod);
+        sp_256_mont_div2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
     }
 }
@@ -25007,8 +25008,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap);
@@ -25436,8 +25437,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap);
@@ -40265,8 +40266,8 @@ static void sp_256_mask_4(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_256_div_4(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_256_div_4(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[8], t2[5];
     sp_digit div, r1;
@@ -42137,19 +42138,21 @@ static int sp_256_ecc_is_point_4(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 4;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_4(t1, point->y);
         (void)sp_256_mod_4(t1, t1, p256_mod);
         sp_256_sqr_4(t2, point->x);
         (void)sp_256_mod_4(t2, t2, p256_mod);
         sp_256_mul_4(t2, t2, point->x);
         (void)sp_256_mod_4(t2, t2, p256_mod);
-        (void)sp_256_sub_4(t2, p256_mod, t2);
-        sp_256_mont_add_4(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_4(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_4(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -44403,7 +44406,7 @@ static void sp_384_rshift1_6(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_384_mont_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -44456,7 +44459,7 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_6(t2, t2, p384_mod);
+    sp_384_mont_div2_6(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -44489,7 +44492,8 @@ typedef struct sp_384_proj_point_dbl_6_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_6_ctx* ctx = (sp_384_proj_point_dbl_6_ctx*)sp_ctx->data;
@@ -44563,7 +44567,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_6(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_6(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -44701,7 +44705,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     sp_384_mont_sub_6(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_6(y, y, p384_mod);
+    sp_384_mont_div2_6(y, y, p384_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -45087,7 +45091,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_6(y, y, t1, p384_mod);
         /* Y = Y/2 */
-        sp_384_div2_6(r[j].y, y, p384_mod);
+        sp_384_mont_div2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
     }
 }
@@ -45941,8 +45945,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_6(r, g, k, map, ct, heap);
@@ -46370,8 +46374,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_6(r, g, k, map, ct, heap);
@@ -67082,8 +67086,8 @@ static void sp_384_mask_6(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_384_div_6(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_384_div_6(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[12], t2[7];
     sp_digit div, r1;
@@ -67784,7 +67788,7 @@ static int sp_384_mod_inv_6(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_384_cmp_6(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_384_cmp_6(u, v) >= 0))) {
             sp_384_sub_6(u, u, v);
             o = sp_384_sub_6(b, b, d);
             if (o != 0)
@@ -68211,19 +68215,21 @@ static int sp_384_ecc_is_point_6(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 6;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_6(t1, point->y);
         (void)sp_384_mod_6(t1, t1, p384_mod);
         sp_384_sqr_6(t2, point->x);
         (void)sp_384_mod_6(t2, t2, p384_mod);
         sp_384_mul_6(t2, t2, point->x);
         (void)sp_384_mod_6(t2, t2, p384_mod);
-        (void)sp_384_sub_6(t2, p384_mod, t2);
-        sp_384_mont_add_6(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_6(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_6(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -70787,8 +70793,8 @@ static sp_int64 sp_521_cmp_9(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_521_div_9(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_521_div_9(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[19];
     sp_digit t2[10];
@@ -72774,7 +72780,7 @@ static void sp_521_rshift1_9(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_521_mont_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -72827,7 +72833,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
+    sp_521_mont_div2_9(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -72860,7 +72866,8 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_9_ctx* ctx = (sp_521_proj_point_dbl_9_ctx*)sp_ctx->data;
@@ -72934,7 +72941,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_9(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_9(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -73072,7 +73079,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sub_9(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_9(y, y, p521_mod);
+    sp_521_mont_div2_9(y, y, p521_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -73460,7 +73467,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
         /* Y = Y/2 */
-        sp_521_div2_9(r[j].y, y, p521_mod);
+        sp_521_mont_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
     }
 }
@@ -74359,8 +74366,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
@@ -74806,8 +74813,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
@@ -112712,7 +112719,7 @@ static int sp_521_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_521_cmp_9(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_521_cmp_9(u, v) >= 0))) {
             sp_521_sub_9(u, u, v);
             o = sp_521_sub_9(b, b, d);
             if (o != 0)
@@ -113149,19 +113156,21 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 9;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_9(t1, point->y);
         (void)sp_521_mod_9(t1, t1, p521_mod);
         sp_521_sqr_9(t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
         sp_521_mul_9(t2, t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
-        (void)sp_521_sub_9(t2, p521_mod, t2);
-        sp_521_mont_add_9(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_9(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_9(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -115479,8 +115488,8 @@ static sp_int64 sp_1024_cmp_16(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_1024_div_16(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_1024_div_16(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[32], t2[17];
     sp_digit div, r1;
@@ -116866,7 +116875,7 @@ static void sp_1024_rshift1_16(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_1024_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_1024_mont_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -116919,7 +116928,7 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_16(t2, t2, p1024_mod);
+    sp_1024_mont_div2_16(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -116952,7 +116961,8 @@ typedef struct sp_1024_proj_point_dbl_16_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_16_ctx* ctx = (sp_1024_proj_point_dbl_16_ctx*)sp_ctx->data;
@@ -117026,7 +117036,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_16(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_16(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -117164,107 +117174,9 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     sp_1024_mont_sub_16(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_16(y, y, p1024_mod);
+    sp_1024_mont_div2_16(y, y, p1024_mod);
 }
 
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    sp_digit c = 0;
-
-    __asm__ __volatile__ (
-        "add	x11, %[a], 128\n\t"
-        "\n1:\n\t"
-        "subs	%[c], xzr, %[c]\n\t"
-        "ldp	x3, x4, [%[a]], #16\n\t"
-        "ldp	x5, x6, [%[a]], #16\n\t"
-        "ldp	x7, x8, [%[b]], #16\n\t"
-        "sbcs	x3, x3, x7\n\t"
-        "ldp	x9, x10, [%[b]], #16\n\t"
-        "sbcs	x4, x4, x8\n\t"
-        "sbcs	x5, x5, x9\n\t"
-        "stp	x3, x4, [%[r]], #16\n\t"
-        "sbcs	x6, x6, x10\n\t"
-        "stp	x5, x6, [%[r]], #16\n\t"
-        "csetm	%[c], cc\n\t"
-        "cmp	%[a], x11\n\t"
-        "b.ne	1b\n\t"
-        : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc"
-    );
-
-    return c;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "ldp	x3, x4, [%[a], 0]\n\t"
-        "ldp	x7, x8, [%[b], 0]\n\t"
-        "subs	x3, x3, x7\n\t"
-        "ldp	x5, x6, [%[a], 16]\n\t"
-        "sbcs	x4, x4, x8\n\t"
-        "ldp	x9, x10, [%[b], 16]\n\t"
-        "sbcs	x5, x5, x9\n\t"
-        "stp	x3, x4, [%[r], 0]\n\t"
-        "sbcs	x6, x6, x10\n\t"
-        "stp	x5, x6, [%[r], 16]\n\t"
-        "ldp	x3, x4, [%[a], 32]\n\t"
-        "ldp	x7, x8, [%[b], 32]\n\t"
-        "sbcs	x3, x3, x7\n\t"
-        "ldp	x5, x6, [%[a], 48]\n\t"
-        "sbcs	x4, x4, x8\n\t"
-        "ldp	x9, x10, [%[b], 48]\n\t"
-        "sbcs	x5, x5, x9\n\t"
-        "stp	x3, x4, [%[r], 32]\n\t"
-        "sbcs	x6, x6, x10\n\t"
-        "stp	x5, x6, [%[r], 48]\n\t"
-        "ldp	x3, x4, [%[a], 64]\n\t"
-        "ldp	x7, x8, [%[b], 64]\n\t"
-        "sbcs	x3, x3, x7\n\t"
-        "ldp	x5, x6, [%[a], 80]\n\t"
-        "sbcs	x4, x4, x8\n\t"
-        "ldp	x9, x10, [%[b], 80]\n\t"
-        "sbcs	x5, x5, x9\n\t"
-        "stp	x3, x4, [%[r], 64]\n\t"
-        "sbcs	x6, x6, x10\n\t"
-        "stp	x5, x6, [%[r], 80]\n\t"
-        "ldp	x3, x4, [%[a], 96]\n\t"
-        "ldp	x7, x8, [%[b], 96]\n\t"
-        "sbcs	x3, x3, x7\n\t"
-        "ldp	x5, x6, [%[a], 112]\n\t"
-        "sbcs	x4, x4, x8\n\t"
-        "ldp	x9, x10, [%[b], 112]\n\t"
-        "sbcs	x5, x5, x9\n\t"
-        "stp	x3, x4, [%[r], 96]\n\t"
-        "sbcs	x6, x6, x10\n\t"
-        "stp	x5, x6, [%[r], 112]\n\t"
-        "csetm	%[r], cc\n\t"
-        : [r] "+r" (r)
-        : [a] "r" (a), [b] "r" (b)
-        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc"
-    );
-
-    return (sp_digit)r;
-}
-
-#endif /* WOLFSSL_SP_SMALL */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -117653,7 +117565,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_16(y, y, t1, p1024_mod);
         /* Y = Y/2 */
-        sp_1024_div2_16(r[j].y, y, p1024_mod);
+        sp_1024_mont_div2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
     }
 }
@@ -118373,8 +118285,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_win_add_sub_16(r, g, k, map, ct, heap);
@@ -124034,7 +123946,7 @@ static void sp_1024_accumulate_line_dbl_16(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_16(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_16(t1, ty, p1024_mod);
+    sp_1024_mont_div2_16(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_16(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -124054,7 +123966,7 @@ static void sp_1024_accumulate_line_dbl_16(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_16(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_16(t1, t1, p1024_mod);
+    sp_1024_mont_div2_16(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_16(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -124472,7 +124384,7 @@ static void sp_1024_accumulate_line_dbl_n_16(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_16(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_16(t1, ty, p1024_mod);
+        sp_1024_mont_div2_16(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_16(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -124510,7 +124422,7 @@ static void sp_1024_accumulate_line_dbl_n_16(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_16(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_16(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -125425,19 +125337,21 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 16;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_16(t1, point->y);
         (void)sp_1024_mod_16(t1, t1, p1024_mod);
         sp_1024_sqr_16(t2, point->x);
         (void)sp_1024_mod_16(t2, t2, p1024_mod);
         sp_1024_mul_16(t2, t2, point->x);
         (void)sp_1024_mod_16(t2, t2, p1024_mod);
-        (void)sp_1024_sub_16(t2, p1024_mod, t2);
-        sp_1024_mont_add_16(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_16(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_16(t1, p1024_mod);
         sp_1024_cond_sub_16(t1, t1, p1024_mod, ~(n >> 63));
         sp_1024_norm_16(t1);
diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c
index bf6b671c5..1873ef373 100644
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -21927,6 +21927,7 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
+#define sp_2048_mont_reduce_order_64   sp_2048_mont_reduce_64
 /* Reduce the number back to 2048 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -23943,8 +23944,8 @@ SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -24400,6 +24401,7 @@ SP_NOINLINE static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
+#define sp_2048_mont_reduce_order_64   sp_2048_mont_reduce_64
 /* Reduce the number back to 2048 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -27317,8 +27319,8 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -27530,8 +27532,8 @@ SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -73115,6 +73117,7 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
+#define sp_3072_mont_reduce_order_96   sp_3072_mont_reduce_96
 /* Reduce the number back to 3072 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -75403,8 +75406,8 @@ SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[96], t2[49];
     sp_digit div, r1;
@@ -75860,6 +75863,7 @@ SP_NOINLINE static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
+#define sp_3072_mont_reduce_order_96   sp_3072_mont_reduce_96
 /* Reduce the number back to 3072 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -79603,8 +79607,8 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -79821,8 +79825,8 @@ SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -87591,6 +87595,7 @@ SP_NOINLINE static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
+#define sp_4096_mont_reduce_order_128   sp_4096_mont_reduce_128
 /* Reduce the number back to 4096 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -92150,8 +92155,8 @@ SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0,
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -92369,8 +92374,8 @@ SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a,
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -97560,166 +97565,6 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "movs	r6, %[a]\n\t"
-        "movs	r3, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r6, r6, #32\n\t"
-#else
-        "add	r6, r6, #32\n\t"
-#endif
-        "\n"
-    "L_sp_256_sub_8_word_%=:\n\t"
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, r3\n\t"
-#else
-        "sub	r5, r5, r3\n\t"
-#endif
-        "ldr	r4, [%[a]]\n\t"
-        "ldr	r5, [%[b]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r5\n\t"
-#else
-        "sbc	r4, r5\n\t"
-#endif
-        "str	r4, [%[r]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r3\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r3\n\t"
-#else
-        "sbc	r3, r3\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[a], %[a], #4\n\t"
-#else
-        "add	%[a], %[a], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[b], %[b], #4\n\t"
-#else
-        "add	%[b], %[b], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[r], %[r], #4\n\t"
-#else
-        "add	%[r], %[r], #4\n\t"
-#endif
-        "cmp	%[a], r6\n\t"
-        "bne	L_sp_256_sub_8_word_%=\n\t"
-        "movs	%[r], r3\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r3, r3, r5\n\t"
-#else
-        "sub	r3, r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	%[r], %[r], %[r]\n\t"
-#elif defined(__clang__)
-        "sbcs	%[r], %[r]\n\t"
-#else
-        "sbc	%[r], %[r]\n\t"
-#endif
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -97999,69 +97844,6 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm)
     return err;
 }
 
-/* Conditionally subtract b from a using the mask m.
- * m is -1 to subtract and 0 when not copying.
- *
- * r  A single precision number representing condition subtract result.
- * a  A single precision number to subtract from.
- * b  A single precision number to subtract.
- * m  Mask value to apply.
- */
-SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
-{
-    __asm__ __volatile__ (
-        "movs	r4, #0\n\t"
-        "movs	r5, #32\n\t"
-        "mov	r8, r5\n\t"
-        "movs	r7, #0\n\t"
-        "\n"
-    "L_sp_256_cond_sub_8_words_%=:\n\t"
-        "ldr	r6, [%[b], r7]\n\t"
-#ifdef WOLFSSL_KEIL
-        "ands	r6, r6, %[m]\n\t"
-#elif defined(__clang__)
-        "ands	r6, %[m]\n\t"
-#else
-        "and	r6, %[m]\n\t"
-#endif
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, r4\n\t"
-#else
-        "sub	r5, r5, r4\n\t"
-#endif
-        "ldr	r5, [%[a], r7]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r5, r5, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r5, r6\n\t"
-#else
-        "sbc	r5, r6\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r4\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r4\n\t"
-#else
-        "sbc	r4, r4\n\t"
-#endif
-        "str	r5, [%[r], r7]\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r7, r7, #4\n\t"
-#else
-        "add	r7, r7, #4\n\t"
-#endif
-        "cmp	r7, r8\n\t"
-        "blt	L_sp_256_cond_sub_8_words_%=\n\t"
-        "movs	%[r], r4\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
-        :
-        : "memory", "r4", "r5", "r6", "r7", "r8"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 /* Reduce the number back to 256 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -99281,6 +99063,69 @@ SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
  */
 #define sp_256_norm_8(a)
 
+/* Conditionally subtract b from a using the mask m.
+ * m is -1 to subtract and 0 when not subtracting.
+ *
+ * r  A single precision number representing condition subtract result.
+ * a  A single precision number to subtract from.
+ * b  A single precision number to subtract.
+ * m  Mask value to apply.
+ */
+SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a,
+        const sp_digit* b, sp_digit m)
+{
+    __asm__ __volatile__ (
+        "movs	r4, #0\n\t"
+        "movs	r5, #32\n\t"
+        "mov	r8, r5\n\t"
+        "movs	r7, #0\n\t"
+        "\n"
+    "L_sp_256_cond_sub_8_words_%=:\n\t"
+        "ldr	r6, [%[b], r7]\n\t"
+#ifdef WOLFSSL_KEIL
+        "ands	r6, r6, %[m]\n\t"
+#elif defined(__clang__)
+        "ands	r6, %[m]\n\t"
+#else
+        "and	r6, %[m]\n\t"
+#endif
+        "movs	r5, #0\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r5, r5, r4\n\t"
+#else
+        "sub	r5, r5, r4\n\t"
+#endif
+        "ldr	r5, [%[a], r7]\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r5, r5, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r5, r6\n\t"
+#else
+        "sbc	r5, r6\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r4\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r4\n\t"
+#else
+        "sbc	r4, r4\n\t"
+#endif
+        "str	r5, [%[r], r7]\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r7, r7, #4\n\t"
+#else
+        "add	r7, r7, #4\n\t"
+#endif
+        "cmp	r7, r8\n\t"
+        "blt	L_sp_256_cond_sub_8_words_%=\n\t"
+        "movs	%[r], r4\n\t"
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
+        :
+        : "memory", "r4", "r5", "r6", "r7", "r8"
+    );
+    return (uint32_t)(size_t)r;
+}
+
 /* Map the Montgomery form projective coordinate point to an affine point.
  *
  * r  Resulting affine coordinate point.
@@ -100202,7 +100047,7 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a,
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_256_div2_8(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_256_mont_div2_8(sp_digit* r, const sp_digit* a,
         const sp_digit* m)
 {
     (void)m;
@@ -100513,7 +100358,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
+    sp_256_mont_div2_8(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -100546,7 +100391,8 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_8_ctx* ctx = (sp_256_proj_point_dbl_8_ctx*)sp_ctx->data;
@@ -100620,7 +100466,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_8(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_8(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -101330,7 +101176,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sub_8(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_8(y, y, p256_mod);
+    sp_256_mont_div2_8(y, y, p256_mod);
 }
 
 /* Convert the projective point to affine.
@@ -101808,8 +101654,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -102228,8 +102074,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -105430,8 +105276,8 @@ static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[16], t2[9];
     sp_digit div, r1;
@@ -106042,6 +105888,166 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer.
+ * a  A single precision integer.
+ * b  A single precision integer.
+ */
+SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "movs	r6, %[a]\n\t"
+        "movs	r3, #0\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r6, r6, #32\n\t"
+#else
+        "add	r6, r6, #32\n\t"
+#endif
+        "\n"
+    "L_sp_256_sub_8_word_%=:\n\t"
+        "movs	r5, #0\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r5, r5, r3\n\t"
+#else
+        "sub	r5, r5, r3\n\t"
+#endif
+        "ldr	r4, [%[a]]\n\t"
+        "ldr	r5, [%[b]]\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r5\n\t"
+#else
+        "sbc	r4, r5\n\t"
+#endif
+        "str	r4, [%[r]]\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r3\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r3\n\t"
+#else
+        "sbc	r3, r3\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[a], %[a], #4\n\t"
+#else
+        "add	%[a], %[a], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[b], %[b], #4\n\t"
+#else
+        "add	%[b], %[b], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[r], %[r], #4\n\t"
+#else
+        "add	%[r], %[r], #4\n\t"
+#endif
+        "cmp	%[a], r6\n\t"
+        "bne	L_sp_256_sub_8_word_%=\n\t"
+        "movs	%[r], r3\n\t"
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer.
+ * a  A single precision integer.
+ * b  A single precision integer.
+ */
+SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r3, r3, r5\n\t"
+#else
+        "sub	r3, r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	%[r], %[r], %[r]\n\t"
+#elif defined(__clang__)
+        "sbcs	%[r], %[r]\n\t"
+#else
+        "sbc	%[r], %[r]\n\t"
+#endif
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 /* Right shift a by 1 bit into r. (r = a >> 1)
  *
  * r  A single precision integer.
@@ -107266,7 +107272,7 @@ static int sp_256_mod_inv_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_256_cmp_8(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_256_cmp_8(u, v) >= 0))) {
             sp_256_sub_8(u, u, v);
             o = sp_256_sub_8(b, b, d);
             if (o != 0)
@@ -107695,19 +107701,21 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 8;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_8(t1, point->y);
         (void)sp_256_mod_8(t1, t1, p256_mod);
         sp_256_sqr_8(t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
         sp_256_mul_8(t2, t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
-        (void)sp_256_sub_8(t2, p256_mod, t2);
-        sp_256_mont_add_8(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_8(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_8(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -109173,200 +109181,6 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "movs	r6, %[a]\n\t"
-        "movs	r3, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r6, r6, #48\n\t"
-#else
-        "add	r6, r6, #48\n\t"
-#endif
-        "\n"
-    "L_sp_384_sub_12_word_%=:\n\t"
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, r3\n\t"
-#else
-        "sub	r5, r5, r3\n\t"
-#endif
-        "ldr	r4, [%[a]]\n\t"
-        "ldr	r5, [%[b]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r5\n\t"
-#else
-        "sbc	r4, r5\n\t"
-#endif
-        "str	r4, [%[r]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r3\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r3\n\t"
-#else
-        "sbc	r3, r3\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[a], %[a], #4\n\t"
-#else
-        "add	%[a], %[a], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[b], %[b], #4\n\t"
-#else
-        "add	%[b], %[b], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[r], %[r], #4\n\t"
-#else
-        "add	%[r], %[r], #4\n\t"
-#endif
-        "cmp	%[a], r6\n\t"
-        "bne	L_sp_384_sub_12_word_%=\n\t"
-        "movs	%[r], r3\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r3, r3, r5\n\t"
-#else
-        "sub	r3, r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	%[r], %[r], %[r]\n\t"
-#elif defined(__clang__)
-        "sbcs	%[r], %[r]\n\t"
-#else
-        "sbc	%[r], %[r]\n\t"
-#endif
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -110801,6 +110615,200 @@ SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a,
     sp_384_cond_sub_12(r, r, m, 0 - o);
 }
 
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer.
+ * a  A single precision integer.
+ * b  A single precision integer.
+ */
+SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "movs	r6, %[a]\n\t"
+        "movs	r3, #0\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r6, r6, #48\n\t"
+#else
+        "add	r6, r6, #48\n\t"
+#endif
+        "\n"
+    "L_sp_384_sub_12_word_%=:\n\t"
+        "movs	r5, #0\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r5, r5, r3\n\t"
+#else
+        "sub	r5, r5, r3\n\t"
+#endif
+        "ldr	r4, [%[a]]\n\t"
+        "ldr	r5, [%[b]]\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r5\n\t"
+#else
+        "sbc	r4, r5\n\t"
+#endif
+        "str	r4, [%[r]]\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r3\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r3\n\t"
+#else
+        "sbc	r3, r3\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[a], %[a], #4\n\t"
+#else
+        "add	%[a], %[a], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[b], %[b], #4\n\t"
+#else
+        "add	%[b], %[b], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[r], %[r], #4\n\t"
+#else
+        "add	%[r], %[r], #4\n\t"
+#endif
+        "cmp	%[a], r6\n\t"
+        "bne	L_sp_384_sub_12_word_%=\n\t"
+        "movs	%[r], r3\n\t"
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer.
+ * a  A single precision integer.
+ * b  A single precision integer.
+ */
+SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r3, r3, r5\n\t"
+#else
+        "sub	r3, r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t"
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	%[r], %[r], %[r]\n\t"
+#elif defined(__clang__)
+        "sbcs	%[r], %[r]\n\t"
+#else
+        "sbc	%[r], %[r]\n\t"
+#endif
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 /* Conditionally add a and b using the mask m.
  * m is -1 to add and 0 when not.
  *
@@ -111122,7 +111130,7 @@ static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_384_div2_12(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_384_mont_div2_12(sp_digit* r, const sp_digit* a,
         const sp_digit* m)
 {
     sp_digit o;
@@ -111176,7 +111184,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
+    sp_384_mont_div2_12(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -111209,7 +111217,8 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_12_ctx* ctx = (sp_384_proj_point_dbl_12_ctx*)sp_ctx->data;
@@ -111283,7 +111292,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_12(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_12(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -112019,7 +112028,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sub_12(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_12(y, y, p384_mod);
+    sp_384_mont_div2_12(y, y, p384_mod);
 }
 
 /* Convert the projective point to affine.
@@ -112513,8 +112522,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -112949,8 +112958,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -116227,8 +116236,8 @@ static void sp_384_mask_12(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[24], t2[13];
     sp_digit div, r1;
@@ -118455,7 +118464,7 @@ static int sp_384_mod_inv_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_384_cmp_12(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_384_cmp_12(u, v) >= 0))) {
             sp_384_sub_12(u, u, v);
             o = sp_384_sub_12(b, b, d);
             if (o != 0)
@@ -118888,19 +118897,21 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 12;
 
+        /* Check the curve equation: y^2 - x^3 - a.x = b */
         sp_384_sqr_12(t1, point->y);
         (void)sp_384_mod_12(t1, t1, p384_mod);
         sp_384_sqr_12(t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
         sp_384_mul_12(t2, t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
-        (void)sp_384_sub_12(t2, p384_mod, t2);
-        sp_384_mont_add_12(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_12(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_12(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -120452,244 +120463,6 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a,
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "movs	r6, %[a]\n\t"
-        "movs	r3, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r6, r6, #0x44\n\t"
-#else
-        "add	r6, r6, #0x44\n\t"
-#endif
-        "\n"
-    "L_sp_521_sub_17_word_%=:\n\t"
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, r3\n\t"
-#else
-        "sub	r5, r5, r3\n\t"
-#endif
-        "ldr	r4, [%[a]]\n\t"
-        "ldr	r5, [%[b]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r5\n\t"
-#else
-        "sbc	r4, r5\n\t"
-#endif
-        "str	r4, [%[r]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r3\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r3\n\t"
-#else
-        "sbc	r3, r3\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[a], %[a], #4\n\t"
-#else
-        "add	%[a], %[a], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[b], %[b], #4\n\t"
-#else
-        "add	%[b], %[b], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[r], %[r], #4\n\t"
-#else
-        "add	%[r], %[r], #4\n\t"
-#endif
-        "cmp	%[a], r6\n\t"
-        "bne	L_sp_521_sub_17_word_%=\n\t"
-        "movs	%[r], r3\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r3, r3, r5\n\t"
-#else
-        "sub	r3, r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldr	r5, [%[b]]\n\t"
-        "ldr	r3, [%[a]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-        "str	r3, [%[r]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	%[r], %[r], %[r]\n\t"
-#elif defined(__clang__)
-        "sbcs	%[r], %[r]\n\t"
-#else
-        "sbc	%[r], %[r]\n\t"
-#endif
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -124203,75 +123976,6 @@ SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a,
     );
 }
 
-/* Conditionally add a and b using the mask m.
- * m is -1 to add and 0 when not.
- *
- * r  A single precision number representing conditional add result.
- * a  A single precision number to add with.
- * b  A single precision number to add.
- * m  Mask value to apply.
- */
-SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a,
-        const sp_digit* b, sp_digit m)
-{
-    __asm__ __volatile__ (
-        "movs	r4, #0\n\t"
-        "movs	r5, #0x44\n\t"
-        "mov	r8, r5\n\t"
-        "movs	r7, #0\n\t"
-        "\n"
-    "L_sp_521_cond_add_17_words_%=:\n\t"
-        "ldr	r6, [%[b], r7]\n\t"
-#ifdef WOLFSSL_KEIL
-        "ands	r6, r6, %[m]\n\t"
-#elif defined(__clang__)
-        "ands	r6, %[m]\n\t"
-#else
-        "and	r6, %[m]\n\t"
-#endif
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, #1\n\t"
-#else
-        "sub	r5, r5, #1\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r5, r5, r4\n\t"
-#else
-        "add	r5, r5, r4\n\t"
-#endif
-        "ldr	r5, [%[a], r7]\n\t"
-#ifdef WOLFSSL_KEIL
-        "adcs	r5, r5, r6\n\t"
-#elif defined(__clang__)
-        "adcs	r5, r6\n\t"
-#else
-        "adc	r5, r6\n\t"
-#endif
-        "movs	r4, #0\n\t"
-#ifdef WOLFSSL_KEIL
-        "adcs	r4, r4, r4\n\t"
-#elif defined(__clang__)
-        "adcs	r4, r4\n\t"
-#else
-        "adc	r4, r4\n\t"
-#endif
-        "str	r5, [%[r], r7]\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r7, r7, #4\n\t"
-#else
-        "add	r7, r7, #4\n\t"
-#endif
-        "cmp	r7, r8\n\t"
-        "blt	L_sp_521_cond_add_17_words_%=\n\t"
-        "movs	%[r], r4\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
-        :
-        : "memory", "r4", "r5", "r6", "r7", "r8"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 /* Subtract two Montgomery form numbers (r = a - b % m).
  *
  * r   Result of subtration.
@@ -124612,6 +124316,75 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a,
     );
 }
 
+/* Conditionally add b to a: r = a + (b & m), over 17 words (0x44 bytes).
+ * m is -1 to add and 0 when not. Returns the carry out of the top word.
+ *
+ * r  A single precision number representing conditional add result.
+ * a  A single precision number to add with.
+ * b  A single precision number to add; each word is ANDed with m first.
+ * m  Mask value to apply.
+ */
+SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a,
+        const sp_digit* b, sp_digit m)
+{
+    __asm__ __volatile__ (
+        "movs	r4, #0\n\t" /* r4 = carry kept between words */
+        "movs	r5, #0x44\n\t" /* 0x44 = 17 words * 4 bytes */
+        "mov	r8, r5\n\t" /* r8 = loop limit in bytes */
+        "movs	r7, #0\n\t" /* r7 = byte offset of current word */
+        "\n"
+    "L_sp_521_cond_add_17_words_%=:\n\t"
+        "ldr	r6, [%[b], r7]\n\t" /* r6 = b word, masked with m next */
+#ifdef WOLFSSL_KEIL
+        "ands	r6, r6, %[m]\n\t"
+#elif defined(__clang__)
+        "ands	r6, %[m]\n\t"
+#else
+        "and	r6, %[m]\n\t"
+#endif
+        "movs	r5, #0\n\t" /* r5 = 0 - 1 + r4: re-creates carry flag */
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r5, r5, #1\n\t"
+#else
+        "sub	r5, r5, #1\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r5, r5, r4\n\t"
+#else
+        "add	r5, r5, r4\n\t"
+#endif
+        "ldr	r5, [%[a], r7]\n\t" /* r5 = a word; then r5 += r6 + carry */
+#ifdef WOLFSSL_KEIL
+        "adcs	r5, r5, r6\n\t"
+#elif defined(__clang__)
+        "adcs	r5, r6\n\t"
+#else
+        "adc	r5, r6\n\t"
+#endif
+        "movs	r4, #0\n\t" /* then r4 = 0 + 0 + carry: save carry out */
+#ifdef WOLFSSL_KEIL
+        "adcs	r4, r4, r4\n\t"
+#elif defined(__clang__)
+        "adcs	r4, r4\n\t"
+#else
+        "adc	r4, r4\n\t"
+#endif
+        "str	r5, [%[r], r7]\n\t" /* store the sum word into r */
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r7, r7, #4\n\t"
+#else
+        "add	r7, r7, #4\n\t"
+#endif
+        "cmp	r7, r8\n\t" /* loop while byte offset < 0x44 */
+        "blt	L_sp_521_cond_add_17_words_%=\n\t"
+        "movs	%[r], r4\n\t" /* final carry is handed back through r */
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b), [m] "+l" (m)
+        :
+        : "memory", "r4", "r5", "r6", "r7", "r8"
+    );
+    return (uint32_t)(size_t)r;
+}
+
 /* Right shift a by 1 bit into r. (r = a >> 1)
  *
  * r  A single precision integer.
@@ -124943,7 +124716,7 @@ static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_521_div2_17(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_521_mont_div2_17(sp_digit* r, const sp_digit* a,
         const sp_digit* m)
 {
     sp_digit o;
@@ -124997,7 +124770,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
+    sp_521_mont_div2_17(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -125030,7 +124803,8 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_17_ctx* ctx = (sp_521_proj_point_dbl_17_ctx*)sp_ctx->data;
@@ -125104,7 +124878,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_17(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_17(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -125877,7 +125651,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sub_17(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_17(y, y, p521_mod);
+    sp_521_mont_div2_17(y, y, p521_mod);
 }
 
 /* Convert the projective point to affine.
@@ -126391,8 +126165,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -126847,8 +126621,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -132400,8 +132174,8 @@ static void sp_521_mask_17(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[35];
     sp_digit t2[18];
@@ -133008,6 +132782,244 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r. (r = a - b) Returns 0, or all ones on borrow out.
+ * NOTE(review): this branch is nested in #ifndef WOLFSSL_SP_SMALL - dead code?
+ * r  A single precision integer receiving the difference.
+ * a  A single precision integer to subtract from.
+ * b  A single precision integer to subtract.
+ */
+SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "movs	r6, %[a]\n\t" /* r6 = a + 0x44: end of a (17 words) */
+        "movs	r3, #0\n\t" /* r3 = borrow mask: 0 or all ones */
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	r6, r6, #0x44\n\t"
+#else
+        "add	r6, r6, #0x44\n\t"
+#endif
+        "\n"
+    "L_sp_521_sub_17_word_%=:\n\t"
+        "movs	r5, #0\n\t" /* 0 - r3 re-creates the borrow in the flags */
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r5, r5, r3\n\t"
+#else
+        "sub	r5, r5, r3\n\t"
+#endif
+        "ldr	r4, [%[a]]\n\t" /* r4 = current word of a */
+        "ldr	r5, [%[b]]\n\t" /* r5 = current word of b; r4 -= r5 + borrow */
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r5\n\t"
+#else
+        "sbc	r4, r5\n\t"
+#endif
+        "str	r4, [%[r]]\n\t" /* store word; then r3 = r3 - r3 - borrow */
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r3\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r3\n\t"
+#else
+        "sbc	r3, r3\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[a], %[a], #4\n\t"
+#else
+        "add	%[a], %[a], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[b], %[b], #4\n\t"
+#else
+        "add	%[b], %[b], #4\n\t"
+#endif
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "adds	%[r], %[r], #4\n\t"
+#else
+        "add	%[r], %[r], #4\n\t"
+#endif
+        "cmp	%[a], r6\n\t" /* stop once a reaches the end pointer */
+        "bne	L_sp_521_sub_17_word_%=\n\t"
+        "movs	%[r], r3\n\t" /* final borrow mask is handed back via r */
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r. (r = a - b) Unrolled: 8 word pairs plus 1 word.
+ * Returns 0, or all ones (-1) when there is a borrow out of the top word.
+ * r  A single precision integer receiving the difference.
+ * a  A single precision integer to subtract from.
+ * b  A single precision integer to subtract.
+ */
+SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    __asm__ __volatile__ (
+        "ldm	%[b]!, {r5, r6}\n\t" /* r5,r6 = next 2 words of b; b += 8 */
+        "ldm	%[a]!, {r3, r4}\n\t" /* r3,r4 = next 2 words of a; a += 8 */
+#if defined(__clang__) || defined(WOLFSSL_KEIL)
+        "subs	r3, r3, r5\n\t" /* lowest word: plain sub, no borrow in */
+#else
+        "sub	r3, r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t" /* store 2 result words; r += 8 */
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 2: same steps, borrow chained */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 3 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 4 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 5 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 6 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 7 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldm	%[b]!, {r5, r6}\n\t" /* pair 8 */
+        "ldm	%[a]!, {r3, r4}\n\t"
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+#ifdef WOLFSSL_KEIL
+        "sbcs	r4, r4, r6\n\t"
+#elif defined(__clang__)
+        "sbcs	r4, r6\n\t"
+#else
+        "sbc	r4, r6\n\t"
+#endif
+        "stm	%[r]!, {r3, r4}\n\t"
+        "ldr	r5, [%[b]]\n\t" /* 17th (top) word of b */
+        "ldr	r3, [%[a]]\n\t" /* 17th (top) word of a */
+#ifdef WOLFSSL_KEIL
+        "sbcs	r3, r3, r5\n\t"
+#elif defined(__clang__)
+        "sbcs	r3, r5\n\t"
+#else
+        "sbc	r3, r5\n\t"
+#endif
+        "str	r3, [%[r]]\n\t" /* then r = r - r - borrow: 0 or all ones */
+#ifdef WOLFSSL_KEIL
+        "sbcs	%[r], %[r], %[r]\n\t"
+#elif defined(__clang__)
+        "sbcs	%[r], %[r]\n\t"
+#else
+        "sbc	%[r], %[r]\n\t"
+#endif
+        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 /* Divide the number by 2 mod the modulus. (r = a / 2 % m)
  *
  * r  Result of division by 2.
@@ -135382,7 +135394,7 @@ static int sp_521_mod_inv_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_521_cmp_17(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_521_cmp_17(u, v) >= 0))) {
             sp_521_sub_17(u, u, v);
             o = sp_521_sub_17(b, b, d);
             if (o != 0)
@@ -135827,19 +135839,21 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 17;
 
+        /* Check the curve equation: y^2 - x^3 - a.x = b */
         sp_521_sqr_17(t1, point->y);
         (void)sp_521_mod_17(t1, t1, p521_mod);
         sp_521_sqr_17(t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
         sp_521_mul_17(t2, t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
-        (void)sp_521_sub_17(t2, p521_mod, t2);
-        sp_521_mont_add_17(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_17(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_17(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -202364,8 +202378,8 @@ SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -202685,6 +202699,7 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm)
     return err;
 }
 
+#define sp_1024_mont_reduce_order_32   sp_1024_mont_reduce_32
 /* Reduce the number back to 1024 bits using Montgomery reduction.
  *
  * a   A single precision number to reduce in place.
@@ -209584,7 +209599,7 @@ static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-SP_NOINLINE static void sp_1024_div2_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_1024_mont_div2_32(sp_digit* r, const sp_digit* a,
         const sp_digit* m)
 {
     sp_digit o;
@@ -209638,7 +209653,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
+    sp_1024_mont_div2_32(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -209671,7 +209686,8 @@ typedef struct sp_1024_proj_point_dbl_32_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_32_ctx* ctx = (sp_1024_proj_point_dbl_32_ctx*)sp_ctx->data;
@@ -209745,7 +209761,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_32(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_32(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -209795,370 +209811,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_1024_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "movs	r6, %[a]\n\t"
-        "movs	r3, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	r6, r6, #0x80\n\t"
-#else
-        "add	r6, r6, #0x80\n\t"
-#endif
-        "\n"
-    "L_sp_1024_sub_32_word_%=:\n\t"
-        "movs	r5, #0\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r5, r5, r3\n\t"
-#else
-        "sub	r5, r5, r3\n\t"
-#endif
-        "ldr	r4, [%[a]]\n\t"
-        "ldr	r5, [%[b]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r5\n\t"
-#else
-        "sbc	r4, r5\n\t"
-#endif
-        "str	r4, [%[r]]\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r3\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r3\n\t"
-#else
-        "sbc	r3, r3\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[a], %[a], #4\n\t"
-#else
-        "add	%[a], %[a], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[b], %[b], #4\n\t"
-#else
-        "add	%[b], %[b], #4\n\t"
-#endif
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "adds	%[r], %[r], #4\n\t"
-#else
-        "add	%[r], %[r], #4\n\t"
-#endif
-        "cmp	%[a], r6\n\t"
-        "bne	L_sp_1024_sub_32_word_%=\n\t"
-        "movs	%[r], r3\n\t"
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-SP_NOINLINE static sp_digit sp_1024_sub_32(sp_digit* r, const sp_digit* a,
-        const sp_digit* b)
-{
-    __asm__ __volatile__ (
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#if defined(__clang__) || defined(WOLFSSL_KEIL)
-        "subs	r3, r3, r5\n\t"
-#else
-        "sub	r3, r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-        "ldm	%[b]!, {r5, r6}\n\t"
-        "ldm	%[a]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	r3, r3, r5\n\t"
-#elif defined(__clang__)
-        "sbcs	r3, r5\n\t"
-#else
-        "sbc	r3, r5\n\t"
-#endif
-#ifdef WOLFSSL_KEIL
-        "sbcs	r4, r4, r6\n\t"
-#elif defined(__clang__)
-        "sbcs	r4, r6\n\t"
-#else
-        "sbc	r4, r6\n\t"
-#endif
-        "stm	%[r]!, {r3, r4}\n\t"
-#ifdef WOLFSSL_KEIL
-        "sbcs	%[r], %[r], %[r]\n\t"
-#elif defined(__clang__)
-        "sbcs	%[r], %[r]\n\t"
-#else
-        "sbc	%[r], %[r]\n\t"
-#endif
-        : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#endif /* WOLFSSL_SP_SMALL */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -210718,7 +210370,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_32(y, y, p1024_mod);
+    sp_1024_mont_div2_32(y, y, p1024_mod);
 }
 
 /* Convert the projective point to affine.
@@ -211131,8 +210783,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -211486,8 +211138,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -217641,7 +217293,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_32(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_32(t1, ty, p1024_mod);
+    sp_1024_mont_div2_32(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -217661,7 +217313,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_32(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_32(t1, t1, p1024_mod);
+    sp_1024_mont_div2_32(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_32(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -218079,7 +217731,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_32(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_32(t1, ty, p1024_mod);
+        sp_1024_mont_div2_32(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -218117,7 +217769,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_32(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_32(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -218957,19 +218609,21 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 32;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_32(t1, point->y);
         (void)sp_1024_mod_32(t1, t1, p1024_mod);
         sp_1024_sqr_32(t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
         sp_1024_mul_32(t2, t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
-        (void)sp_1024_sub_32(t2, p1024_mod, t2);
-        sp_1024_mont_add_32(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_32(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_32(t1, p1024_mod);
         sp_1024_cond_sub_32(t1, t1, p1024_mod, ~(n >> 31));
         sp_1024_norm_32(t1);
diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c
index dc5c3385d..2f011818f 100644
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -87,11 +87,14 @@
 #define SP_PRINT_INT(var, name)                       \
     fprintf(stderr, name "=%d\n", var)
 
-#if (((!defined(WC_NO_CACHE_RESISTANT) && \
-      (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))) || \
-     (defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP))) && \
+#if ((defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && \
+     ((!defined(WC_NO_CACHE_RESISTANT) && \
+       (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))) || \
+      (defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP))) && \
     !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || (defined(WOLFSSL_SP_SMALL) && \
-    defined(WOLFSSL_HAVE_SP_ECC))
+    defined(WOLFSSL_HAVE_SP_ECC) && (!defined(WOLFSSL_SP_NO_256) || \
+    defined(WOLFSSL_SP_384) || defined(WOLFSSL_SP_521) || \
+    defined(WOLFSSL_SP_1024)))
 /* Mask for address to obfuscate which of the two address will be used. */
 static const size_t addr_mask[2] = { 0, (size_t)-1 };
 #endif
@@ -21259,7 +21262,8 @@ SP_NOINLINE static void sp_256_rshift1_9(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_mont_div2_9(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_256_cond_add_9(r, a, m, 0 - (a[0] & 1));
     sp_256_norm_9(r);
@@ -21310,7 +21314,7 @@ static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_9(t2, t2, p256_mod);
+    sp_256_mont_div2_9(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_9(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -21343,7 +21347,8 @@ typedef struct sp_256_proj_point_dbl_9_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_9_ctx* ctx = (sp_256_proj_point_dbl_9_ctx*)sp_ctx->data;
@@ -21417,7 +21422,7 @@ static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_9(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_9(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -22277,7 +22282,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
     sp_256_mont_sub_9(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_9(y, y, p256_mod);
+    sp_256_mont_div2_9(y, y, p256_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -22348,7 +22353,7 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r,
         sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_9(y, y, t1, p256_mod);
         /* Y = Y/2 */
-        sp_256_div2_9(r[j].y, y, p256_mod);
+        sp_256_mont_div2_9(r[j].y, y, p256_mod);
         r[j].infinity = 0;
     }
 }
@@ -23191,8 +23196,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
@@ -26116,8 +26121,8 @@ static int sp_256_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_256_cmp_9(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_256_cmp_9(u, v) >= 0))) {
                 sp_256_sub_9(u, u, v);
                 sp_256_norm_9(u);
 
@@ -26563,19 +26568,21 @@ static int sp_256_ecc_is_point_9(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 9;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_9(t1, point->y);
         (void)sp_256_mod_9(t1, t1, p256_mod);
         sp_256_sqr_9(t2, point->x);
         (void)sp_256_mod_9(t2, t2, p256_mod);
         sp_256_mul_9(t2, t2, point->x);
         (void)sp_256_mod_9(t2, t2, p256_mod);
-        (void)sp_256_sub_9(t2, p256_mod, t2);
-        sp_256_mont_add_9(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_9(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_9(t1, t1, point->x, p256_mod);
         sp_256_mont_add_9(t1, t1, point->x, p256_mod);
         sp_256_mont_add_9(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_9(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -28691,7 +28698,8 @@ SP_NOINLINE static void sp_384_rshift1_15(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_384_div2_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_384_mont_div2_15(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_384_cond_add_15(r, a, m, 0 - (a[0] & 1));
     sp_384_norm_15(r);
@@ -28742,7 +28750,7 @@ static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_15(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_15(t2, t2, p384_mod);
+    sp_384_mont_div2_15(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_15(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -28775,7 +28783,8 @@ typedef struct sp_384_proj_point_dbl_15_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_15_ctx* ctx = (sp_384_proj_point_dbl_15_ctx*)sp_ctx->data;
@@ -28849,7 +28858,7 @@ static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_15(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_15(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -29767,7 +29776,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
     sp_384_mont_sub_15(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_15(y, y, p384_mod);
+    sp_384_mont_div2_15(y, y, p384_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -29838,7 +29847,7 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r,
         sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_15(y, y, t1, p384_mod);
         /* Y = Y/2 */
-        sp_384_div2_15(r[j].y, y, p384_mod);
+        sp_384_mont_div2_15(r[j].y, y, p384_mod);
         r[j].infinity = 0;
     }
 }
@@ -30741,8 +30750,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_15(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_15(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_15(r, g, k, map, ct, heap);
@@ -34187,8 +34196,8 @@ static int sp_384_mod_inv_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_384_cmp_15(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_384_cmp_15(u, v) >= 0))) {
                 sp_384_sub_15(u, u, v);
                 sp_384_norm_15(u);
 
@@ -34640,19 +34649,21 @@ static int sp_384_ecc_is_point_15(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 15;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_15(t1, point->y);
         (void)sp_384_mod_15(t1, t1, p384_mod);
         sp_384_sqr_15(t2, point->x);
         (void)sp_384_mod_15(t2, t2, p384_mod);
         sp_384_mul_15(t2, t2, point->x);
         (void)sp_384_mod_15(t2, t2, p384_mod);
-        (void)sp_384_sub_15(t2, p384_mod, t2);
-        sp_384_mont_add_15(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_15(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_15(t1, t1, point->x, p384_mod);
         sp_384_mont_add_15(t1, t1, point->x, p384_mod);
         sp_384_mont_add_15(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_15(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -36365,7 +36376,8 @@ SP_NOINLINE static void sp_521_rshift1_21(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_521_div2_21(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_521_mont_div2_21(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_521_cond_add_21(r, a, m, 0 - (a[0] & 1));
     sp_521_norm_21(r);
@@ -36416,7 +36428,7 @@ static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_21(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_21(t2, t2, p521_mod);
+    sp_521_mont_div2_21(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_21(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -36449,7 +36461,8 @@ typedef struct sp_521_proj_point_dbl_21_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_21_ctx* ctx = (sp_521_proj_point_dbl_21_ctx*)sp_ctx->data;
@@ -36523,7 +36536,7 @@ static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_21(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_21(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -37306,7 +37319,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i,
     sp_521_mont_sub_21(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_21(y, y, p521_mod);
+    sp_521_mont_div2_21(y, y, p521_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -37377,7 +37390,7 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r,
         sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_21(y, y, t1, p521_mod);
         /* Y = Y/2 */
-        sp_521_div2_21(r[j].y, y, p521_mod);
+        sp_521_mont_div2_21(r[j].y, y, p521_mod);
         r[j].infinity = 0;
     }
 }
@@ -38340,8 +38353,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_21(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_21(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_21(r, g, k, map, ct, heap);
@@ -42342,8 +42355,8 @@ static int sp_521_mod_inv_21(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_521_cmp_21(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_521_cmp_21(u, v) >= 0))) {
                 sp_521_sub_21(u, u, v);
                 sp_521_norm_21(u);
 
@@ -42810,19 +42823,21 @@ static int sp_521_ecc_is_point_21(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 21;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_21(t1, point->y);
         (void)sp_521_mod_21(t1, t1, p521_mod);
         sp_521_sqr_21(t2, point->x);
         (void)sp_521_mod_21(t2, t2, p521_mod);
         sp_521_mul_21(t2, t2, point->x);
         (void)sp_521_mod_21(t2, t2, p521_mod);
-        (void)sp_521_sub_21(t2, p521_mod, t2);
-        sp_521_mont_add_21(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_21(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_21(t1, t1, point->x, p521_mod);
         sp_521_mont_add_21(t1, t1, point->x, p521_mod);
         sp_521_mont_add_21(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_21(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -45161,7 +45176,8 @@ SP_NOINLINE static void sp_1024_rshift1_42(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_1024_div2_42(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_1024_mont_div2_42(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_1024_cond_add_42(r, a, m, 0 - (a[0] & 1));
     sp_1024_norm_42(r);
@@ -45212,7 +45228,7 @@ static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_42(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_42(t2, t2, p1024_mod);
+    sp_1024_mont_div2_42(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_42(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -45245,7 +45261,8 @@ typedef struct sp_1024_proj_point_dbl_42_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_42_ctx* ctx = (sp_1024_proj_point_dbl_42_ctx*)sp_ctx->data;
@@ -45319,7 +45336,7 @@ static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_42(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_42(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -46136,7 +46153,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i,
     sp_1024_mont_sub_42(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_42(y, y, p1024_mod);
+    sp_1024_mont_div2_42(y, y, p1024_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -46207,7 +46224,7 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r,
         sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_42(y, y, t1, p1024_mod);
         /* Y = Y/2 */
-        sp_1024_div2_42(r[j].y, y, p1024_mod);
+        sp_1024_mont_div2_42(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
     }
 }
@@ -46921,8 +46938,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_win_add_sub_42(r, g, k, map, ct, heap);
@@ -53383,7 +53400,7 @@ static void sp_1024_accumulate_line_dbl_42(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_42(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_42(t1, ty, p1024_mod);
+    sp_1024_mont_div2_42(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_42(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -53403,7 +53420,7 @@ static void sp_1024_accumulate_line_dbl_42(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_42(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_42(t1, t1, p1024_mod);
+    sp_1024_mont_div2_42(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_42(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -53821,7 +53838,7 @@ static void sp_1024_accumulate_line_dbl_n_42(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_42(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_42(t1, ty, p1024_mod);
+        sp_1024_mont_div2_42(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_42(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -53859,7 +53876,7 @@ static void sp_1024_accumulate_line_dbl_n_42(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_42(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_42(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -54696,19 +54713,21 @@ static int sp_1024_ecc_is_point_42(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 42;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_42(t1, point->y);
         (void)sp_1024_mod_42(t1, t1, p1024_mod);
         sp_1024_sqr_42(t2, point->x);
         (void)sp_1024_mod_42(t2, t2, p1024_mod);
         sp_1024_mul_42(t2, t2, point->x);
         (void)sp_1024_mod_42(t2, t2, p1024_mod);
-        (void)sp_1024_sub_42(t2, p1024_mod, t2);
-        sp_1024_mont_add_42(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_42(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_42(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_42(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_42(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_42(t1, p1024_mod);
         sp_1024_cond_sub_42(t1, t1, p1024_mod, ~(n >> 24));
         sp_1024_norm_42(t1);
diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c
index d9a55dbe6..38f47e645 100644
--- a/wolfcrypt/src/sp_c64.c
+++ b/wolfcrypt/src/sp_c64.c
@@ -87,11 +87,14 @@
 #define SP_PRINT_INT(var, name)                       \
     fprintf(stderr, name "=%d\n", var)
 
-#if (((!defined(WC_NO_CACHE_RESISTANT) && \
-      (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))) || \
-     (defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP))) && \
+#if ((defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && \
+     ((!defined(WC_NO_CACHE_RESISTANT) && \
+       (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))) || \
+      (defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP))) && \
     !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || (defined(WOLFSSL_SP_SMALL) && \
-    defined(WOLFSSL_HAVE_SP_ECC))
+    defined(WOLFSSL_HAVE_SP_ECC) && (!defined(WOLFSSL_SP_NO_256) || \
+    defined(WOLFSSL_SP_384) || defined(WOLFSSL_SP_521) || \
+    defined(WOLFSSL_SP_1024)))
 /* Mask for address to obfuscate which of the two address will be used. */
 static const size_t addr_mask[2] = { 0, (size_t)-1 };
 #endif
@@ -22231,7 +22234,8 @@ SP_NOINLINE static void sp_256_rshift1_5(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_5(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_mont_div2_5(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_256_cond_add_5(r, a, m, 0 - (a[0] & 1));
     sp_256_norm_5(r);
@@ -22282,7 +22286,7 @@ static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_5(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_5(t2, t2, p256_mod);
+    sp_256_mont_div2_5(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_5(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -22315,7 +22319,8 @@ typedef struct sp_256_proj_point_dbl_5_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_5_ctx* ctx = (sp_256_proj_point_dbl_5_ctx*)sp_ctx->data;
@@ -22389,7 +22394,7 @@ static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_5(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_5(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -23224,7 +23229,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i,
     sp_256_mont_sub_5(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_5(y, y, p256_mod);
+    sp_256_mont_div2_5(y, y, p256_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -23295,7 +23300,7 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r,
         sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_5(y, y, t1, p256_mod);
         /* Y = Y/2 */
-        sp_256_div2_5(r[j].y, y, p256_mod);
+        sp_256_mont_div2_5(r[j].y, y, p256_mod);
         r[j].infinity = 0;
     }
 }
@@ -24098,8 +24103,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_5(r, g, k, map, ct, heap);
@@ -27014,8 +27019,8 @@ static int sp_256_mod_inv_5(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_256_cmp_5(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_256_cmp_5(u, v) >= 0))) {
                 sp_256_sub_5(u, u, v);
                 sp_256_norm_5(u);
 
@@ -27457,19 +27462,21 @@ static int sp_256_ecc_is_point_5(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 5;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_5(t1, point->y);
         (void)sp_256_mod_5(t1, t1, p256_mod);
         sp_256_sqr_5(t2, point->x);
         (void)sp_256_mod_5(t2, t2, p256_mod);
         sp_256_mul_5(t2, t2, point->x);
         (void)sp_256_mod_5(t2, t2, p256_mod);
-        (void)sp_256_sub_5(t2, p256_mod, t2);
-        sp_256_mont_add_5(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_5(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_5(t1, t1, point->x, p256_mod);
         sp_256_mont_add_5(t1, t1, point->x, p256_mod);
         sp_256_mont_add_5(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_5(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -29151,7 +29158,8 @@ SP_NOINLINE static void sp_384_rshift1_7(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_384_div2_7(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_384_mont_div2_7(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_384_cond_add_7(r, a, m, 0 - (a[0] & 1));
     sp_384_norm_7(r);
@@ -29202,7 +29210,7 @@ static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_7(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_7(t2, t2, p384_mod);
+    sp_384_mont_div2_7(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_7(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -29235,7 +29243,8 @@ typedef struct sp_384_proj_point_dbl_7_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_7_ctx* ctx = (sp_384_proj_point_dbl_7_ctx*)sp_ctx->data;
@@ -29309,7 +29318,7 @@ static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_7(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_7(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -30181,7 +30190,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i,
     sp_384_mont_sub_7(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_7(y, y, p384_mod);
+    sp_384_mont_div2_7(y, y, p384_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -30252,7 +30261,7 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r,
         sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_7(y, y, t1, p384_mod);
         /* Y = Y/2 */
-        sp_384_div2_7(r[j].y, y, p384_mod);
+        sp_384_mont_div2_7(r[j].y, y, p384_mod);
         r[j].infinity = 0;
     }
 }
@@ -31075,8 +31084,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_7(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_7(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_7(r, g, k, map, ct, heap);
@@ -34476,8 +34485,8 @@ static int sp_384_mod_inv_7(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_384_cmp_7(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_384_cmp_7(u, v) >= 0))) {
                 sp_384_sub_7(u, u, v);
                 sp_384_norm_7(u);
 
@@ -34921,19 +34930,21 @@ static int sp_384_ecc_is_point_7(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 7;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_7(t1, point->y);
         (void)sp_384_mod_7(t1, t1, p384_mod);
         sp_384_sqr_7(t2, point->x);
         (void)sp_384_mod_7(t2, t2, p384_mod);
         sp_384_mul_7(t2, t2, point->x);
         (void)sp_384_mod_7(t2, t2, p384_mod);
-        (void)sp_384_sub_7(t2, p384_mod, t2);
-        sp_384_mont_add_7(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_7(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_7(t1, t1, point->x, p384_mod);
         sp_384_mont_add_7(t1, t1, point->x, p384_mod);
         sp_384_mont_add_7(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_7(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -36686,7 +36697,8 @@ SP_NOINLINE static void sp_521_rshift1_9(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_521_mont_div2_9(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_521_cond_add_9(r, a, m, 0 - (a[0] & 1));
     sp_521_norm_9(r);
@@ -36737,7 +36749,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
+    sp_521_mont_div2_9(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -36770,7 +36782,8 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_9_ctx* ctx = (sp_521_proj_point_dbl_9_ctx*)sp_ctx->data;
@@ -36844,7 +36857,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_9(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_9(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -37598,7 +37611,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sub_9(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_9(y, y, p521_mod);
+    sp_521_mont_div2_9(y, y, p521_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -37669,7 +37682,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
         /* Y = Y/2 */
-        sp_521_div2_9(r[j].y, y, p521_mod);
+        sp_521_mont_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
     }
 }
@@ -38512,8 +38525,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
@@ -41945,8 +41958,8 @@ static int sp_521_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
         }
 
         while (ut > 1 && vt > 1) {
-            if (ut > vt || (ut == vt &&
-                                       sp_521_cmp_9(u, v) >= 0)) {
+            if ((ut > vt) || ((ut == vt) &&
+                    (sp_521_cmp_9(u, v) >= 0))) {
                 sp_521_sub_9(u, u, v);
                 sp_521_norm_9(u);
 
@@ -42401,19 +42414,21 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 9;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_9(t1, point->y);
         (void)sp_521_mod_9(t1, t1, p521_mod);
         sp_521_sqr_9(t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
         sp_521_mul_9(t2, t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
-        (void)sp_521_sub_9(t2, p521_mod, t2);
-        sp_521_mont_add_9(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_9(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_9(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -44574,7 +44589,8 @@ SP_NOINLINE static void sp_1024_rshift1_18(sp_digit* r, const sp_digit* a)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_1024_div2_18(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_1024_mont_div2_18(sp_digit* r, const sp_digit* a,
+        const sp_digit* m)
 {
     sp_1024_cond_add_18(r, a, m, 0 - (a[0] & 1));
     sp_1024_norm_18(r);
@@ -44625,7 +44641,7 @@ static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_18(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_18(t2, t2, p1024_mod);
+    sp_1024_mont_div2_18(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_18(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -44658,7 +44674,8 @@ typedef struct sp_1024_proj_point_dbl_18_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_18_ctx* ctx = (sp_1024_proj_point_dbl_18_ctx*)sp_ctx->data;
@@ -44732,7 +44749,7 @@ static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_18(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_18(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -45490,7 +45507,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i,
     sp_1024_mont_sub_18(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_18(y, y, p1024_mod);
+    sp_1024_mont_div2_18(y, y, p1024_mod);
 }
 
 /* Double the Montgomery form projective point p a number of times.
@@ -45561,7 +45578,7 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r,
         sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_18(y, y, t1, p1024_mod);
         /* Y = Y/2 */
-        sp_1024_div2_18(r[j].y, y, p1024_mod);
+        sp_1024_mont_div2_18(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
     }
 }
@@ -46275,8 +46292,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_win_add_sub_18(r, g, k, map, ct, heap);
@@ -51967,7 +51984,7 @@ static void sp_1024_accumulate_line_dbl_18(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_18(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_18(t1, ty, p1024_mod);
+    sp_1024_mont_div2_18(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_18(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -51987,7 +52004,7 @@ static void sp_1024_accumulate_line_dbl_18(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_18(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_18(t1, t1, p1024_mod);
+    sp_1024_mont_div2_18(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_18(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -52405,7 +52422,7 @@ static void sp_1024_accumulate_line_dbl_n_18(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_18(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_18(t1, ty, p1024_mod);
+        sp_1024_mont_div2_18(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_18(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -52443,7 +52460,7 @@ static void sp_1024_accumulate_line_dbl_n_18(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_18(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_18(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -53280,19 +53297,21 @@ static int sp_1024_ecc_is_point_18(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 18;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_18(t1, point->y);
         (void)sp_1024_mod_18(t1, t1, p1024_mod);
         sp_1024_sqr_18(t2, point->x);
         (void)sp_1024_mod_18(t2, t2, p1024_mod);
         sp_1024_mul_18(t2, t2, point->x);
         (void)sp_1024_mod_18(t2, t2, p1024_mod);
-        (void)sp_1024_sub_18(t2, p1024_mod, t2);
-        sp_1024_mont_add_18(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_18(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_18(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_18(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_18(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_18(t1, p1024_mod);
         sp_1024_cond_sub_18(t1, t1, p1024_mod, ~(n >> 56));
         sp_1024_norm_18(t1);
diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c
index f2103078f..95c7820d7 100644
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -4544,8 +4544,8 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -6492,8 +6492,8 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -7345,8 +7345,8 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[128], t2[65];
     sp_digit div, r1;
@@ -15281,8 +15281,8 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[96], t2[49];
     sp_digit div, r1;
@@ -17813,8 +17813,8 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -19018,8 +19018,8 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[192], t2[97];
     sp_digit div, r1;
@@ -25379,8 +25379,8 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -26936,8 +26936,8 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[256], t2[129];
     sp_digit div, r1;
@@ -30214,84 +30214,6 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "MOV	r11, #0x0\n\t"
-        "ADD	r12, %[a], #0x20\n\t"
-        "\n"
-    "L_sp_256_sub_8_word_%=:\n\t"
-        "RSBS	r11, r11, #0x0\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	r11, r3, r3\n\t"
-        "CMP	%[a], r12\n\t"
-#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
-        "BNE	L_sp_256_sub_8_word_%=\n\t"
-#else
-        "BNE.N	L_sp_256_sub_8_word_%=\n\t"
-#endif
-        "MOV	%[r], r11\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SUBS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -33226,7 +33148,7 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
+static void sp_256_mont_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -33236,7 +33158,6 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
         "LDM	%[a], {r4, r5, r6, r7}\n\t"
         "AND	r3, r4, #0x1\n\t"
         "RSB	r8, r3, #0x0\n\t"
-        "AND	r9, r8, #0x1\n\t"
         "ADDS	r4, r4, r8\n\t"
         "ADCS	r5, r5, r8\n\t"
         "ADCS	r6, r6, r8\n\t"
@@ -33246,7 +33167,7 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
         "LDRD	r6, r7, [%[a], #24]\n\t"
         "ADCS	r4, r4, #0x0\n\t"
         "ADCS	r5, r5, #0x0\n\t"
-        "ADCS	r6, r6, r9\n\t"
+        "ADCS	r6, r6, r8, LSR #31\n\t"
         "ADCS	r7, r7, r8\n\t"
         "MOV	r3, #0x0\n\t"
         "ADC	r3, r3, #0x0\n\t"
@@ -33254,10 +33175,10 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
         "LSR	r9, r5, #1\n\t"
         "LSR	r10, r6, #1\n\t"
         "LSR	r11, r7, #1\n\t"
-        "ORR	r8, r8, r5, lsl #31\n\t"
-        "ORR	r9, r9, r6, lsl #31\n\t"
-        "ORR	r10, r10, r7, lsl #31\n\t"
-        "ORR	r11, r11, r3, lsl #31\n\t"
+        "ORR	r8, r8, r5, LSL #31\n\t"
+        "ORR	r9, r9, r6, LSL #31\n\t"
+        "ORR	r10, r10, r7, LSL #31\n\t"
+        "ORR	r11, r11, r3, LSL #31\n\t"
         "MOV	r3, r4\n\t"
         "STRD	r8, r9, [%[r], #16]\n\t"
         "STRD	r10, r11, [%[r], #24]\n\t"
@@ -33266,10 +33187,10 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
         "LSR	r9, r5, #1\n\t"
         "LSR	r10, r6, #1\n\t"
         "LSR	r11, r7, #1\n\t"
-        "ORR	r8, r8, r5, lsl #31\n\t"
-        "ORR	r9, r9, r6, lsl #31\n\t"
-        "ORR	r10, r10, r7, lsl #31\n\t"
-        "ORR	r11, r11, r3, lsl #31\n\t"
+        "ORR	r8, r8, r5, LSL #31\n\t"
+        "ORR	r9, r9, r6, LSL #31\n\t"
+        "ORR	r10, r10, r7, LSL #31\n\t"
+        "ORR	r11, r11, r3, LSL #31\n\t"
         "STM	%[r], {r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
@@ -33321,7 +33242,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
+    sp_256_mont_div2_8(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -33354,7 +33275,8 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_8_ctx* ctx = (sp_256_proj_point_dbl_8_ctx*)sp_ctx->data;
@@ -33428,7 +33350,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_8(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_8(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -34138,7 +34060,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sub_8(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_8(y, y, p256_mod);
+    sp_256_mont_div2_8(y, y, p256_mod);
 }
 
 /* Convert the projective point to affine.
@@ -34616,8 +34538,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -35036,8 +34958,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap);
@@ -37519,8 +37441,8 @@ static void sp_256_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[16], t2[9];
     sp_digit div, r1;
@@ -38129,6 +38051,84 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
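+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer (8 words) that receives the result.
+ * a  A single precision integer (8 words) to subtract from.
+ * b  A single precision integer (8 words) to subtract.
+ * returns 0 when there is no borrow out and 0xffffffff on borrow. */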
+static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "MOV	r11, #0x0\n\t"
+        "ADD	r12, %[a], #0x20\n\t"
+        "\n"
+    "L_sp_256_sub_8_word_%=:\n\t"
+        "RSBS	r11, r11, #0x0\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "SBC	r11, r3, r3\n\t"
+        "CMP	%[a], r12\n\t"
+#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
+        "BNE	L_sp_256_sub_8_word_%=\n\t"
+#else
+        "BNE.N	L_sp_256_sub_8_word_%=\n\t"
+#endif
+        "MOV	%[r], r11\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
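+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer (8 words) that receives the result.
+ * a  A single precision integer (8 words) to subtract from.
+ * b  A single precision integer (8 words) to subtract.
+ * returns 0 when there is no borrow out and 0xffffffff on borrow. */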
+static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SUBS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "SBC	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
@@ -38390,7 +38390,7 @@ static int sp_256_mod_inv_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_256_cmp_8(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_256_cmp_8(u, v) >= 0))) {
             sp_256_sub_8(u, u, v);
             o = sp_256_sub_8(b, b, d);
             if (o != 0)
@@ -38819,19 +38819,21 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 8;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_8(t1, point->y);
         (void)sp_256_mod_8(t1, t1, p256_mod);
         sp_256_sqr_8(t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
         sp_256_mul_8(t2, t2, point->x);
         (void)sp_256_mod_8(t2, t2, p256_mod);
-        (void)sp_256_sub_8(t2, p256_mod, t2);
-        sp_256_mont_add_8(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_8(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
         sp_256_mont_add_8(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_8(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -41412,91 +41414,6 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "MOV	r11, #0x0\n\t"
-        "ADD	r12, %[a], #0x30\n\t"
-        "\n"
-    "L_sp_384_sub_12_word_%=:\n\t"
-        "RSBS	r11, r11, #0x0\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	r11, r3, r3\n\t"
-        "CMP	%[a], r12\n\t"
-#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
-        "BNE	L_sp_384_sub_12_word_%=\n\t"
-#else
-        "BNE.N	L_sp_384_sub_12_word_%=\n\t"
-#endif
-        "MOV	%[r], r11\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SUBS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -42583,6 +42500,91 @@ static void sp_384_mont_tpl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi
     sp_384_cond_sub_12(r, r, m, 0 - o);
 }
 
+#ifdef WOLFSSL_SP_SMALL
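+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer (12 words) that receives the result.
+ * a  A single precision integer (12 words) to subtract from.
+ * b  A single precision integer (12 words) to subtract.
+ * returns 0 when there is no borrow out and 0xffffffff on borrow. */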
+static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "MOV	r11, #0x0\n\t"
+        "ADD	r12, %[a], #0x30\n\t"
+        "\n"
+    "L_sp_384_sub_12_word_%=:\n\t"
+        "RSBS	r11, r11, #0x0\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "SBC	r11, r3, r3\n\t"
+        "CMP	%[a], r12\n\t"
+#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
+        "BNE	L_sp_384_sub_12_word_%=\n\t"
+#else
+        "BNE.N	L_sp_384_sub_12_word_%=\n\t"
+#endif
+        "MOV	%[r], r11\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
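+/* Sub b from a into r. (r = a - b)
+ *
+ * r  A single precision integer (12 words) that receives the result.
+ * a  A single precision integer (12 words) to subtract from.
+ * b  A single precision integer (12 words) to subtract.
+ * returns 0 when there is no borrow out and 0xffffffff on borrow. */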
+static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SUBS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "SBC	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 #ifdef WOLFSSL_SP_SMALL
 /* Conditionally add a and b using the mask m.
  * m is -1 to add and 0 when not.
@@ -42783,7 +42785,7 @@ static void sp_384_rshift1_12(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_384_mont_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -42836,7 +42838,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
+    sp_384_mont_div2_12(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -42869,7 +42871,8 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_12_ctx* ctx = (sp_384_proj_point_dbl_12_ctx*)sp_ctx->data;
@@ -42943,7 +42946,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_12(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_12(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -43679,7 +43682,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sub_12(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_12(y, y, p384_mod);
+    sp_384_mont_div2_12(y, y, p384_mod);
 }
 
 /* Convert the projective point to affine.
@@ -44173,8 +44176,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -44609,8 +44612,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_12(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_fast_12(r, g, k, map, ct, heap);
@@ -47129,8 +47132,8 @@ static void sp_384_mask_12(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[24], t2[13];
     sp_digit div, r1;
@@ -48019,7 +48022,7 @@ static int sp_384_mod_inv_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_384_cmp_12(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_384_cmp_12(u, v) >= 0))) {
             sp_384_sub_12(u, u, v);
             o = sp_384_sub_12(b, b, d);
             if (o != 0)
@@ -48452,19 +48455,21 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 12;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_12(t1, point->y);
         (void)sp_384_mod_12(t1, t1, p384_mod);
         sp_384_sqr_12(t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
         sp_384_mul_12(t2, t2, point->x);
         (void)sp_384_mod_12(t2, t2, p384_mod);
-        (void)sp_384_sub_12(t2, p384_mod, t2);
-        sp_384_mont_add_12(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_12(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
         sp_384_mont_add_12(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_12(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -52708,107 +52713,6 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
     return (uint32_t)(size_t)r;
 }
 
-#endif /* WOLFSSL_SP_SMALL */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "MOV	r11, #0x0\n\t"
-        "ADD	r12, %[a], #0x40\n\t"
-        "\n"
-    "L_sp_521_sub_17_word_%=:\n\t"
-        "RSBS	r11, r11, #0x0\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	r11, r3, r3\n\t"
-        "CMP	%[a], r12\n\t"
-#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
-        "BNE	L_sp_521_sub_17_word_%=\n\t"
-#else
-        "BNE.N	L_sp_521_sub_17_word_%=\n\t"
-#endif
-        "RSBS	r11, r11, #0x0\n\t"
-        "LDM	%[a]!, {r3}\n\t"
-        "LDM	%[b]!, {r7}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "STM	%[r]!, {r3}\n\t"
-        "SBC	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SUBS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3}\n\t"
-        "LDM	%[b]!, {r7}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "STM	%[r]!, {r3}\n\t"
-        "SBC	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
 #endif /* WOLFSSL_SP_SMALL */
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -53173,41 +53077,41 @@ static void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p, sp_digit m
         /*  0-7 */
         "LDM	lr!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         "LSR	r1, r1, #9\n\t"
-        "ORR	r1, r1, r2, lsl #23\n\t"
+        "ORR	r1, r1, r2, LSL #23\n\t"
         "LSR	r2, r2, #9\n\t"
-        "ORR	r2, r2, r3, lsl #23\n\t"
+        "ORR	r2, r2, r3, LSL #23\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r4, lsl #23\n\t"
+        "ORR	r3, r3, r4, LSL #23\n\t"
         "LSR	r4, r4, #9\n\t"
-        "ORR	r4, r4, r5, lsl #23\n\t"
+        "ORR	r4, r4, r5, LSL #23\n\t"
         "LSR	r5, r5, #9\n\t"
-        "ORR	r5, r5, r6, lsl #23\n\t"
+        "ORR	r5, r5, r6, LSL #23\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r8, lsl #23\n\t"
+        "ORR	r7, r7, r8, LSL #23\n\t"
         "LSR	r8, r8, #9\n\t"
-        "ORR	r8, r8, r9, lsl #23\n\t"
+        "ORR	r8, r8, r9, LSL #23\n\t"
         "STM	r12!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "MOV	r1, r9\n\t"
         /*  8-16 */
         "LDM	lr!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         "LSR	r1, r1, #9\n\t"
-        "ORR	r1, r1, r2, lsl #23\n\t"
+        "ORR	r1, r1, r2, LSL #23\n\t"
         "LSR	r2, r2, #9\n\t"
-        "ORR	r2, r2, r3, lsl #23\n\t"
+        "ORR	r2, r2, r3, LSL #23\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r4, lsl #23\n\t"
+        "ORR	r3, r3, r4, LSL #23\n\t"
         "LSR	r4, r4, #9\n\t"
-        "ORR	r4, r4, r5, lsl #23\n\t"
+        "ORR	r4, r4, r5, LSL #23\n\t"
         "LSR	r5, r5, #9\n\t"
-        "ORR	r5, r5, r6, lsl #23\n\t"
+        "ORR	r5, r5, r6, LSL #23\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r8, lsl #23\n\t"
+        "ORR	r7, r7, r8, LSL #23\n\t"
         "LSR	r8, r8, #9\n\t"
-        "ORR	r8, r8, r9, lsl #23\n\t"
+        "ORR	r8, r8, r9, LSL #23\n\t"
         "LSR	r9, r9, #9\n\t"
         "STM	r12!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
         /* Add top to bottom */
@@ -53463,67 +53367,67 @@ static void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_d
         "LDR	r6, [%[a]]\n\t"
         "LDR	r7, [%[a], #4]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #4]\n\t"
         "LDR	r6, [%[a], #8]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #8]\n\t"
         "LDR	r7, [%[a], #12]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #12]\n\t"
         "LDR	r6, [%[a], #16]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #16]\n\t"
         "LDR	r7, [%[a], #20]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #20]\n\t"
         "LDR	r6, [%[a], #24]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #24]\n\t"
         "LDR	r7, [%[a], #28]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #28]\n\t"
         "LDR	r6, [%[a], #32]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #32]\n\t"
         "LDR	r7, [%[a], #36]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #36]\n\t"
         "LDR	r6, [%[a], #40]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #40]\n\t"
         "LDR	r7, [%[a], #44]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #44]\n\t"
         "LDR	r6, [%[a], #48]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #48]\n\t"
         "LDR	r7, [%[a], #52]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #52]\n\t"
         "LDR	r6, [%[a], #56]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #56]\n\t"
         "LDR	r7, [%[a], #60]\n\t"
         "LSR	r6, r6, #9\n\t"
-        "ORR	r6, r6, r7, lsl #23\n\t"
+        "ORR	r6, r6, r7, LSL #23\n\t"
         "STR	r6, [%[a], #60]\n\t"
         "LDR	r6, [%[a], #64]\n\t"
         "LSR	r7, r7, #9\n\t"
-        "ORR	r7, r7, r6, lsl #23\n\t"
+        "ORR	r7, r7, r6, LSL #23\n\t"
         "STR	r7, [%[a], #64]\n\t"
         "LSR	r6, r6, #9\n\t"
         "STR	r6, [%[a], #68]\n\t"
@@ -53678,67 +53582,67 @@ static void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_d
         "LDR	r12, [%[a]]\n\t"
         "LDR	r3, [%[a], #4]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #4]\n\t"
         "LDR	r12, [%[a], #8]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #8]\n\t"
         "LDR	r3, [%[a], #12]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #12]\n\t"
         "LDR	r12, [%[a], #16]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #16]\n\t"
         "LDR	r3, [%[a], #20]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #20]\n\t"
         "LDR	r12, [%[a], #24]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #24]\n\t"
         "LDR	r3, [%[a], #28]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #28]\n\t"
         "LDR	r12, [%[a], #32]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #32]\n\t"
         "LDR	r3, [%[a], #36]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #36]\n\t"
         "LDR	r12, [%[a], #40]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #40]\n\t"
         "LDR	r3, [%[a], #44]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #44]\n\t"
         "LDR	r12, [%[a], #48]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #48]\n\t"
         "LDR	r3, [%[a], #52]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #52]\n\t"
         "LDR	r12, [%[a], #56]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #56]\n\t"
         "LDR	r3, [%[a], #60]\n\t"
         "LSR	r12, r12, #9\n\t"
-        "ORR	r12, r12, r3, lsl #23\n\t"
+        "ORR	r12, r12, r3, LSL #23\n\t"
         "STR	r12, [%[a], #60]\n\t"
         "LDR	r12, [%[a], #64]\n\t"
         "LSR	r3, r3, #9\n\t"
-        "ORR	r3, r3, r12, lsl #23\n\t"
+        "ORR	r3, r3, r12, LSL #23\n\t"
         "STR	r3, [%[a], #64]\n\t"
         "LSR	r12, r12, #9\n\t"
         "STR	r12, [%[a], #68]\n\t"
@@ -54578,7 +54482,7 @@ static void sp_521_rshift1_17(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_521_mont_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o = a[0] & 1;
 
@@ -54632,7 +54536,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
+    sp_521_mont_div2_17(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -54665,7 +54569,8 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_17_ctx* ctx = (sp_521_proj_point_dbl_17_ctx*)sp_ctx->data;
@@ -54739,7 +54644,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_17(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_17(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -55512,7 +55417,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sub_17(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_17(y, y, p521_mod);
+    sp_521_mont_div2_17(y, y, p521_mod);
 }
 
 /* Convert the projective point to affine.
@@ -56026,8 +55931,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -56482,8 +56387,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_17(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_fast_17(r, g, k, map, ct, heap);
@@ -60029,8 +59934,8 @@ static void sp_521_mask_17(sp_digit* r, const sp_digit* a, sp_digit m)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[35];
     sp_digit t2[18];
@@ -60635,6 +60540,107 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W
 #endif /* HAVE_ECC_SIGN */
 
 #ifndef WOLFSSL_SP_SMALL
+#ifdef WOLFSSL_SP_SMALL
+/* Sub b from a into r. (r = a - b) Returns 0, or 0xffffffff on borrow.
+ *
+ * r  Result of the subtraction; 17 words are written.
+ * a  Number to subtract from; 17 words are read.
+ * b  Number to subtract; 17 words are read.
+ */
+static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "MOV	r11, #0x0\n\t"
+        "ADD	r12, %[a], #0x40\n\t"
+        "\n"
+    "L_sp_521_sub_17_word_%=:\n\t"
+        "RSBS	r11, r11, #0x0\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "SBC	r11, r3, r3\n\t"
+        "CMP	%[a], r12\n\t"
+#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
+        "BNE	L_sp_521_sub_17_word_%=\n\t"
+#else
+        "BNE.N	L_sp_521_sub_17_word_%=\n\t"
+#endif
+        "RSBS	r11, r11, #0x0\n\t"
+        "LDM	%[a]!, {r3}\n\t"
+        "LDM	%[b]!, {r7}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "STM	%[r]!, {r3}\n\t"
+        "SBC	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#else
+/* Sub b from a into r. (r = a - b) Returns 0, or 0xffffffff on borrow.
+ *
+ * r  Result of the subtraction; 17 words are written.
+ * a  Number to subtract from; 17 words are read.
+ * b  Number to subtract; 17 words are read.
+ */
+static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+{
+    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
+    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
+    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
+
+    __asm__ __volatile__ (
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SUBS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "SBCS	r4, r4, r8\n\t"
+        "SBCS	r5, r5, r9\n\t"
+        "SBCS	r6, r6, r10\n\t"
+        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
+        "LDM	%[a]!, {r3}\n\t"
+        "LDM	%[b]!, {r7}\n\t"
+        "SBCS	r3, r3, r7\n\t"
+        "STM	%[r]!, {r3}\n\t"
+        "SBC	%[r], r6, r6\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+    );
+    return (uint32_t)(size_t)r;
+}
+
+#endif /* WOLFSSL_SP_SMALL */
 /* Divide the number by 2 mod the modulus. (r = a / 2 % m)
  *
  * r  Result of division by 2.
@@ -61048,7 +61054,7 @@ static int sp_521_mod_inv_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_521_cmp_17(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_521_cmp_17(u, v) >= 0))) {
             sp_521_sub_17(u, u, v);
             o = sp_521_sub_17(b, b, d);
             if (o != 0)
@@ -61493,19 +61499,21 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 17;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_17(t1, point->y);
         (void)sp_521_mod_17(t1, t1, p521_mod);
         sp_521_sqr_17(t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
         sp_521_mul_17(t2, t2, point->x);
         (void)sp_521_mod_17(t2, t2, p521_mod);
-        (void)sp_521_sub_17(t2, p521_mod, t2);
-        sp_521_mont_add_17(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_17(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3  */
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
         sp_521_mont_add_17(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_17(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -66643,8 +66651,8 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
  * r  Remainder from the division.
  * returns MP_OKAY indicating success.
  */
-static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
-        sp_digit* r)
+static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d,
+        sp_digit* m, sp_digit* r)
 {
     sp_digit t1[64], t2[33];
     sp_digit div, r1;
@@ -68750,7 +68758,7 @@ static void sp_1024_rshift1_32(sp_digit* r_p, const sp_digit* a_p)
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_1024_mont_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
     sp_digit o;
 
@@ -68803,7 +68811,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
+    sp_1024_mont_div2_32(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -68836,7 +68844,8 @@ typedef struct sp_1024_proj_point_dbl_32_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_32_ctx* ctx = (sp_1024_proj_point_dbl_32_ctx*)sp_ctx->data;
@@ -68910,7 +68919,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_32(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_32(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -68960,126 +68969,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-#ifdef WOLFSSL_SP_SMALL
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "MOV	r11, #0x0\n\t"
-        "ADD	r12, %[a], #0x80\n\t"
-        "\n"
-    "L_sp_1024_sub_32_word_%=:\n\t"
-        "RSBS	r11, r11, #0x0\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	r11, r3, r3\n\t"
-        "CMP	%[a], r12\n\t"
-#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
-        "BNE	L_sp_1024_sub_32_word_%=\n\t"
-#else
-        "BNE.N	L_sp_1024_sub_32_word_%=\n\t"
-#endif
-        "MOV	%[r], r11\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#else
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
-{
-    register sp_digit* r asm ("r0") = (sp_digit*)r_p;
-    register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
-    register const sp_digit* b asm ("r2") = (const sp_digit*)b_p;
-
-    __asm__ __volatile__ (
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SUBS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
-        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
-        "SBCS	r3, r3, r7\n\t"
-        "SBCS	r4, r4, r8\n\t"
-        "SBCS	r5, r5, r9\n\t"
-        "SBCS	r6, r6, r10\n\t"
-        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
-        "SBC	%[r], r6, r6\n\t"
-        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
-        :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
-    );
-    return (uint32_t)(size_t)r;
-}
-
-#endif /* WOLFSSL_SP_SMALL */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -69639,7 +69528,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_32(y, y, p1024_mod);
+    sp_1024_mont_div2_32(y, y, p1024_mod);
 }
 
 /* Convert the projective point to affine.
@@ -70052,8 +69941,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -70407,8 +70296,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap);
@@ -76562,7 +76451,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_32(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_32(t1, ty, p1024_mod);
+    sp_1024_mont_div2_32(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -76582,7 +76471,7 @@ static void sp_1024_accumulate_line_dbl_32(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_32(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_32(t1, t1, p1024_mod);
+    sp_1024_mont_div2_32(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_32(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -77000,7 +76889,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_32(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_32(t1, ty, p1024_mod);
+        sp_1024_mont_div2_32(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_32(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -77038,7 +76927,7 @@ static void sp_1024_accumulate_line_dbl_n_32(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_32(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_32(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -77878,19 +77767,21 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 32;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_32(t1, point->y);
         (void)sp_1024_mod_32(t1, t1, p1024_mod);
         sp_1024_sqr_32(t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
         sp_1024_mul_32(t2, t2, point->x);
         (void)sp_1024_mod_32(t2, t2, p1024_mod);
-        (void)sp_1024_sub_32(t2, p1024_mod, t2);
-        sp_1024_mont_add_32(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_32(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3  */
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_32(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_32(t1, p1024_mod);
         sp_1024_cond_sub_32(t1, t1, p1024_mod, ~(n >> 31));
         sp_1024_norm_32(t1);
diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c
index 022df827c..e9ffed422 100644
--- a/wolfcrypt/src/sp_int.c
+++ b/wolfcrypt/src/sp_int.c
@@ -5495,8 +5495,8 @@ int sp_cmp(const sp_int* a, const sp_int* b)
  *************************/
 
 #if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
-    (defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)) || \
-    defined(OPENSSL_EXTRA)
+    ((defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_SP_SM2)) && \
+     defined(HAVE_ECC)) || defined(OPENSSL_EXTRA)
 /* Check if a bit is set
  *
  * When a is NULL, result is 0.
diff --git a/wolfcrypt/src/sp_sm2_arm32.c b/wolfcrypt/src/sp_sm2_arm32.c
new file mode 100644
index 000000000..211b14392
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_arm32.c
@@ -0,0 +1,33 @@
+/* sp_sm2_arm32.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_arm64.c b/wolfcrypt/src/sp_sm2_arm64.c
new file mode 100644
index 000000000..5c84948a0
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_arm64.c
@@ -0,0 +1,33 @@
+/* sp_sm2_arm64.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_armthumb.c b/wolfcrypt/src/sp_sm2_armthumb.c
new file mode 100644
index 000000000..5d26e27be
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_armthumb.c
@@ -0,0 +1,33 @@
+/* sp_sm2_armthumb.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_c32.c b/wolfcrypt/src/sp_sm2_c32.c
new file mode 100644
index 000000000..41c40d1ef
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_c32.c
@@ -0,0 +1,33 @@
+/* sp_sm2_c32.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_c64.c b/wolfcrypt/src/sp_sm2_c64.c
new file mode 100644
index 000000000..ee3801654
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_c64.c
@@ -0,0 +1,33 @@
+/* sp_sm2_c64.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_cortexm.c b/wolfcrypt/src/sp_sm2_cortexm.c
new file mode 100644
index 000000000..3bda85f02
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_cortexm.c
@@ -0,0 +1,33 @@
+/* sp_sm2_cortexm.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_x86_64.c b/wolfcrypt/src/sp_sm2_x86_64.c
new file mode 100644
index 000000000..f73e40834
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_x86_64.c
@@ -0,0 +1,33 @@
+/* sp_sm2_x86_64.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_sm2_x86_64_asm.S b/wolfcrypt/src/sp_sm2_x86_64_asm.S
new file mode 100644
index 000000000..6ddc3c77e
--- /dev/null
+++ b/wolfcrypt/src/sp_sm2_x86_64_asm.S
@@ -0,0 +1,33 @@
+/* sp_sm2_x86_64_asm.S
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_SM2
+
+#error "See https://github.com/wolfSSL/wolfsm for implementation of this file"
+
+#endif
+
diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c
index bf106d28f..916a32fbf 100644
--- a/wolfcrypt/src/sp_x86_64.c
+++ b/wolfcrypt/src/sp_x86_64.c
@@ -8600,7 +8600,7 @@ extern void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_256_mont_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -8655,7 +8655,7 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_4(t2, t2, p256_mod);
+    sp_256_mont_div2_4(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -8685,7 +8685,8 @@ typedef struct sp_256_proj_point_dbl_4_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_4_ctx* ctx = (sp_256_proj_point_dbl_4_ctx*)sp_ctx->data;
@@ -8759,7 +8760,7 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_4(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_4(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -8889,7 +8890,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     sp_256_mont_sub_4(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_4(y, y, p256_mod);
+    sp_256_mont_div2_4(y, y, p256_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -9268,7 +9269,7 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, y, t1, p256_mod);
         /* Y = Y/2 */
-        sp_256_div2_4(r[j].y, y, p256_mod);
+        sp_256_mont_div2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
     }
 }
@@ -9698,7 +9699,7 @@ extern sp_digit sp_256_cond_sub_avx2_4(sp_digit* r, const sp_digit* a, const sp_
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_256_mont_reduce_avx2_order_4(sp_digit* a, const sp_digit* m, sp_digit mp);
+extern void sp_256_mont_reduce_order_avx2_4(sp_digit* a, const sp_digit* m, sp_digit mp);
 #ifdef __cplusplus
 }
 #endif
@@ -9749,7 +9750,7 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_256_mont_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -9798,7 +9799,7 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
     /* T2 = Y * Y */
     sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod);
     /* T2 = T2/2 */
-    sp_256_div2_avx2_4(t2, t2, p256_mod);
+    sp_256_mont_div2_avx2_4(t2, t2, p256_mod);
     /* Y = Y * X */
     sp_256_mont_mul_avx2_4(y, y, p->x, p256_mod, p256_mp_mod);
     /* X = T1 * T1 */
@@ -9828,7 +9829,8 @@ typedef struct sp_256_proj_point_dbl_avx2_4_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
+static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
+        const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_256_proj_point_dbl_avx2_4_ctx* ctx = (sp_256_proj_point_dbl_avx2_4_ctx*)sp_ctx->data;
@@ -9902,7 +9904,7 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_256_div2_avx2_4(ctx->t2, ctx->t2, p256_mod);
+        sp_256_mont_div2_avx2_4(ctx->t2, ctx->t2, p256_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -10032,7 +10034,7 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
     sp_256_mont_sub_avx2_4(y, y, t1, p256_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_256_div2_avx2_4(y, y, p256_mod);
+    sp_256_mont_div2_avx2_4(y, y, p256_mod);
 }
 
 
@@ -10387,7 +10389,7 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
         sp_256_mont_mul_avx2_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(y, y, t1, p256_mod);
         /* Y = Y/2 */
-        sp_256_div2_avx2_4(r[j].y, y, p256_mod);
+        sp_256_mont_div2_avx2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
     }
 }
@@ -11053,8 +11055,8 @@ static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap);
@@ -11434,8 +11436,8 @@ static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point_256* r, const sp_point_256*
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_256_ecc_mulmod_avx2_4(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_256_ecc_mulmod_avx2_4(sp_point_256* r, const sp_point_256* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_256_ecc_mulmod_win_add_sub_avx2_4(r, g, k, map, ct, heap);
@@ -26417,19 +26419,21 @@ static int sp_256_ecc_is_point_4(const sp_point_256* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 4;
 
+        /* y^2 - x^3 - a.x = b */
         sp_256_sqr_4(t1, point->y);
         (void)sp_256_mod_4(t1, t1, p256_mod);
         sp_256_sqr_4(t2, point->x);
         (void)sp_256_mod_4(t2, t2, p256_mod);
         sp_256_mul_4(t2, t2, point->x);
         (void)sp_256_mod_4(t2, t2, p256_mod);
-        (void)sp_256_sub_4(t2, p256_mod, t2);
-        sp_256_mont_add_4(t1, t1, t2, p256_mod);
+        sp_256_mont_sub_4(t1, t1, t2, p256_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
         sp_256_mont_add_4(t1, t1, point->x, p256_mod);
 
+
         if (sp_256_cmp_4(t1, p256_b) != 0) {
             err = MP_VAL;
         }
@@ -27717,7 +27721,7 @@ extern void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_384_mont_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -27765,7 +27769,7 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_6(t2, t2, p384_mod);
+    sp_384_mont_div2_6(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -27798,7 +27802,8 @@ typedef struct sp_384_proj_point_dbl_6_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_6_ctx* ctx = (sp_384_proj_point_dbl_6_ctx*)sp_ctx->data;
@@ -27872,7 +27877,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_6(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_6(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -28010,7 +28015,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     sp_384_mont_sub_6(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_6(y, y, p384_mod);
+    sp_384_mont_div2_6(y, y, p384_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -28396,7 +28401,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_6(y, y, t1, p384_mod);
         /* Y = Y/2 */
-        sp_384_div2_6(r[j].y, y, p384_mod);
+        sp_384_mont_div2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
     }
 }
@@ -28913,7 +28918,7 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_384_div2_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_384_mont_div2_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -28961,7 +28966,7 @@ static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p,
     /* T2 = Y * Y */
     sp_384_mont_sqr_avx2_6(t2, y, p384_mod, p384_mp_mod);
     /* T2 = T2/2 */
-    sp_384_div2_avx2_6(t2, t2, p384_mod);
+    sp_384_mont_div2_avx2_6(t2, t2, p384_mod);
     /* Y = Y * X */
     sp_384_mont_mul_avx2_6(y, y, p->x, p384_mod, p384_mp_mod);
     /* X = T1 * T1 */
@@ -28994,7 +28999,8 @@ typedef struct sp_384_proj_point_dbl_avx2_6_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
+static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
+        const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_384_proj_point_dbl_avx2_6_ctx* ctx = (sp_384_proj_point_dbl_avx2_6_ctx*)sp_ctx->data;
@@ -29068,7 +29074,7 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_384_div2_avx2_6(ctx->t2, ctx->t2, p384_mod);
+        sp_384_mont_div2_avx2_6(ctx->t2, ctx->t2, p384_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -29206,7 +29212,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
     sp_384_mont_sub_avx2_6(y, y, t1, p384_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_384_div2_avx2_6(y, y, p384_mod);
+    sp_384_mont_div2_avx2_6(y, y, p384_mod);
 }
 
 
@@ -29568,7 +29574,7 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r,
         sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_avx2_6(y, y, t1, p384_mod);
         /* Y = Y/2 */
-        sp_384_div2_avx2_6(r[j].y, y, p384_mod);
+        sp_384_mont_div2_avx2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
     }
 }
@@ -30237,8 +30243,8 @@ static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_6(r, g, k, map, ct, heap);
@@ -30621,8 +30627,8 @@ static int sp_384_ecc_mulmod_stripe_avx2_6(sp_point_384* r, const sp_point_384*
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_384_ecc_mulmod_avx2_6(sp_point_384* r, const sp_point_384* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_384_ecc_mulmod_avx2_6(sp_point_384* r, const sp_point_384* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_384_ecc_mulmod_win_add_sub_avx2_6(r, g, k, map, ct, heap);
@@ -50861,7 +50867,7 @@ static int sp_384_mod_inv_6(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_384_cmp_6(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_384_cmp_6(u, v) >= 0))) {
             sp_384_sub_6(u, u, v);
             o = sp_384_sub_6(b, b, d);
             if (o != 0)
@@ -51360,19 +51366,21 @@ static int sp_384_ecc_is_point_6(const sp_point_384* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 6;
 
+        /* y^2 - x^3 - a.x = b */
         sp_384_sqr_6(t1, point->y);
         (void)sp_384_mod_6(t1, t1, p384_mod);
         sp_384_sqr_6(t2, point->x);
         (void)sp_384_mod_6(t2, t2, p384_mod);
         sp_384_mul_6(t2, t2, point->x);
         (void)sp_384_mod_6(t2, t2, p384_mod);
-        (void)sp_384_sub_6(t2, p384_mod, t2);
-        sp_384_mont_add_6(t1, t1, t2, p384_mod);
+        sp_384_mont_sub_6(t1, t1, t2, p384_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
         sp_384_mont_add_6(t1, t1, point->x, p384_mod);
 
+
         if (sp_384_cmp_6(t1, p384_b) != 0) {
             err = MP_VAL;
         }
@@ -52612,7 +52620,7 @@ extern void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_521_mont_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -52660,7 +52668,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
+    sp_521_mont_div2_9(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -52693,7 +52701,8 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_9_ctx* ctx = (sp_521_proj_point_dbl_9_ctx*)sp_ctx->data;
@@ -52767,7 +52776,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_9(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_9(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -52905,7 +52914,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sub_9(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_9(y, y, p521_mod);
+    sp_521_mont_div2_9(y, y, p521_mod);
 }
 
 /* Compare two numbers to determine if they are equal.
@@ -53293,7 +53302,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
         /* Y = Y/2 */
-        sp_521_div2_9(r[j].y, y, p521_mod);
+        sp_521_mont_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
     }
 }
@@ -53787,7 +53796,7 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p,
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_521_div2_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_521_mont_div2_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -53835,7 +53844,7 @@ static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p,
     /* T2 = Y * Y */
     sp_521_mont_sqr_avx2_9(t2, y, p521_mod, p521_mp_mod);
     /* T2 = T2/2 */
-    sp_521_div2_avx2_9(t2, t2, p521_mod);
+    sp_521_mont_div2_avx2_9(t2, t2, p521_mod);
     /* Y = Y * X */
     sp_521_mont_mul_avx2_9(y, y, p->x, p521_mod, p521_mp_mod);
     /* X = T1 * T1 */
@@ -53868,7 +53877,8 @@ typedef struct sp_521_proj_point_dbl_avx2_9_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
+static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
+        const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_521_proj_point_dbl_avx2_9_ctx* ctx = (sp_521_proj_point_dbl_avx2_9_ctx*)sp_ctx->data;
@@ -53942,7 +53952,7 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_521_div2_avx2_9(ctx->t2, ctx->t2, p521_mod);
+        sp_521_mont_div2_avx2_9(ctx->t2, ctx->t2, p521_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -54080,7 +54090,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
     sp_521_mont_sub_avx2_9(y, y, t1, p521_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_521_div2_avx2_9(y, y, p521_mod);
+    sp_521_mont_div2_avx2_9(y, y, p521_mod);
 }
 
 
@@ -54442,7 +54452,7 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r,
         sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_avx2_9(y, y, t1, p521_mod);
         /* Y = Y/2 */
-        sp_521_div2_avx2_9(r[j].y, y, p521_mod);
+        sp_521_mont_div2_avx2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
     }
 }
@@ -55111,8 +55121,8 @@ static void sp_ecc_get_cache_521(const sp_point_521* g, sp_cache_521_t** cache)
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
@@ -55495,8 +55505,8 @@ static int sp_521_ecc_mulmod_stripe_avx2_9(sp_point_521* r, const sp_point_521*
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_521_ecc_mulmod_avx2_9(sp_point_521* r, const sp_point_521* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_521_ecc_mulmod_avx2_9(sp_point_521* r, const sp_point_521* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_521_ecc_mulmod_win_add_sub_avx2_9(r, g, k, map, ct, heap);
@@ -91985,7 +91995,7 @@ static int sp_521_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
     }
 
     while (ut > 1 && vt > 1) {
-        if (ut > vt || (ut == vt && sp_521_cmp_9(u, v) >= 0)) {
+        if ((ut > vt) || ((ut == vt) && (sp_521_cmp_9(u, v) >= 0))) {
             sp_521_sub_9(u, u, v);
             o = sp_521_sub_9(b, b, d);
             if (o != 0)
@@ -92494,19 +92504,21 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 9;
 
+        /* y^2 - x^3 - a.x = b */
         sp_521_sqr_9(t1, point->y);
         (void)sp_521_mod_9(t1, t1, p521_mod);
         sp_521_sqr_9(t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
         sp_521_mul_9(t2, t2, point->x);
         (void)sp_521_mod_9(t2, t2, p521_mod);
-        (void)sp_521_sub_9(t2, p521_mod, t2);
-        sp_521_mont_add_9(t1, t1, t2, p521_mod);
+        sp_521_mont_sub_9(t1, t1, t2, p521_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
         sp_521_mont_add_9(t1, t1, point->x, p521_mod);
 
+
         if (sp_521_cmp_9(t1, p521_b) != 0) {
             err = MP_VAL;
         }
@@ -93862,7 +93874,7 @@ extern void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit*
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_1024_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_1024_mont_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -93910,7 +93922,7 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
     /* T2 = Y * Y */
     sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_16(t2, t2, p1024_mod);
+    sp_1024_mont_div2_16(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -93943,7 +93955,8 @@ typedef struct sp_1024_proj_point_dbl_16_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_16_ctx* ctx = (sp_1024_proj_point_dbl_16_ctx*)sp_ctx->data;
@@ -94017,7 +94030,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_16(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_16(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -94155,16 +94168,9 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     sp_1024_mont_sub_16(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_16(y, y, p1024_mod);
+    sp_1024_mont_div2_16(y, y, p1024_mod);
 }
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-extern sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b);
-#ifdef __cplusplus
-}
-#endif
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -94553,7 +94559,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_16(y, y, t1, p1024_mod);
         /* Y = Y/2 */
-        sp_1024_div2_16(r[j].y, y, p1024_mod);
+        sp_1024_mont_div2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
     }
 }
@@ -95017,7 +95023,7 @@ extern void sp_1024_mont_sub_avx2_16(sp_digit* r, const sp_digit* a, const sp_di
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_1024_div2_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m);
+extern void sp_1024_mont_div2_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -95065,7 +95071,7 @@ static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024
     /* T2 = Y * Y */
     sp_1024_mont_sqr_avx2_16(t2, y, p1024_mod, p1024_mp_mod);
     /* T2 = T2/2 */
-    sp_1024_div2_avx2_16(t2, t2, p1024_mod);
+    sp_1024_mont_div2_avx2_16(t2, t2, p1024_mod);
     /* Y = Y * X */
     sp_1024_mont_mul_avx2_16(y, y, p->x, p1024_mod, p1024_mp_mod);
     /* X = T1 * T1 */
@@ -95098,7 +95104,8 @@ typedef struct sp_1024_proj_point_dbl_avx2_16_ctx {
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
+        const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
     sp_1024_proj_point_dbl_avx2_16_ctx* ctx = (sp_1024_proj_point_dbl_avx2_16_ctx*)sp_ctx->data;
@@ -95172,7 +95179,7 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024
         break;
     case 11:
         /* T2 = T2/2 */
-        sp_1024_div2_avx2_16(ctx->t2, ctx->t2, p1024_mod);
+        sp_1024_mont_div2_avx2_16(ctx->t2, ctx->t2, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
@@ -95310,7 +95317,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
     sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod);
 #endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
-    sp_1024_div2_avx2_16(y, y, p1024_mod);
+    sp_1024_mont_div2_avx2_16(y, y, p1024_mod);
 }
 
 
@@ -95672,7 +95679,7 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r,
         sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod);
         /* Y = Y/2 */
-        sp_1024_div2_avx2_16(r[j].y, y, p1024_mod);
+        sp_1024_mont_div2_avx2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
     }
 }
@@ -96314,8 +96321,8 @@ static void sp_ecc_get_cache_1024(const sp_point_1024* g, sp_cache_1024_t** cach
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_win_add_sub_16(r, g, k, map, ct, heap);
@@ -96681,8 +96688,8 @@ static int sp_1024_ecc_mulmod_stripe_avx2_16(sp_point_1024* r, const sp_point_10
  * heap  Heap to use for allocation.
  * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
  */
-static int sp_1024_ecc_mulmod_avx2_16(sp_point_1024* r, const sp_point_1024* g, const sp_digit* k,
-        int map, int ct, void* heap)
+static int sp_1024_ecc_mulmod_avx2_16(sp_point_1024* r, const sp_point_1024* g,
+        const sp_digit* k, int map, int ct, void* heap)
 {
 #ifndef FP_ECC
     return sp_1024_ecc_mulmod_win_add_sub_avx2_16(r, g, k, map, ct, heap);
@@ -102699,7 +102706,7 @@ static void sp_1024_accumulate_line_dbl_16(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_16(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_16(t1, ty, p1024_mod);
+    sp_1024_mont_div2_16(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_16(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -102719,7 +102726,7 @@ static void sp_1024_accumulate_line_dbl_16(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_16(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_16(t1, t1, p1024_mod);
+    sp_1024_mont_div2_16(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_16(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -103137,7 +103144,7 @@ static void sp_1024_accumulate_line_dbl_n_16(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_16(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_16(t1, ty, p1024_mod);
+        sp_1024_mont_div2_16(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_16(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -103175,7 +103182,7 @@ static void sp_1024_accumulate_line_dbl_n_16(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_16(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_16(p->y, p->y, p1024_mod);
 }
 
 /* Operations to perform based on order - 1.
@@ -103500,7 +103507,7 @@ static void sp_1024_accumulate_line_dbl_avx2_16(sp_digit* vx, sp_digit* vy,
     /* ty = 4 * p.y ^ 2 */
     sp_1024_mont_sqr_avx2_16(ty, ry, p1024_mod, p1024_mp_mod);
     /* t1 = 2 * p.y ^ 2 */
-    sp_1024_div2_avx2_16(t1, ty, p1024_mod);
+    sp_1024_mont_div2_avx2_16(t1, ty, p1024_mod);
     /* r.x -= 2 * (p.y ^ 2) */
     sp_1024_mont_sub_avx2_16(rx, rx, t1, p1024_mod);
     /* p'.z = p.y * 2 * p.z */
@@ -103520,7 +103527,7 @@ static void sp_1024_accumulate_line_dbl_avx2_16(sp_digit* vx, sp_digit* vy,
     /* t1 = (4 * p.y^2) ^ 2 = 16 * p.y^4 */
     sp_1024_mont_sqr_avx2_16(t1, ty, p1024_mod, p1024_mp_mod);
     /* t1 = 16 * p.y^4 / 2 = 8 * p.y^4 */
-    sp_1024_div2_avx2_16(t1, t1, p1024_mod);
+    sp_1024_mont_div2_avx2_16(t1, t1, p1024_mod);
     /* p'.y = 4 * p.y^2 * p.x */
     sp_1024_mont_mul_avx2_16(p->y, ty, p->x, p1024_mod, p1024_mp_mod);
     /* p'.x = l^2 */
@@ -103938,7 +103945,7 @@ static void sp_1024_accumulate_line_dbl_n_avx2_16(sp_digit* vx, sp_digit* vy,
         /* ty = py ^ 2 */
         sp_1024_mont_sqr_avx2_16(ty, p->y, p1024_mod, p1024_mp_mod);
         /* t1 = py ^ 2 / 2 */
-        sp_1024_div2_avx2_16(t1, ty, p1024_mod);
+        sp_1024_mont_div2_avx2_16(t1, ty, p1024_mod);
         /* r.x -= py ^ 2 / 2 */
         sp_1024_mont_sub_avx2_16(rx, rx, t1, p1024_mod);
         /* p'.z = py * pz */
@@ -103976,7 +103983,7 @@ static void sp_1024_accumulate_line_dbl_n_avx2_16(sp_digit* vx, sp_digit* vy,
     }
 
     /* p'.y = py' / 2 */
-    sp_1024_div2_avx2_16(p->y, p->y, p1024_mod);
+    sp_1024_mont_div2_avx2_16(p->y, p->y, p1024_mod);
 }
 
 /*
@@ -105429,19 +105436,21 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point,
     if (err == MP_OKAY) {
         t2 = t1 + 2 * 16;
 
+        /* y^2 - x^3 - a.x = b */
         sp_1024_sqr_16(t1, point->y);
         (void)sp_1024_mod_16(t1, t1, p1024_mod);
         sp_1024_sqr_16(t2, point->x);
         (void)sp_1024_mod_16(t2, t2, p1024_mod);
         sp_1024_mul_16(t2, t2, point->x);
         (void)sp_1024_mod_16(t2, t2, p1024_mod);
-        (void)sp_1024_sub_16(t2, p1024_mod, t2);
-        sp_1024_mont_add_16(t1, t1, t2, p1024_mod);
+        sp_1024_mont_sub_16(t1, t1, t2, p1024_mod);
 
+        /* y^2 - x^3 + 3.x = b, when a = -3 */
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
         sp_1024_mont_add_16(t1, t1, point->x, p1024_mod);
 
+
         n = sp_1024_cmp_16(t1, p1024_mod);
         sp_1024_cond_sub_16(t1, t1, p1024_mod, ~(n >> 63));
         sp_1024_norm_16(t1);
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index 310f4611d..4c423bc82 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -56743,26 +56743,25 @@ _sp_256_mont_sub_4:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_256_div2_4
-.type	sp_256_div2_4,@function
+.globl	sp_256_mont_div2_4
+.type	sp_256_mont_div2_4,@function
 .align	16
-sp_256_div2_4:
+sp_256_mont_div2_4:
 #else
 .section	__TEXT,__text
-.globl	_sp_256_div2_4
+.globl	_sp_256_mont_div2_4
 .p2align	4
-_sp_256_div2_4:
+_sp_256_mont_div2_4:
 #endif /* __APPLE__ */
         movq	(%rsi), %rdx
         movq	8(%rsi), %rax
         movq	16(%rsi), %rcx
         movq	24(%rsi), %r8
-        movq	$0xffffffff, %r9
         movq	$0xffffffff00000001, %r10
         movq	%rdx, %r11
         andq	$0x01, %r11
         negq	%r11
-        andq	%r11, %r9
+        movl	%r11d, %r9d
         andq	%r11, %r10
         addq	%r11, %rdx
         adcq	%r9, %rax
@@ -56780,7 +56779,7 @@ _sp_256_div2_4:
         movq	%r8, 24(%rdi)
         repz retq
 #ifndef __APPLE__
-.size	sp_256_div2_4,.-sp_256_div2_4
+.size	sp_256_mont_div2_4,.-sp_256_mont_div2_4
 #endif /* __APPLE__ */
 /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m).
  *
@@ -57424,15 +57423,15 @@ _sp_256_cond_sub_avx2_4:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_256_mont_reduce_avx2_order_4
-.type	sp_256_mont_reduce_avx2_order_4,@function
+.globl	sp_256_mont_reduce_order_avx2_4
+.type	sp_256_mont_reduce_order_avx2_4,@function
 .align	16
-sp_256_mont_reduce_avx2_order_4:
+sp_256_mont_reduce_order_avx2_4:
 #else
 .section	__TEXT,__text
-.globl	_sp_256_mont_reduce_avx2_order_4
+.globl	_sp_256_mont_reduce_order_avx2_4
 .p2align	4
-_sp_256_mont_reduce_avx2_order_4:
+_sp_256_mont_reduce_order_avx2_4:
 #endif /* __APPLE__ */
         pushq	%r12
         pushq	%r13
@@ -57576,7 +57575,7 @@ _sp_256_mont_reduce_avx2_order_4:
         popq	%r12
         repz retq
 #ifndef __APPLE__
-.size	sp_256_mont_reduce_avx2_order_4,.-sp_256_mont_reduce_avx2_order_4
+.size	sp_256_mont_reduce_order_avx2_4,.-sp_256_mont_reduce_order_avx2_4
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
 #ifdef HAVE_INTEL_AVX2
@@ -57588,26 +57587,25 @@ _sp_256_mont_reduce_avx2_order_4:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_256_div2_avx2_4
-.type	sp_256_div2_avx2_4,@function
+.globl	sp_256_mont_div2_avx2_4
+.type	sp_256_mont_div2_avx2_4,@function
 .align	16
-sp_256_div2_avx2_4:
+sp_256_mont_div2_avx2_4:
 #else
 .section	__TEXT,__text
-.globl	_sp_256_div2_avx2_4
+.globl	_sp_256_mont_div2_avx2_4
 .p2align	4
-_sp_256_div2_avx2_4:
+_sp_256_mont_div2_avx2_4:
 #endif /* __APPLE__ */
         movq	(%rsi), %rdx
         movq	8(%rsi), %rax
         movq	16(%rsi), %rcx
         movq	24(%rsi), %r8
-        movq	$0xffffffff, %r9
         movq	$0xffffffff00000001, %r10
         movq	%rdx, %r11
         andq	$0x01, %r11
         negq	%r11
-        andq	%r11, %r9
+        movl	%r11d, %r9d
         andq	%r11, %r10
         addq	%r11, %rdx
         adcq	%r9, %rax
@@ -57625,7 +57623,7 @@ _sp_256_div2_avx2_4:
         movq	%r8, 24(%rdi)
         repz retq
 #ifndef __APPLE__
-.size	sp_256_div2_avx2_4,.-sp_256_div2_avx2_4
+.size	sp_256_mont_div2_avx2_4,.-sp_256_mont_div2_avx2_4
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
 #ifndef WC_NO_CACHE_RESISTANT
@@ -61092,15 +61090,15 @@ _sp_384_mont_sub_6:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_384_div2_6
-.type	sp_384_div2_6,@function
+.globl	sp_384_mont_div2_6
+.type	sp_384_mont_div2_6,@function
 .align	16
-sp_384_div2_6:
+sp_384_mont_div2_6:
 #else
 .section	__TEXT,__text
-.globl	_sp_384_div2_6
+.globl	_sp_384_mont_div2_6
 .p2align	4
-_sp_384_div2_6:
+_sp_384_mont_div2_6:
 #endif /* __APPLE__ */
         subq	$48, %rsp
         movq	(%rsi), %r11
@@ -61159,7 +61157,7 @@ _sp_384_div2_6:
         addq	$48, %rsp
         repz retq
 #ifndef __APPLE__
-.size	sp_384_div2_6,.-sp_384_div2_6
+.size	sp_384_mont_div2_6,.-sp_384_mont_div2_6
 #endif /* __APPLE__ */
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
@@ -61727,15 +61725,15 @@ _sp_384_cond_sub_avx2_6:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_384_div2_avx2_6
-.type	sp_384_div2_avx2_6,@function
+.globl	sp_384_mont_div2_avx2_6
+.type	sp_384_mont_div2_avx2_6,@function
 .align	16
-sp_384_div2_avx2_6:
+sp_384_mont_div2_avx2_6:
 #else
 .section	__TEXT,__text
-.globl	_sp_384_div2_avx2_6
+.globl	_sp_384_mont_div2_avx2_6
 .p2align	4
-_sp_384_div2_avx2_6:
+_sp_384_mont_div2_avx2_6:
 #endif /* __APPLE__ */
         movq	(%rsi), %r11
         xorq	%r10, %r10
@@ -61793,7 +61791,7 @@ _sp_384_div2_avx2_6:
         movq	%r9, 40(%rdi)
         repz retq
 #ifndef __APPLE__
-.size	sp_384_div2_avx2_6,.-sp_384_div2_avx2_6
+.size	sp_384_mont_div2_avx2_6,.-sp_384_mont_div2_avx2_6
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
 #ifndef WC_NO_CACHE_RESISTANT
@@ -66645,15 +66643,15 @@ _sp_521_mont_sub_9:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_521_div2_9
-.type	sp_521_div2_9,@function
+.globl	sp_521_mont_div2_9
+.type	sp_521_mont_div2_9,@function
 .align	16
-sp_521_div2_9:
+sp_521_mont_div2_9:
 #else
 .section	__TEXT,__text
-.globl	_sp_521_div2_9
+.globl	_sp_521_mont_div2_9
 .p2align	4
-_sp_521_div2_9:
+_sp_521_mont_div2_9:
 #endif /* __APPLE__ */
         pushq	%r12
         pushq	%r13
@@ -66703,7 +66701,7 @@ _sp_521_div2_9:
         popq	%r12
         repz retq
 #ifndef __APPLE__
-.size	sp_521_div2_9,.-sp_521_div2_9
+.size	sp_521_mont_div2_9,.-sp_521_mont_div2_9
 #endif /* __APPLE__ */
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
@@ -68411,15 +68409,15 @@ L_521_mont_reduce_order_avx2_9_loop:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_521_div2_avx2_9
-.type	sp_521_div2_avx2_9,@function
+.globl	sp_521_mont_div2_avx2_9
+.type	sp_521_mont_div2_avx2_9,@function
 .align	16
-sp_521_div2_avx2_9:
+sp_521_mont_div2_avx2_9:
 #else
 .section	__TEXT,__text
-.globl	_sp_521_div2_avx2_9
+.globl	_sp_521_mont_div2_avx2_9
 .p2align	4
-_sp_521_div2_avx2_9:
+_sp_521_mont_div2_avx2_9:
 #endif /* __APPLE__ */
         pushq	%r12
         pushq	%r13
@@ -68469,7 +68467,7 @@ _sp_521_div2_avx2_9:
         popq	%r12
         repz retq
 #ifndef __APPLE__
-.size	sp_521_div2_avx2_9,.-sp_521_div2_avx2_9
+.size	sp_521_mont_div2_avx2_9,.-sp_521_mont_div2_avx2_9
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
 #ifndef WC_NO_CACHE_RESISTANT
@@ -77322,15 +77320,15 @@ _sp_1024_mont_sub_16:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_1024_div2_16
-.type	sp_1024_div2_16,@function
+.globl	sp_1024_mont_div2_16
+.type	sp_1024_mont_div2_16,@function
 .align	16
-sp_1024_div2_16:
+sp_1024_mont_div2_16:
 #else
 .section	__TEXT,__text
-.globl	_sp_1024_div2_16
+.globl	_sp_1024_mont_div2_16
 .p2align	4
-_sp_1024_div2_16:
+_sp_1024_mont_div2_16:
 #endif /* __APPLE__ */
         subq	$0x80, %rsp
         movq	(%rsi), %r11
@@ -77469,78 +77467,7 @@ _sp_1024_div2_16:
         addq	$0x80, %rsp
         repz retq
 #ifndef __APPLE__
-.size	sp_1024_div2_16,.-sp_1024_div2_16
-#endif /* __APPLE__ */
-/* Sub b from a into r. (r = a - b)
- *
- * r  A single precision integer.
- * a  A single precision integer.
- * b  A single precision integer.
- */
-#ifndef __APPLE__
-.text
-.globl	sp_1024_sub_16
-.type	sp_1024_sub_16,@function
-.align	16
-sp_1024_sub_16:
-#else
-.section	__TEXT,__text
-.globl	_sp_1024_sub_16
-.p2align	4
-_sp_1024_sub_16:
-#endif /* __APPLE__ */
-        movq	(%rsi), %rcx
-        subq	(%rdx), %rcx
-        movq	8(%rsi), %r8
-        movq	%rcx, (%rdi)
-        sbbq	8(%rdx), %r8
-        movq	16(%rsi), %rcx
-        movq	%r8, 8(%rdi)
-        sbbq	16(%rdx), %rcx
-        movq	24(%rsi), %r8
-        movq	%rcx, 16(%rdi)
-        sbbq	24(%rdx), %r8
-        movq	32(%rsi), %rcx
-        movq	%r8, 24(%rdi)
-        sbbq	32(%rdx), %rcx
-        movq	40(%rsi), %r8
-        movq	%rcx, 32(%rdi)
-        sbbq	40(%rdx), %r8
-        movq	48(%rsi), %rcx
-        movq	%r8, 40(%rdi)
-        sbbq	48(%rdx), %rcx
-        movq	56(%rsi), %r8
-        movq	%rcx, 48(%rdi)
-        sbbq	56(%rdx), %r8
-        movq	64(%rsi), %rcx
-        movq	%r8, 56(%rdi)
-        sbbq	64(%rdx), %rcx
-        movq	72(%rsi), %r8
-        movq	%rcx, 64(%rdi)
-        sbbq	72(%rdx), %r8
-        movq	80(%rsi), %rcx
-        movq	%r8, 72(%rdi)
-        sbbq	80(%rdx), %rcx
-        movq	88(%rsi), %r8
-        movq	%rcx, 80(%rdi)
-        sbbq	88(%rdx), %r8
-        movq	96(%rsi), %rcx
-        movq	%r8, 88(%rdi)
-        sbbq	96(%rdx), %rcx
-        movq	104(%rsi), %r8
-        movq	%rcx, 96(%rdi)
-        sbbq	104(%rdx), %r8
-        movq	112(%rsi), %rcx
-        movq	%r8, 104(%rdi)
-        sbbq	112(%rdx), %rcx
-        movq	120(%rsi), %r8
-        movq	%rcx, 112(%rdi)
-        sbbq	120(%rdx), %r8
-        movq	%r8, 120(%rdi)
-        sbbq	%rax, %rax
-        repz retq
-#ifndef __APPLE__
-.size	sp_1024_sub_16,.-sp_1024_sub_16
+.size	sp_1024_mont_div2_16,.-sp_1024_mont_div2_16
 #endif /* __APPLE__ */
 #ifdef HAVE_INTEL_AVX2
 /* Reduce the number back to 1024 bits using Montgomery reduction.
@@ -78657,15 +78584,15 @@ _sp_1024_mont_sub_avx2_16:
  */
 #ifndef __APPLE__
 .text
-.globl	sp_1024_div2_avx2_16
-.type	sp_1024_div2_avx2_16,@function
+.globl	sp_1024_mont_div2_avx2_16
+.type	sp_1024_mont_div2_avx2_16,@function
 .align	16
-sp_1024_div2_avx2_16:
+sp_1024_mont_div2_avx2_16:
 #else
 .section	__TEXT,__text
-.globl	_sp_1024_div2_avx2_16
+.globl	_sp_1024_mont_div2_avx2_16
 .p2align	4
-_sp_1024_div2_avx2_16:
+_sp_1024_mont_div2_avx2_16:
 #endif /* __APPLE__ */
         movq	(%rsi), %r11
         xorq	%r10, %r10
@@ -78803,7 +78730,7 @@ _sp_1024_div2_avx2_16:
         movq	%r9, 120(%rdi)
         repz retq
 #ifndef __APPLE__
-.size	sp_1024_div2_avx2_16,.-sp_1024_div2_avx2_16
+.size	sp_1024_mont_div2_avx2_16,.-sp_1024_mont_div2_avx2_16
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
 /* Read big endian unsigned byte array into r.
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index 5bb8faa21..9e7523d7b 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -55583,19 +55583,18 @@ _text ENDS
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_256_div2_4 PROC
+sp_256_mont_div2_4 PROC
         push	r12
         push	r13
         mov	rax, QWORD PTR [rdx]
         mov	r8, QWORD PTR [rdx+8]
         mov	r9, QWORD PTR [rdx+16]
         mov	r10, QWORD PTR [rdx+24]
-        mov	r11, 4294967295
         mov	r12, 18446744069414584321
         mov	r13, rax
         and	r13, 1
         neg	r13
-        and	r11, r13
+        mov	r11d, r13d
         and	r12, r13
         add	rax, r13
         adc	r8, r11
@@ -55614,7 +55613,7 @@ sp_256_div2_4 PROC
         pop	r13
         pop	r12
         ret
-sp_256_div2_4 ENDP
+sp_256_mont_div2_4 ENDP
 _text ENDS
 ; /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m).
 ;  *
@@ -56241,7 +56240,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * mp  The digit representing the negative inverse of m mod 2^n.
 ;  */
 _text SEGMENT READONLY PARA
-sp_256_mont_reduce_avx2_order_4 PROC
+sp_256_mont_reduce_order_avx2_4 PROC
         push	r12
         push	r13
         push	r14
@@ -56389,7 +56388,7 @@ sp_256_mont_reduce_avx2_order_4 PROC
         pop	r13
         pop	r12
         ret
-sp_256_mont_reduce_avx2_order_4 ENDP
+sp_256_mont_reduce_order_avx2_4 ENDP
 _text ENDS
 ENDIF
 IFDEF HAVE_INTEL_AVX2
@@ -56400,19 +56399,18 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_256_div2_avx2_4 PROC
+sp_256_mont_div2_avx2_4 PROC
         push	r12
         push	r13
         mov	rax, QWORD PTR [rdx]
         mov	r8, QWORD PTR [rdx+8]
         mov	r9, QWORD PTR [rdx+16]
         mov	r10, QWORD PTR [rdx+24]
-        mov	r11, 4294967295
         mov	r12, 18446744069414584321
         mov	r13, rax
         and	r13, 1
         neg	r13
-        and	r11, r13
+        mov	r11d, r13d
         and	r12, r13
         add	rax, r13
         adc	r8, r11
@@ -56431,7 +56429,7 @@ sp_256_div2_avx2_4 PROC
         pop	r13
         pop	r12
         ret
-sp_256_div2_avx2_4 ENDP
+sp_256_mont_div2_avx2_4 ENDP
 _text ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -59663,7 +59661,7 @@ _text ENDS
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_384_div2_6 PROC
+sp_384_mont_div2_6 PROC
         push	r12
         push	r13
         sub	rsp, 48
@@ -59724,7 +59722,7 @@ sp_384_div2_6 PROC
         pop	r13
         pop	r12
         ret
-sp_384_div2_6 ENDP
+sp_384_mont_div2_6 ENDP
 _text ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
@@ -60297,7 +60295,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_384_div2_avx2_6 PROC
+sp_384_mont_div2_avx2_6 PROC
         push	r12
         push	r13
         mov	r13, QWORD PTR [rdx]
@@ -60357,7 +60355,7 @@ sp_384_div2_avx2_6 PROC
         pop	r13
         pop	r12
         ret
-sp_384_div2_avx2_6 ENDP
+sp_384_mont_div2_avx2_6 ENDP
 _text ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -64989,7 +64987,7 @@ _text ENDS
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_521_div2_9 PROC
+sp_521_mont_div2_9 PROC
         push	r12
         push	r13
         push	r14
@@ -65041,7 +65039,7 @@ sp_521_div2_9 PROC
         pop	r13
         pop	r12
         ret
-sp_521_div2_9 ENDP
+sp_521_mont_div2_9 ENDP
 _text ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
@@ -66753,7 +66751,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_521_div2_avx2_9 PROC
+sp_521_mont_div2_avx2_9 PROC
         push	r12
         push	r13
         push	r14
@@ -66805,7 +66803,7 @@ sp_521_div2_avx2_9 PROC
         pop	r13
         pop	r12
         ret
-sp_521_div2_avx2_9 ENDP
+sp_521_mont_div2_avx2_9 ENDP
 _text ENDS
 ENDIF
 IFNDEF WC_NO_CACHE_RESISTANT
@@ -75404,7 +75402,7 @@ _text ENDS
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_1024_div2_16 PROC
+sp_1024_mont_div2_16 PROC
         push	r12
         push	r13
         sub	rsp, 128
@@ -75545,67 +75543,7 @@ sp_1024_div2_16 PROC
         pop	r13
         pop	r12
         ret
-sp_1024_div2_16 ENDP
-_text ENDS
-; /* Sub b from a into r. (r = a - b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_sub_16 PROC
-        mov	r9, QWORD PTR [rdx]
-        sub	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r9
-        sbb	r10, QWORD PTR [r8+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        sbb	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        sbb	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        sbb	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        sbb	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        sbb	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        sbb	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        sbb	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        sbb	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        sbb	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        sbb	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        sbb	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        sbb	r10, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+120], r10
-        sbb	rax, rax
-        ret
-sp_1024_sub_16 ENDP
+sp_1024_mont_div2_16 ENDP
 _text ENDS
 IFDEF HAVE_INTEL_AVX2
 ; /* Reduce the number back to 1024 bits using Montgomery reduction.
@@ -76683,7 +76621,7 @@ IFDEF HAVE_INTEL_AVX2
 ;  * m  Modulus (prime).
 ;  */
 _text SEGMENT READONLY PARA
-sp_1024_div2_avx2_16 PROC
+sp_1024_mont_div2_avx2_16 PROC
         push	r12
         push	r13
         mov	r13, QWORD PTR [rdx]
@@ -76823,7 +76761,7 @@ sp_1024_div2_avx2_16 PROC
         pop	r13
         pop	r12
         ret
-sp_1024_div2_avx2_16 ENDP
+sp_1024_mont_div2_avx2_16 ENDP
 _text ENDS
 ENDIF
 ; /* Read big endian unsigned byte array into r.
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index 143681c78..75625b724 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -24307,7 +24307,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t hpke_test(void)
 
 /* size to use for ECC key gen tests */
 #ifndef ECC_KEYGEN_SIZE
-    #ifndef NO_ECC256
+    #if !defined(NO_ECC256) || defined(WOLFSSL_SM2)
         #define ECC_KEYGEN_SIZE 32
     #elif defined(HAVE_ECC384)
         #define ECC_KEYGEN_SIZE 48
diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index ff0467df9..c58e9442b 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -107,7 +107,7 @@
     #define MAX_ECC_BITS_NEEDED    384
 #elif defined(HAVE_ECC320)
     #define MAX_ECC_BITS_NEEDED    320
-#elif !defined(NO_ECC256)
+#elif !defined(NO_ECC256) || defined(WOLFSSL_SM2)
     #define MAX_ECC_BITS_NEEDED    256
 #elif defined(HAVE_ECC239)
     #define MAX_ECC_BITS_NEEDED    239
diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h
index 7e44d312c..0bf796f1d 100644
--- a/wolfssl/wolfcrypt/settings.h
+++ b/wolfssl/wolfcrypt/settings.h
@@ -2123,6 +2123,9 @@ extern void uITRON4_free(void *p) ;
     #ifdef WOLFSSL_SP_MATH
         /* for single precision math only make sure the enabled key sizes are
          * included in the ECC curve table */
+        #if defined(WOLFSSL_SP_NO_256) && !defined(NO_ECC256)
+            #define NO_ECC256
+        #endif
         #if defined(WOLFSSL_SP_384) && !defined(HAVE_ECC384)
             #define HAVE_ECC384
         #endif
diff --git a/wolfssl/wolfcrypt/sm2.h b/wolfssl/wolfcrypt/sm2.h
index 9176c8651..87167f42e 100644
--- a/wolfssl/wolfcrypt/sm2.h
+++ b/wolfssl/wolfcrypt/sm2.h
@@ -1,3 +1,23 @@
+/* sm2.h
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
 
 #ifdef WOLFSSL_SM2
 
diff --git a/wolfssl/wolfcrypt/sm3.h b/wolfssl/wolfcrypt/sm3.h
index 43895a030..2b3fc5034 100644
--- a/wolfssl/wolfcrypt/sm3.h
+++ b/wolfssl/wolfcrypt/sm3.h
@@ -1,3 +1,23 @@
+/* sm3.h
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
 
 #ifdef WOLFSSL_SM3
 
diff --git a/wolfssl/wolfcrypt/sm4.h b/wolfssl/wolfcrypt/sm4.h
index 73220752e..f3e66cb89 100644
--- a/wolfssl/wolfcrypt/sm4.h
+++ b/wolfssl/wolfcrypt/sm4.h
@@ -1,3 +1,23 @@
+/* sm4.h
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
 
 #ifdef WOLFSSL_SM4
 
diff --git a/wolfssl/wolfcrypt/sp.h b/wolfssl/wolfcrypt/sp.h
index e9541e9d5..88e9a069b 100644
--- a/wolfssl/wolfcrypt/sp.h
+++ b/wolfssl/wolfcrypt/sp.h
@@ -331,6 +331,36 @@ WOLFSSL_LOCAL int sp_ecc_is_point_1024(const mp_int* pX, const mp_int* pY);
 WOLFSSL_LOCAL int sp_ecc_check_key_1024(const mp_int* pX, const mp_int* pY,
     const mp_int* privm, void* heap);
 
+WOLFSSL_LOCAL int sp_ecc_mulmod_sm2_256(const mp_int* km, const ecc_point* gm,
+    ecc_point* rm, int map, void* heap);
+WOLFSSL_LOCAL int sp_ecc_mulmod_add_sm2_256(const mp_int* km, const ecc_point* gm,
+    const ecc_point* am, int inMont, ecc_point* rm, int map, void* heap);
+WOLFSSL_LOCAL int sp_ecc_mulmod_base_sm2_256(const mp_int* km, ecc_point* rm,
+    int map, void* heap);
+WOLFSSL_LOCAL int sp_ecc_mulmod_base_add_sm2_256(const mp_int* km,
+    const ecc_point* am, int inMont, ecc_point* rm, int map, void* heap);
+
+WOLFSSL_LOCAL int sp_ecc_make_key_sm2_256(WC_RNG* rng, mp_int* priv,
+    ecc_point* pub, void* heap);
+WOLFSSL_LOCAL int sp_ecc_secret_gen_sm2_256(const mp_int* priv,
+    const ecc_point* pub, byte* out, word32* outlen, void* heap);
+WOLFSSL_LOCAL int sp_ecc_sign_sm2_256(const byte* hash, word32 hashLen,
+    WC_RNG* rng, const mp_int* priv, mp_int* rm, mp_int* sm, mp_int* km,
+    void* heap);
+WOLFSSL_LOCAL int sp_ecc_verify_sm2_256(const byte* hash, word32 hashLen,
+    const mp_int* pX, const mp_int* pY, const mp_int* pZ, const mp_int* r,
+    const mp_int* sm, int* res, void* heap);
+WOLFSSL_LOCAL int sp_ecc_is_point_sm2_256(const mp_int* pX, const mp_int* pY);
+WOLFSSL_LOCAL int sp_ecc_check_key_sm2_256(const mp_int* pX, const mp_int* pY,
+    const mp_int* privm, void* heap);
+WOLFSSL_LOCAL int sp_ecc_proj_add_point_sm2_256(mp_int* pX, mp_int* pY,
+    mp_int* pZ, mp_int* qX, mp_int* qY, mp_int* qZ, mp_int* rX, mp_int* rY,
+    mp_int* rZ);
+WOLFSSL_LOCAL int sp_ecc_proj_dbl_point_sm2_256(mp_int* pX, mp_int* pY,
+    mp_int* pZ, mp_int* rX, mp_int* rY, mp_int* rZ);
+WOLFSSL_LOCAL int sp_ecc_map_sm2_256(mp_int* pX, mp_int* pY, mp_int* pZ);
+WOLFSSL_LOCAL int sp_ecc_uncompress_sm2_256(mp_int* xm, int odd, mp_int* ym);
+
 #endif /* HAVE_FIPS_VERSION && HAVE_FIPS_VERSION == 2  && !WOLFSSL_SP_ARM[32|64]_ASM */
 
 #ifdef WOLFSSL_SP_NONBLOCK