diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
index 3c34f3ef6..cb921f594 100644
--- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
@@ -406,9 +406,9 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p)
         "str	r8, [%[ks]], #4\n\t"
         "subs	r11, r11, #1\n\t"
         "bne	L_AES_invert_key_mix_loop_%=\n\t"
-        : [ks] "+r" (ks),  [rounds] "+r" (rounds),
-            [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
-            [L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c)
+        : [ks] "+r" (ks), [rounds] "+r" (rounds),
+          [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
+          [L_AES_ARM32_td] "+r" (L_AES_ARM32_td_c)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
@@ -933,9 +933,9 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p,
         "bne	L_AES_set_encrypt_key_loop_128_%=\n\t"
         "\n"
     "L_AES_set_encrypt_key_end_%=: \n\t"
-        : [key] "+r" (key),  [len] "+r" (len),  [ks] "+r" (ks),
-            [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
-            [L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c)
+        : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks),
+          [L_AES_ARM32_te] "+r" (L_AES_ARM32_te_c),
+          [L_AES_ARM32_rcon] "+r" (L_AES_ARM32_rcon_c)
         :
         : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8"
     );
@@ -1588,7 +1588,7 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p,
         "eor	r5, r5, r9\n\t"
         "eor	r6, r6, r10\n\t"
         "eor	r7, r7, r11\n\t"
-        : [te] "+r" (te),  [nr] "+r" (nr),  [len] "+r" (len),  [ks] "+r" (ks)
+        : [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks)
         :
         : "memory", "cc", "lr"
     );
@@ -1841,8 +1841,8 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p,
         "\n"
     "L_AES_ECB_encrypt_end_%=: \n\t"
         "pop	{%[ks]}\n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c)
         :
         : "memory", "cc", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11"
     );
@@ -2114,9 +2114,9 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p,
     "L_AES_CBC_encrypt_end_%=: \n\t"
         "pop	{%[ks], r9}\n\t"
         "stm	r9, {r4, r5, r6, r7}\n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr),  [iv] "+r" (iv),
-            [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [iv] "+r" (iv),
+          [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c)
         :
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
@@ -2389,9 +2389,9 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p,
         "rev	r7, r7\n\t"
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
         "stm	r8, {r4, r5, r6, r7}\n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr),  [ctr] "+r" (ctr),
-            [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [ctr] "+r" (ctr),
+          [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c)
         :
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
@@ -3045,7 +3045,7 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p)
         "eor	r5, r5, r9\n\t"
         "eor	r6, r6, r10\n\t"
         "eor	r7, r7, r11\n\t"
-        : [td] "+r" (td),  [nr] "+r" (nr),  [td4] "+r" (td4)
+        : [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4)
         :
         : "memory", "cc", "lr"
     );
@@ -3331,9 +3331,9 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p,
         "bne	L_AES_ECB_decrypt_loop_block_128_%=\n\t"
         "\n"
     "L_AES_ECB_decrypt_end_%=: \n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
-            [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
+          [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
         :
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
@@ -3971,10 +3971,10 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p,
         "\n"
     "L_AES_CBC_decrypt_end_%=: \n\t"
         "pop	{%[ks]-r4}\n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr),  [iv] "+r" (iv),
-            [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
-            [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [iv] "+r" (iv),
+          [L_AES_ARM32_td_ecb] "+r" (L_AES_ARM32_td_ecb_c),
+          [L_AES_ARM32_td4] "+r" (L_AES_ARM32_td4_c)
         :
         : "memory", "cc", "r12", "lr", "r8", "r9", "r10", "r11"
     );
@@ -4576,8 +4576,8 @@ void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p,
         "subs	%[len], %[len], #16\n\t"
         "add	%[data], %[data], #16\n\t"
         "bne	L_GCM_gmult_len_start_block_%=\n\t"
-        : [x] "+r" (x),  [m] "+r" (m),  [data] "+r" (data),  [len] "+r" (len),
-            [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c)
+        : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len),
+          [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c)
         :
         : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11"
@@ -4840,9 +4840,9 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p,
         "rev	r7, r7\n\t"
 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
         "stm	r8, {r4, r5, r6, r7}\n\t"
-        : [in] "+r" (in),  [out] "+r" (out),  [len] "+r" (len),  [ks] "+r" (ks),
-             [nr] "+r" (nr),  [ctr] "+r" (ctr),
-            [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c)
+        : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks),
+          [nr] "+r" (nr), [ctr] "+r" (ctr),
+          [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c)
         :
         : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11"
     );
diff --git a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
index aebcff155..5d5d17f4c 100644
--- a/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-chacha-asm_c.c
@@ -71,7 +71,7 @@ void wc_chacha_setiv(word32* x_p, const byte* iv_p, word32 counter_p)
         "rev	lr, lr\n\t"
 #endif /* BIG_ENDIAN_ORDER */
         "stm	r3, {r4, r12, lr}\n\t"
-        : [x] "+r" (x),  [iv] "+r" (iv),  [counter] "+r" (counter)
+        : [x] "+r" (x), [iv] "+r" (iv), [counter] "+r" (counter)
         :
         : "memory", "cc", "r3", "r12", "lr", "r4"
     );
@@ -119,8 +119,8 @@ void wc_chacha_setkey(word32* x_p, const byte* key_p, word32 keySz_p)
         "\n"
     "L_chacha_arm32_setkey_same_keyb_ytes_%=: \n\t"
         "stm	%[x], {r4, r5, r12, lr}\n\t"
-        : [x] "+r" (x),  [key] "+r" (key),  [keySz] "+r" (keySz),
-            [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c)
+        : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
+          [L_chacha_arm32_constants] "+r" (L_chacha_arm32_constants_c)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5"
     );
@@ -484,7 +484,7 @@ void wc_chacha_crypt_bytes(ChaCha* ctx_p, byte* c_p, const byte* m_p,
         "\n"
     "L_chacha_arm32_crypt_done_%=: \n\t"
         "add	sp, sp, #52\n\t"
-        : [ctx] "+r" (ctx),  [c] "+r" (c),  [m] "+r" (m),  [len] "+r" (len)
+        : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
@@ -557,8 +557,8 @@ void wc_chacha_use_over(byte* over_p, byte* output_p, const byte* input_p,
         "b	L_chacha_arm32_over_byte_loop_%=\n\t"
         "\n"
     "L_chacha_arm32_over_done_%=: \n\t"
-        : [over] "+r" (over),  [output] "+r" (output),  [input] "+r" (input),
-             [len] "+r" (len)
+        : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
+          [len] "+r" (len)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9"
     );
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
index 6871aaade..e3b52140c 100644
--- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c
@@ -335,7 +335,7 @@ void fe_sub(fe r_p, const fe a_p, const fe b_p)
 
     __asm__ __volatile__ (
         "bl	fe_sub_op\n\t"
-        : [r] "+r" (r),  [a] "+r" (a),  [b] "+r" (b)
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -393,7 +393,7 @@ void fe_add(fe r_p, const fe a_p, const fe b_p)
 
     __asm__ __volatile__ (
         "bl	fe_add_op\n\t"
-        : [r] "+r" (r),  [a] "+r" (a),  [b] "+r" (b)
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -428,7 +428,7 @@ void fe_frombytes(fe out_p, const unsigned char* in_p)
         "str	r7, [%[out], #20]\n\t"
         "str	r8, [%[out], #24]\n\t"
         "str	r9, [%[out], #28]\n\t"
-        : [out] "+r" (out),  [in] "+r" (in)
+        : [out] "+r" (out), [in] "+r" (in)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
@@ -472,7 +472,7 @@ void fe_tobytes(unsigned char* out_p, const fe n_p)
         "str	r7, [%[out], #20]\n\t"
         "str	r8, [%[out], #24]\n\t"
         "str	r9, [%[out], #28]\n\t"
-        : [out] "+r" (out),  [n] "+r" (n)
+        : [out] "+r" (out), [n] "+r" (n)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12"
     );
@@ -575,7 +575,7 @@ void fe_copy(fe r_p, const fe a_p)
 #else
         "strd	r4, r5, [%[r], #24]\n\t"
 #endif
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5"
     );
@@ -602,7 +602,7 @@ void fe_neg(fe r_p, const fe a_p)
         "sbcs	r4, lr, r4\n\t"
         "sbc	r5, r12, r5\n\t"
         "stm	%[r]!, {r2, r3, r4, r5}\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r12", "lr"
     );
@@ -2407,7 +2407,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
 #else
         "strd	r8, r9, [%[r], #88]\n\t"
 #endif
-        : [r] "+r" (r),  [base] "+r" (base),  [b] "+r" (b)
+        : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10",
             "r11", "r12", "lr"
@@ -2528,7 +2528,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p)
         "and	r7, r7, lr\n\t"
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         "sub	%[base], %[base], %[b]\n\t"
-        : [r] "+r" (r),  [base] "+r" (base),  [b] "+r" (b)
+        : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -3074,7 +3074,7 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p)
 
     __asm__ __volatile__ (
         "bl	fe_mul_op\n\t"
-        : [r] "+r" (r),  [a] "+r" (a),  [b] "+r" (b)
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -3495,7 +3495,7 @@ void fe_sq(fe r_p, const fe a_p)
 
     __asm__ __volatile__ (
         "bl	fe_sq_op\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
             "lr", "r10", "r11"
@@ -3572,7 +3572,7 @@ void fe_mul121666(fe r_p, fe a_p)
         "adcs	r8, r8, #0\n\t"
         "adc	r9, r9, #0\n\t"
         "stm	%[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
             "lr", "r10"
@@ -3635,7 +3635,7 @@ void fe_mul121666(fe r_p, fe a_p)
         "adcs	r8, r8, #0\n\t"
         "adc	r9, r9, #0\n\t"
         "stm	%[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
             "lr", "r10"
@@ -4026,7 +4026,7 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "bl	fe_mul_op\n\t"
         "mov	r0, #0\n\t"
         "add	sp, sp, #0xbc\n\t"
-        : [r] "+r" (r),  [n] "+r" (n),  [a] "+r" (a)
+        : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12", "lr"
@@ -4340,7 +4340,7 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
         "stm	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         "mov	r0, #0\n\t"
         "add	sp, sp, #0xc0\n\t"
-        : [r] "+r" (r),  [n] "+r" (n),  [a] "+r" (a)
+        : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r3", "r12", "lr"
@@ -4515,7 +4515,7 @@ void fe_invert(fe r_p, const fe a_p)
         "ldr	%[a], [sp, #132]\n\t"
         "ldr	%[r], [sp, #128]\n\t"
         "add	sp, sp, #0x88\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -4836,7 +4836,7 @@ void fe_sq2(fe r_p, const fe a_p)
         "ldr	r0, [sp, #64]\n\t"
         "stm	r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         "add	sp, sp, #0x44\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "lr"
     );
@@ -5015,7 +5015,7 @@ void fe_sq2(fe r_p, const fe a_p)
         "stm	r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "mov	r0, r12\n\t"
         "mov	r1, lr\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "lr"
     );
@@ -5186,7 +5186,7 @@ void fe_pow22523(fe r_p, const fe a_p)
         "ldr	%[a], [sp, #100]\n\t"
         "ldr	%[r], [sp, #96]\n\t"
         "add	sp, sp, #0x68\n\t"
-        : [r] "+r" (r),  [a] "+r" (a)
+        : [r] "+r" (r), [a] "+r" (a)
         :
         : "memory", "cc", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -5217,7 +5217,7 @@ void ge_p1p1_to_p2(ge_p2 * r_p, const ge_p1p1 * p_p)
         "add	r0, r0, #0x40\n\t"
         "bl	fe_mul_op\n\t"
         "add	sp, sp, #8\n\t"
-        : [r] "+r" (r),  [p] "+r" (p)
+        : [r] "+r" (r), [p] "+r" (p)
         :
         : "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -5253,7 +5253,7 @@ void ge_p1p1_to_p3(ge_p3 * r_p, const ge_p1p1 * p_p)
         "add	r0, r0, #0x60\n\t"
         "bl	fe_mul_op\n\t"
         "add	sp, sp, #8\n\t"
-        : [r] "+r" (r),  [p] "+r" (p)
+        : [r] "+r" (r), [p] "+r" (p)
         :
         : "memory", "cc", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -5301,7 +5301,7 @@ void ge_p2_dbl(ge_p1p1 * r_p, const ge_p2 * p_p)
         "mov	r1, r0\n\t"
         "bl	fe_sub_op\n\t"
         "add	sp, sp, #8\n\t"
-        : [r] "+r" (r),  [p] "+r" (p)
+        : [r] "+r" (r), [p] "+r" (p)
         :
         : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -5388,7 +5388,7 @@ void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p)
         "add	r1, r0, #32\n\t"
         "bl	fe_add_sub_op\n\t"
         "add	sp, sp, #12\n\t"
-        : [r] "+r" (r),  [p] "+r" (p),  [q] "+r" (q)
+        : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -5476,7 +5476,7 @@ void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p)
         "add	r0, r0, #32\n\t"
         "bl	fe_add_sub_op\n\t"
         "add	sp, sp, #12\n\t"
-        : [r] "+r" (r),  [p] "+r" (p),  [q] "+r" (q)
+        : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -5564,7 +5564,7 @@ void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p)
         "add	r0, r0, #32\n\t"
         "bl	fe_add_sub_op\n\t"
         "add	sp, sp, #44\n\t"
-        : [r] "+r" (r),  [p] "+r" (p),  [q] "+r" (q)
+        : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -5652,7 +5652,7 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p)
         "add	r0, r0, #0x40\n\t"
         "bl	fe_add_sub_op\n\t"
         "add	sp, sp, #44\n\t"
-        : [r] "+r" (r),  [p] "+r" (p),  [q] "+r" (q)
+        : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q)
         :
         : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
             "r11", "r12", "lr"
@@ -8528,7 +8528,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "str	r8, [%[s], #24]\n\t"
         "str	r9, [%[s], #28]\n\t"
         "add	sp, sp, #0x50\n\t"
-        : [s] "+r" (s),  [a] "+r" (a),  [b] "+r" (b),  [c] "+r" (c)
+        : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
@@ -9413,7 +9413,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
         "str	r8, [%[s], #24]\n\t"
         "str	r9, [%[s], #28]\n\t"
         "add	sp, sp, #0x50\n\t"
-        : [s] "+r" (s),  [a] "+r" (a),  [b] "+r" (b),  [c] "+r" (c)
+        : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12", "lr"
diff --git a/wolfcrypt/src/port/arm/armv8-32-kyber-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-kyber-asm_c.c
index 4650b9bc8..4e5081d47 100644
--- a/wolfcrypt/src/port/arm/armv8-32-kyber-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-kyber-asm_c.c
@@ -3312,7 +3312,7 @@ void kyber_arm32_ntt(sword16* r_p)
         "bne	L_kyber_arm32_ntt_loop_567_%=\n\t"
         "add	sp, sp, #8\n\t"
         : [r] "+r" (r),
-            [L_kyber_arm32_ntt_zetas] "+r" (L_kyber_arm32_ntt_zetas_c)
+          [L_kyber_arm32_ntt_zetas] "+r" (L_kyber_arm32_ntt_zetas_c)
         :
         : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -8076,7 +8076,7 @@ void kyber_arm32_invntt(sword16* r_p)
         "bne	L_kyber_arm32_invntt_loop_321_%=\n\t"
         "add	sp, sp, #8\n\t"
         : [r] "+r" (r),
-            [L_kyber_arm32_invntt_zetas_inv] "+r" (L_kyber_arm32_invntt_zetas_inv_c)
+          [L_kyber_arm32_invntt_zetas_inv] "+r" (L_kyber_arm32_invntt_zetas_inv_c)
         :
         : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -8405,8 +8405,8 @@ void kyber_arm32_basemul_mont(sword16* r_p, const sword16* a_p,
         "stm	%[r]!, {r4, r5}\n\t"
         "pop	{r8}\n\t"
         "bne	L_kyber_arm32_basemul_mont_loop_%=\n\t"
-        : [r] "+r" (r),  [a] "+r" (a),  [b] "+r" (b),
-            [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b),
+          [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
@@ -8738,8 +8738,8 @@ void kyber_arm32_basemul_mont_add(sword16* r_p, const sword16* a_p,
         "stm	%[r]!, {r4, r5}\n\t"
         "pop	{r8}\n\t"
         "bne	L_kyber_arm32_basemul_mont_add_loop_%=\n\t"
-        : [r] "+r" (r),  [a] "+r" (a),  [b] "+r" (b),
-            [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b),
+          [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
@@ -8948,7 +8948,7 @@ void kyber_arm32_csubq(sword16* p_p)
         "subs	r1, r1, #8\n\t"
         "bne	L_kyber_arm32_csubq_loop_%=\n\t"
         : [p] "+r" (p),
-            [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
+          [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
         :
         : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
             "r9", "r10", "r11"
@@ -9220,8 +9220,8 @@ unsigned int kyber_arm32_rej_uniform(sword16* p_p, unsigned int len_p,
         "\n"
     "L_kyber_arm32_rej_uniform_done_%=: \n\t"
         "lsr	r0, r12, #1\n\t"
-        : [p] "+r" (p),  [len] "+r" (len),  [r] "+r" (r),  [rLen] "+r" (rLen),
-            [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
+        : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen),
+          [L_kyber_arm32_basemul_mont_zetas] "+r" (L_kyber_arm32_basemul_mont_zetas_c)
         :
         : "memory", "cc", "r12", "lr", "r5", "r6", "r7", "r8"
     );
diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
index aaf596d4c..97bd63aaa 100644
--- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c
@@ -270,8 +270,8 @@ void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p,
         "\n"
     "L_poly1305_arm32_16_done_%=: \n\t"
         "add	sp, sp, #28\n\t"
-        : [ctx] "+r" (ctx),  [m] "+r" (m),  [len] "+r" (len),
-             [notLast] "+r" (notLast)
+        : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len),
+          [notLast] "+r" (notLast)
         :
         : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
             "r10", "r11"
@@ -321,8 +321,8 @@ void poly1305_set_key(Poly1305* ctx_p, const byte* key_p)
         "stm	lr, {r5, r6, r7, r8, r12}\n\t"
         /* Zero leftover */
         "str	r5, [%[ctx], #52]\n\t"
-        : [ctx] "+r" (ctx),  [key] "+r" (key),
-            [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
+        : [ctx] "+r" (ctx), [key] "+r" (key),
+          [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c)
         :
         : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
@@ -377,7 +377,7 @@ void poly1305_final(Poly1305* ctx_p, byte* mac_p)
         /* Zero out padding. */
         "add	r9, %[ctx], #36\n\t"
         "stm	r9, {r4, r5, r6, r7}\n\t"
-        : [ctx] "+r" (ctx),  [mac] "+r" (mac)
+        : [ctx] "+r" (ctx), [mac] "+r" (mac)
         :
         : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8",
             "r9"
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
index 05086af07..f21b5317b 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
@@ -1732,8 +1732,8 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
         "add	%[data], %[data], #0x40\n\t"
         "bne	L_SHA256_transform_len_begin_%=\n\t"
         "add	sp, sp, #0xc0\n\t"
-        : [sha256] "+r" (sha256),  [data] "+r" (data),  [len] "+r" (len),
-            [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c)
+        : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len),
+          [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
@@ -2797,8 +2797,8 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p)
         "str	r10, [sp, #8]\n\t"
         "bne	L_SHA256_transform_neon_len_begin_%=\n\t"
         "add	sp, sp, #24\n\t"
-        : [sha256] "+r" (sha256),  [data] "+r" (data),  [len] "+r" (len),
-            [L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c)
+        : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len),
+          [L_SHA256_transform_neon_len_k] "+r" (L_SHA256_transform_neon_len_k_c)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr",
             "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
index e9e227ec3..170fcfd8b 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c
@@ -334,7 +334,7 @@ void BlockSha3(word64* state_p)
         "vst1.8	{d24}, [%[state]]\n\t"
         "add	sp, sp, #16\n\t"
         : [state] "+r" (state),
-            [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
+          [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c)
         :
         : "memory", "cc", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
             "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
index 32506f64e..f64ef3ce5 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
@@ -7601,8 +7601,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
         "bne	L_SHA512_transform_len_begin_%=\n\t"
         "eor	r0, r0, r0\n\t"
         "add	sp, sp, #0xc0\n\t"
-        : [sha512] "+r" (sha512),  [data] "+r" (data),  [len] "+r" (len),
-            [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c)
+        : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len),
+          [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c)
         :
         : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
             "r12"
@@ -9154,8 +9154,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
         "subs	%[len], %[len], #0x80\n\t"
         "sub	r3, r3, #0x280\n\t"
         "bne	L_SHA512_transform_neon_len_begin_%=\n\t"
-        : [sha512] "+r" (sha512),  [data] "+r" (data),  [len] "+r" (len),
-            [L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c)
+        : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len),
+          [L_SHA512_transform_neon_len_k] "+r" (L_SHA512_transform_neon_len_k_c)
         :
         : "memory", "cc", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
             "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9",
diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index c8ecf47ec..68449bebd 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -2231,7 +2231,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -2584,7 +2585,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "add	sp, sp, #36\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -2610,7 +2612,7 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "strd	%[r], %[a], [sp, #36]\n\t"
 #endif
         "mov	lr, %[b]\n\t"
-        "ldm	%[a], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3}\n\t"
         "ldm	lr!, {r4, r5, r6}\n\t"
         "umull	r10, r11, %[r], r4\n\t"
         "umull	r12, r7, %[a], r4\n\t"
@@ -2655,7 +2657,7 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "umaal	r4, r6, %[b], r7\n\t"
         "sub	lr, lr, #16\n\t"
         "umaal	r5, r6, r3, r7\n\t"
-        "ldm	%[r], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[r], {r0, r1, r2, r3}\n\t"
         "str	r6, [sp, #32]\n\t"
         "ldm	lr!, {r6}\n\t"
         "mov	r7, #0\n\t"
@@ -2715,7 +2717,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "add	sp, sp, #44\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
+            "r8", "r9", "lr"
     );
 }
 
@@ -2751,7 +2754,7 @@ static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -2798,7 +2801,7 @@ static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -2848,7 +2851,7 @@ static sp_digit sp_2048_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -2988,7 +2991,7 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -3066,7 +3069,7 @@ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -3266,7 +3269,7 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -3400,7 +3403,7 @@ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -4680,7 +4683,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2, r3, r4, r8}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -4923,7 +4927,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -4941,7 +4946,7 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
     __asm__ __volatile__ (
         "sub	sp, sp, #32\n\t"
         "str	%[r], [sp, #28]\n\t"
-        "ldm	%[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "umull	r9, r10, %[r], %[r]\n\t"
         "umull	r11, r12, %[r], %[a]\n\t"
         "adds	r11, r11, r11\n\t"
@@ -5028,18 +5033,19 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         /* R[15] = r7 */
         "ldr	lr, [sp, #28]\n\t"
         "add	lr, lr, #28\n\t"
-        "stm	lr!, {%[r], r12}\n\t"
+        "stm	lr!, {r0, r12}\n\t"
         "stm	lr!, {r11}\n\t"
         "stm	lr!, {r10}\n\t"
         "stm	lr!, {r3, r4, r8, r9}\n\t"
         "stm	lr!, {r7}\n\t"
         "sub	lr, lr, #0x40\n\t"
-        "ldm	sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t"
-        "stm	lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t"
+        "ldm	sp, {r0, r1, r2, r3, r4, r5, r6}\n\t"
+        "stm	lr, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "add	sp, sp, #32\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -5074,7 +5080,7 @@ static sp_digit sp_2048_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -5159,7 +5165,7 @@ static sp_digit sp_2048_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -5272,7 +5278,7 @@ static sp_digit sp_2048_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -5347,7 +5353,8 @@ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -5383,7 +5390,8 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -5585,7 +5593,8 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "bgt	L_sp_2048_mul_64_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -5742,7 +5751,8 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_2048_sqr_64_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -5798,7 +5808,8 @@ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -5834,7 +5845,8 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -6036,7 +6048,8 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "bgt	L_sp_2048_mul_32_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -6193,7 +6206,8 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_2048_sqr_32_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -6314,7 +6328,7 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #256]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -8379,7 +8393,7 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r4, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -8408,7 +8422,8 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -8434,7 +8449,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -8448,7 +8463,8 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -8572,7 +8588,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -9553,7 +9569,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
 }
@@ -9848,7 +9865,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
 }
@@ -10053,7 +10071,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
 }
@@ -10183,7 +10202,7 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #128]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -11224,7 +11243,7 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r5, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -11283,7 +11302,7 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -11421,7 +11440,7 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -11820,7 +11839,7 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -12229,7 +12248,8 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -12255,7 +12275,7 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -12269,7 +12289,8 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -12505,7 +12526,7 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -14414,7 +14435,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
 }
@@ -14965,7 +14987,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
 }
@@ -15330,7 +15353,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
 }
@@ -15398,7 +15422,8 @@ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -15532,7 +15557,7 @@ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -15592,7 +15617,7 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -15730,7 +15755,7 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -16585,7 +16610,7 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -17116,7 +17141,8 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -17142,7 +17168,7 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -17156,7 +17182,8 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -17280,7 +17307,7 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "adc	%[r], r8, r8\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)r;
 }
@@ -17994,7 +18021,7 @@ static void sp_2048_lshift_64(sp_digit* r_p, const sp_digit* a_p, byte n_p)
         "str	r6, [%[r], #4]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -23899,7 +23926,8 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -23941,7 +23969,7 @@ static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -24002,7 +24030,7 @@ static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -24066,7 +24094,7 @@ static sp_digit sp_3072_add_24(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -24238,7 +24266,7 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -24344,7 +24372,7 @@ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -24600,7 +24628,7 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -24790,7 +24818,7 @@ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -27922,7 +27950,8 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2, r3, r4, r8}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -27963,7 +27992,7 @@ static sp_digit sp_3072_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -28062,7 +28091,7 @@ static sp_digit sp_3072_sub_24(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -28203,7 +28232,7 @@ static sp_digit sp_3072_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -28278,7 +28307,8 @@ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -28314,7 +28344,8 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -28516,7 +28547,8 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "bgt	L_sp_3072_mul_96_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -28673,7 +28705,8 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_3072_sqr_96_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -28729,7 +28762,8 @@ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -28765,7 +28799,8 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -28967,7 +29002,8 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "bgt	L_sp_3072_mul_48_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -29124,7 +29160,8 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_3072_sqr_48_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -29245,7 +29282,7 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #384]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -32334,7 +32371,7 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -32363,7 +32400,8 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -32389,7 +32427,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -32403,7 +32441,8 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -32583,7 +32622,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -34028,7 +34067,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
 }
@@ -34451,7 +34491,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
 }
@@ -34736,7 +34777,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
 }
@@ -34866,7 +34908,7 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #192]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -36419,7 +36461,7 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -36478,7 +36520,7 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -36616,7 +36658,7 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -37191,7 +37233,7 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -37600,7 +37642,8 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -37626,7 +37669,7 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -37640,7 +37683,8 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -37988,7 +38032,7 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -40825,7 +40869,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
 }
@@ -41632,7 +41677,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
 }
@@ -42157,7 +42203,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
 }
@@ -42225,7 +42272,8 @@ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -42415,7 +42463,7 @@ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -42475,7 +42523,7 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -42613,7 +42661,7 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -43826,7 +43874,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -44357,7 +44405,8 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -44383,7 +44432,7 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -44397,7 +44446,8 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -44577,7 +44627,7 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp
         "adc	%[r], r8, r8\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)r;
 }
@@ -45483,7 +45533,7 @@ static void sp_3072_lshift_96(sp_digit* r_p, const sp_digit* a_p, byte n_p)
         "str	r4, [%[r], #4]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -46134,7 +46184,7 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -46145,7 +46195,8 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -46380,7 +46431,7 @@ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -46468,7 +46519,8 @@ SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -46494,7 +46546,8 @@ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -46530,7 +46583,8 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -46732,7 +46786,8 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
         "bgt	L_sp_4096_mul_128_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -46889,7 +46944,8 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_4096_sqr_128_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -47008,7 +47064,7 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #512]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -51121,7 +51177,7 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r5, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -51151,7 +51207,8 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -51177,7 +51234,7 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -51191,7 +51248,8 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -51651,7 +51709,7 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -55416,7 +55474,8 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
 }
@@ -56479,7 +56538,8 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
 }
@@ -57164,7 +57224,8 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
 }
@@ -57207,7 +57268,8 @@ SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -57232,7 +57294,8 @@ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -57244,7 +57307,8 @@ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
+static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -57478,7 +57542,7 @@ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -57538,7 +57602,7 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -57676,7 +57740,7 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -59241,7 +59305,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -59772,7 +59836,8 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em,
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -59798,7 +59863,7 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -59812,7 +59877,8 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -60048,7 +60114,7 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp
         "adc	%[r], r8, r8\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)r;
 }
@@ -61146,7 +61212,7 @@ static void sp_4096_lshift_128(sp_digit* r_p, const sp_digit* a_p, byte n_p)
         "str	r5, [%[r], #4]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -61627,7 +61693,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "bgt	L_sp_256_mul_8_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -63623,7 +63690,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -63976,7 +64044,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "add	sp, sp, #36\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -64002,7 +64071,7 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "strd	%[r], %[a], [sp, #36]\n\t"
 #endif
         "mov	lr, %[b]\n\t"
-        "ldm	%[a], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3}\n\t"
         "ldm	lr!, {r4, r5, r6}\n\t"
         "umull	r10, r11, %[r], r4\n\t"
         "umull	r12, r7, %[a], r4\n\t"
@@ -64047,7 +64116,7 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "umaal	r4, r6, %[b], r7\n\t"
         "sub	lr, lr, #16\n\t"
         "umaal	r5, r6, r3, r7\n\t"
-        "ldm	%[r], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[r], {r0, r1, r2, r3}\n\t"
         "str	r6, [sp, #32]\n\t"
         "ldm	lr!, {r6}\n\t"
         "mov	r7, #0\n\t"
@@ -64107,7 +64176,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p
         "add	sp, sp, #44\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
+            "r8", "r9", "lr"
     );
 }
 
@@ -64267,7 +64337,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_256_sqr_8_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -65478,7 +65549,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2, r3, r4, r8}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -65721,7 +65793,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -65739,7 +65812,7 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
     __asm__ __volatile__ (
         "sub	sp, sp, #32\n\t"
         "str	%[r], [sp, #28]\n\t"
-        "ldm	%[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "umull	r9, r10, %[r], %[r]\n\t"
         "umull	r11, r12, %[r], %[a]\n\t"
         "adds	r11, r11, r11\n\t"
@@ -65826,18 +65899,19 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
         /* R[15] = r7 */
         "ldr	lr, [sp, #28]\n\t"
         "add	lr, lr, #28\n\t"
-        "stm	lr!, {%[r], r12}\n\t"
+        "stm	lr!, {r0, r12}\n\t"
         "stm	lr!, {r11}\n\t"
         "stm	lr!, {r10}\n\t"
         "stm	lr!, {r3, r4, r8, r9}\n\t"
         "stm	lr!, {r7}\n\t"
         "sub	lr, lr, #0x40\n\t"
-        "ldm	sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t"
-        "stm	lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t"
+        "ldm	sp, {r0, r1, r2, r3, r4, r5, r6}\n\t"
+        "stm	lr, {r0, r1, r2, r3, r4, r5, r6}\n\t"
         "add	sp, sp, #32\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
 }
 
@@ -65876,7 +65950,8 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -65913,7 +65988,7 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -65925,7 +66000,8 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
  * a  The number to convert.
  * m  The modulus (prime).
  */
-static int sp_256_mod_mul_norm_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
+static int sp_256_mod_mul_norm_8(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -66149,7 +66225,8 @@ static int sp_256_mod_mul_norm_8(sp_digit* r_p, const sp_digit* a_p, const sp_di
         "add	sp, sp, #24\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr", "r10"
     );
     (void)m_p;
     return (uint32_t)(size_t)r;
@@ -66360,7 +66437,8 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm)
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p, sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -68464,7 +68542,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r12"
     );
     (void)m_p;
     (void)mp_p;
@@ -68480,7 +68559,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p, sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -68941,7 +69021,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -68957,7 +69038,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p, sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -68972,7 +69054,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "strd	%[r], %[a], [sp, #68]\n\t"
 #endif
         "mov	lr, %[b]\n\t"
-        "ldm	%[a], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3}\n\t"
         "ldm	lr!, {r4, r5, r6}\n\t"
         "umull	r10, r11, %[r], r4\n\t"
         "umull	r12, r7, %[a], r4\n\t"
@@ -69017,7 +69099,7 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "umaal	r4, r6, %[b], r7\n\t"
         "sub	lr, lr, #16\n\t"
         "umaal	r5, r6, r3, r7\n\t"
-        "ldm	%[r], {%[r], %[a], %[b], r3}\n\t"
+        "ldm	%[r], {r0, r1, r2, r3}\n\t"
         "str	r6, [sp, #64]\n\t"
         "ldm	lr!, {r6}\n\t"
         "mov	r7, #0\n\t"
@@ -69196,7 +69278,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x4c\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7",
+            "r8", "r9", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -69211,7 +69294,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -70394,7 +70478,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r12", "r8", "r9", "r10", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r12", "r8", "r9",
+            "r10", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -70408,7 +70493,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -70760,7 +70846,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -70774,7 +70861,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
  * m   Modulus (prime).
  * mp  Montgomery multiplier.
  */
-static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -70782,7 +70870,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
     __asm__ __volatile__ (
         "sub	sp, sp, #0x44\n\t"
         "str	%[r], [sp, #64]\n\t"
-        "ldm	%[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t"
+        "ldm	%[a], {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
         "umull	r9, r10, %[r], %[r]\n\t"
         "umull	r11, r12, %[r], %[a]\n\t"
         "adds	r11, r11, r11\n\t"
@@ -70869,7 +70957,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         /* R[15] = r7 */
         "mov	lr, sp\n\t"
         "add	lr, lr, #28\n\t"
-        "stm	lr!, {%[r], r12}\n\t"
+        "stm	lr!, {r0, r12}\n\t"
         "stm	lr!, {r11}\n\t"
         "stm	lr!, {r10}\n\t"
         "stm	lr!, {r3, r4, r8, r9}\n\t"
@@ -71000,7 +71088,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co
         "add	sp, sp, #0x44\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -71232,7 +71321,7 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -71252,7 +71341,8 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a_p, const sp_digit* b_p)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -71278,7 +71368,7 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -71292,7 +71382,8 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -71332,7 +71423,7 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -71620,7 +71711,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -71723,7 +71815,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -71808,7 +71901,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -71959,7 +72053,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
         "add	sp, sp, #0x44\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -71972,7 +72067,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p,
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -72244,7 +72340,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -72256,7 +72353,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -72347,7 +72445,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -72359,7 +72458,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -72432,7 +72532,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp);
 }
@@ -72486,7 +72587,8 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -72530,7 +72632,8 @@ static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "stm	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
     (void)m_p;
 }
@@ -72580,7 +72683,8 @@ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "stm	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r2"
     );
     (void)m_p;
 }
@@ -72662,7 +72766,8 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "stm	%[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r2", "r3", "r12"
     );
     (void)m_p;
 }
@@ -72674,7 +72779,8 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -72716,7 +72822,8 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "stm	%[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r11", "r12", "lr"
     );
     (void)m_p;
 }
@@ -72793,7 +72900,8 @@ static void sp_256_mont_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "stm	%[r], {r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3"
     );
 }
 
@@ -76318,7 +76426,7 @@ static void sp_256_add_one_8(sp_digit* a_p)
         "stm	%[a]!, {r1, r2, r3, r4}\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
 
@@ -76722,7 +76830,8 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -76756,7 +76865,7 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -76856,7 +76965,7 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #32]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -77129,7 +77238,7 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r5, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -77188,7 +77297,7 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -77326,7 +77435,7 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -78010,7 +78119,8 @@ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -78046,7 +78156,7 @@ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit*
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -78126,7 +78236,8 @@ static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p)
 #endif
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr", "r10"
     );
 }
 
@@ -78212,7 +78323,8 @@ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "stm	%[r], {r8, r9, r10, r11}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
 }
 
@@ -78255,7 +78367,8 @@ static const unsigned char L_sp_256_num_bits_8_table[] = {
 static int sp_256_num_bits_8(const sp_digit* a_p)
 {
     register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register unsigned char* L_sp_256_num_bits_8_table_c asm ("r1") = (unsigned char*)&L_sp_256_num_bits_8_table;
+    register unsigned char* L_sp_256_num_bits_8_table_c asm ("r1") =
+        (unsigned char*)&L_sp_256_num_bits_8_table;
 
     __asm__ __volatile__ (
         "mov	lr, %[L_sp_256_num_bits_8_table]\n\t"
@@ -78567,9 +78680,10 @@ static int sp_256_num_bits_8(const sp_digit* a_p)
         "\n"
     "L_sp_256_num_bits_8_9_%=: \n\t"
         "mov	%[a], r12\n\t"
-        : [a] "+r" (a), [L_sp_256_num_bits_8_table] "+r" (L_sp_256_num_bits_8_table_c)
+        : [a] "+r" (a),
+          [L_sp_256_num_bits_8_table] "+r" (L_sp_256_num_bits_8_table_c)
         :
-        : "memory", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -78658,7 +78772,7 @@ static int sp_256_num_bits_8(const sp_digit* a_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -79931,7 +80045,8 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "bgt	L_sp_384_mul_12_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -85428,7 +85543,8 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -85587,7 +85703,8 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_384_sqr_12_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -88650,7 +88767,8 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2, r3, r4, r8}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -88688,7 +88806,8 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -88732,7 +88851,7 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -89043,7 +89162,8 @@ static int sp_384_point_to_ecc_point_12(const sp_point_384* p, ecc_point* pm)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -89069,7 +89189,7 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -89083,7 +89203,8 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -89137,7 +89258,7 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -89540,7 +89661,8 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp);
 }
@@ -89675,7 +89797,8 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp);
 }
@@ -89780,7 +89903,8 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp);
 }
@@ -90101,7 +90225,7 @@ static sp_int32 sp_384_cmp_12(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -90159,7 +90283,8 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_384_mont_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_384_mont_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -90242,7 +90367,8 @@ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -90285,7 +90411,7 @@ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -90300,7 +90426,8 @@ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -90326,7 +90453,7 @@ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_
         "mov	%[r], lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -90340,7 +90467,8 @@ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -90394,7 +90522,7 @@ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_
         "adc	%[r], r8, r8\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)r;
 }
@@ -90407,7 +90535,8 @@ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_384_mont_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_384_mont_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -90477,7 +90606,7 @@ static void sp_384_rshift1_12(sp_digit* r_p, const sp_digit* a_p)
         "str	r4, [%[r], #44]\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r2", "r3", "r4"
     );
 }
 
@@ -94081,7 +94210,7 @@ static void sp_384_add_one_12(sp_digit* a_p)
         "stm	%[a]!, {r1, r2, r3, r4}\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
 
@@ -94485,7 +94614,8 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -94526,7 +94656,7 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -94626,7 +94756,7 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #48]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -95027,7 +95157,7 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -95086,7 +95216,7 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -95224,7 +95354,7 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -95952,7 +96082,8 @@ static void sp_384_div2_mod_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "str	r10, [%[r], #44]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
 }
 
@@ -95995,7 +96126,8 @@ static const unsigned char L_sp_384_num_bits_12_table[] = {
 static int sp_384_num_bits_12(const sp_digit* a_p)
 {
     register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register unsigned char* L_sp_384_num_bits_12_table_c asm ("r1") = (unsigned char*)&L_sp_384_num_bits_12_table;
+    register unsigned char* L_sp_384_num_bits_12_table_c asm ("r1") =
+        (unsigned char*)&L_sp_384_num_bits_12_table;
 
     __asm__ __volatile__ (
         "mov	lr, %[L_sp_384_num_bits_12_table]\n\t"
@@ -96559,9 +96691,10 @@ static int sp_384_num_bits_12(const sp_digit* a_p)
         "\n"
     "L_sp_384_num_bits_12_13_%=: \n\t"
         "mov	%[a], r12\n\t"
-        : [a] "+r" (a), [L_sp_384_num_bits_12_table] "+r" (L_sp_384_num_bits_12_table_c)
+        : [a] "+r" (a),
+          [L_sp_384_num_bits_12_table] "+r" (L_sp_384_num_bits_12_table_c)
         :
-        : "memory", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -96710,7 +96843,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -98032,7 +98165,8 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "bgt	L_sp_521_mul_17_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -109048,7 +109182,8 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_
         "stm	%[r]!, {r3}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -109210,7 +109345,8 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_521_sqr_17_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -115082,7 +115218,8 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -115126,7 +115263,8 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adc	%[r], r4, #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -115181,7 +115319,7 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -115409,7 +115547,8 @@ static int sp_521_point_to_ecc_point_17(const sp_point_521* p, ecc_point* pm)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -115435,7 +115574,7 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -115449,7 +115588,8 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -115522,7 +115662,7 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -115650,7 +115790,8 @@ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p
         "stm	%[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11", "r12", "lr"
     );
     (void)m_p;
     (void)mp_p;
@@ -115663,7 +115804,8 @@ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -116279,7 +116421,8 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp);
 }
@@ -116291,7 +116434,8 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -116537,7 +116681,8 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp);
 }
@@ -116549,7 +116694,8 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi
  * m   The single precision number representing the modulus.
  * mp  The digit representing the negative inverse of m mod 2^n.
  */
-static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
+static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p,
+    sp_digit mp_p)
 {
     register sp_digit* a asm ("r0") = (sp_digit*)a_p;
     register const sp_digit* m asm ("r1") = (const sp_digit*)m_p;
@@ -116750,7 +116896,8 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp);
 }
@@ -117123,7 +117270,7 @@ static sp_int32 sp_521_cmp_17(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -117181,7 +117328,8 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -117256,7 +117404,8 @@ static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "stm	%[r]!, {r4}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     (void)m_p;
 }
@@ -117332,7 +117481,8 @@ static void sp_521_mont_dbl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "stm	%[r]!, {r4}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r2", "r3"
     );
     (void)m_p;
 }
@@ -117428,7 +117578,8 @@ static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "stm	%[r]!, {r4}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r2", "r3"
     );
     (void)m_p;
 }
@@ -117440,7 +117591,8 @@ static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -117516,7 +117668,8 @@ static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "stm	%[r]!, {r4}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     (void)m_p;
 }
@@ -117595,7 +117748,7 @@ static void sp_521_rshift1_17(sp_digit* r_p, const sp_digit* a_p)
         "str	r3, [%[r], #64]\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r2", "r3", "r4"
     );
 }
 
@@ -121830,7 +121983,7 @@ static void sp_521_add_one_17(sp_digit* a_p)
         "stm	%[a]!, {r1}\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r4"
     );
 }
 
@@ -122304,7 +122457,7 @@ static void sp_521_rshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p)
 #endif
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -122424,7 +122577,7 @@ static void sp_521_lshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p)
         "str	r5, [%[r], #4]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -122642,7 +122795,7 @@ static void sp_521_lshift_34(sp_digit* r_p, const sp_digit* a_p, byte n_p)
         "str	r6, [%[r], #4]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
         :
-        : "memory", "r4", "r5", "r6", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r3", "r12"
     );
 }
 
@@ -122681,7 +122834,8 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], %[a], %[a]\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -122733,7 +122887,7 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -122833,7 +122987,7 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #68]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -123394,7 +123548,7 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r5, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -123453,7 +123607,7 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -123591,7 +123745,7 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -124281,7 +124435,8 @@ static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12", "lr"
     );
     return (uint32_t)(size_t)r;
 }
@@ -124335,7 +124490,7 @@ static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -124481,7 +124636,8 @@ static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "str	r9, [%[r], #64]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
 }
 
@@ -124524,7 +124680,8 @@ static const unsigned char L_sp_521_num_bits_17_table[] = {
 static int sp_521_num_bits_17(const sp_digit* a_p)
 {
     register const sp_digit* a asm ("r0") = (const sp_digit*)a_p;
-    register unsigned char* L_sp_521_num_bits_17_table_c asm ("r1") = (unsigned char*)&L_sp_521_num_bits_17_table;
+    register unsigned char* L_sp_521_num_bits_17_table_c asm ("r1") =
+        (unsigned char*)&L_sp_521_num_bits_17_table;
 
     __asm__ __volatile__ (
         "mov	lr, %[L_sp_521_num_bits_17_table]\n\t"
@@ -125403,9 +125560,10 @@ static int sp_521_num_bits_17(const sp_digit* a_p)
         "\n"
     "L_sp_521_num_bits_17_18_%=: \n\t"
         "mov	%[a], r12\n\t"
-        : [a] "+r" (a), [L_sp_521_num_bits_17_table] "+r" (L_sp_521_num_bits_17_table_c)
+        : [a] "+r" (a),
+          [L_sp_521_num_bits_17_table] "+r" (L_sp_521_num_bits_17_table_c)
         :
-        : "memory", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -125629,7 +125787,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a)
         :
-        : "memory", "r1", "r2", "r3", "r12", "lr", "cc"
+        : "memory", "cc", "r1", "r2", "r3", "r12", "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -136390,7 +136548,8 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "stm	%[r]!, {r3, r4, r5, r6}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11",
+            "r12"
     );
 }
 
@@ -141622,7 +141781,8 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p)
         "stm	%[r]!, {r2, r3, r4, r8}\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+            "r12"
     );
 }
 
@@ -141671,7 +141831,7 @@ static sp_digit sp_1024_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -141746,7 +141906,7 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
         "sbc	%[a], r9, r9\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
     return (uint32_t)(size_t)a;
 }
@@ -141824,7 +141984,7 @@ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "adc	%[r], %[r], #0\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -141942,7 +142102,7 @@ static sp_digit sp_1024_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "sbc	%[r], r6, r6\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
     );
     return (uint32_t)(size_t)r;
 }
@@ -142179,7 +142339,8 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b
         "bgt	L_sp_1024_mul_32_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -142336,7 +142497,8 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p)
         "bgt	L_sp_1024_sqr_32_store_%=\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr",
+            "r11"
     );
 }
 
@@ -142456,7 +142618,8 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r12\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "cc"
+        : "memory", "cc", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12",
+            "lr"
     );
     return (uint32_t)(size_t)a;
 }
@@ -142471,7 +142634,8 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -142497,7 +142661,7 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], r12\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -142511,7 +142675,8 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to subtract.
  * m  Mask value to apply.
  */
-static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -142635,7 +142800,7 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "sbc	%[r], lr, lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
     );
     return (uint32_t)(size_t)r;
 }
@@ -142674,7 +142839,8 @@ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi
         "mov	%[r], r3\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r3", "r12"
     );
     return (uint32_t)(size_t)r;
 }
@@ -142774,7 +142940,7 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r3, [%[r], #128]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
     );
 }
 
@@ -143815,7 +143981,7 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
         "str	r5, [%[r]]\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8"
     );
 }
 
@@ -143874,7 +144040,7 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "add	%[d1], r4, r3\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -144012,7 +144178,7 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
         "sub	%[d1], r3, r6\n\t"
         : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)d1;
 }
@@ -144441,7 +144607,7 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
         "mov	%[a], r2\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         :
-        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)a;
 }
@@ -145755,7 +145921,8 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_1024_cond_sub_32(a - 32, a, m, mp);
 }
@@ -146055,7 +146222,8 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], r3\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_1024_cond_sub_32(a - 32, a, m, mp);
 }
@@ -146265,7 +146433,8 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_
         "mov	%[mp], lr\n\t"
         : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
         :
-        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
+        : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9",
+            "r10", "r11"
     );
     sp_1024_cond_sub_32(a - 32, a, m, mp);
 }
@@ -146415,7 +146584,8 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
  * b   Second number to add in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -146577,7 +146747,8 @@ static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r12"
     );
 }
 
@@ -146587,7 +146758,8 @@ static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
  * a   Number to double in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
+static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -146732,7 +146904,8 @@ static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7", "r12", "cc"
+        : "memory", "cc", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7",
+            "r12"
     );
 }
 
@@ -146742,7 +146915,8 @@ static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
  * a   Number to triple in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p)
+static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -147042,7 +147216,8 @@ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
         :
-        : "memory", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7", "r12", "cc"
+        : "memory", "cc", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7",
+            "r12"
     );
 }
 
@@ -147053,7 +147228,8 @@ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
  * b   Number to subtract with in Montgomery form.
  * m   Modulus (prime).
  */
-static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p)
+static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, const sp_digit* m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -147209,7 +147385,8 @@ static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
         "stm	%[r]!, {r4, r5, r6, r7}\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
+        : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
+            "r12"
     );
 }
 
@@ -147222,7 +147399,8 @@ static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -147248,7 +147426,7 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "mov	%[r], lr\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
     );
     return (uint32_t)(size_t)r;
 }
@@ -147262,7 +147440,8 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
  * b  A single precision number to add.
  * m  Mask value to apply.
  */
-static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
+static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p,
+    const sp_digit* b_p, sp_digit m_p)
 {
     register sp_digit* r asm ("r0") = (sp_digit*)r_p;
     register const sp_digit* a asm ("r1") = (const sp_digit*)a_p;
@@ -147386,7 +147565,7 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp
         "adc	%[r], r8, r8\n\t"
         : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
         :
-        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "cc"
+        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
     );
     return (uint32_t)(size_t)r;
 }
@@ -147526,7 +147705,7 @@ static void sp_1024_rshift1_32(sp_digit* r_p, const sp_digit* a_p)
         "str	r3, [%[r], #124]\n\t"
         : [r] "+r" (r), [a] "+r" (a)
         :
-        : "memory", "r2", "r3", "r4", "cc"
+        : "memory", "cc", "r2", "r3", "r4"
     );
 }