aboutsummaryrefslogtreecommitdiff
path: root/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
diff options
context:
space:
mode:
authorauth12 <[email protected]>2020-07-19 11:57:04 -0700
committerGitHub <[email protected]>2020-07-19 11:57:04 -0700
commit1bae439a35a3aadca6772716aaeea8c8a0991114 (patch)
treef8eab7a7bae237ad697feecfae26b17bab91b16e /client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
parentMore placeholders and general plan. (diff)
parentMerge branch 'master' into windows (diff)
downloadloader-1bae439a35a3aadca6772716aaeea8c8a0991114.tar.xz
loader-1bae439a35a3aadca6772716aaeea8c8a0991114.zip
Merge pull request #1 from auth12/windows
Windows
Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c')
-rw-r--r--client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c5581
1 files changed, 5581 insertions, 0 deletions
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
new file mode 100644
index 0000000..f7ef379
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
@@ -0,0 +1,5581 @@
+/* armv8-32-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
+ */
+
+#ifndef __aarch64__
+
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/fe_operations.h>
+#include <stdint.h>
+
+/* One-time initialisation hook for the field-element (fe) backend.
+ * The ARM32 implementation needs no runtime setup; the empty asm (with a
+ * "memory" clobber) exists only so every backend exposes the same API. */
+void fe_init()
+{
+ __asm__ __volatile__ (
+ "\n\t"
+ :
+ :
+ : "memory"
+ );
+}
+
+/* Load a 32-byte little-endian buffer 'in' into field element 'out'.
+ * Copies the eight 32-bit limbs and clears bit 255 (top bit of the last
+ * limb); the value is NOT fully reduced modulo 2^255-19 here. */
+void fe_frombytes(fe out, const unsigned char* in)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[in]]\n\t"
+ "ldrd r12, lr, [%[in], #8]\n\t"
+ "ldrd r4, r5, [%[in], #16]\n\t"
+ "ldrd r6, r7, [%[in], #24]\n\t"
+ /* Mask off bit 255 of the loaded value. */
+ "and r7, r7, #0x7fffffff\n\t"
+ "strd r2, r3, [%[out]]\n\t"
+ "strd r12, lr, [%[out], #8]\n\t"
+ "strd r4, r5, [%[out], #16]\n\t"
+ "strd r6, r7, [%[out], #24]\n\t"
+ : [out] "+r" (out), [in] "+r" (in)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7"
+ );
+}
+
+/* Serialise field element 'n' into 32 little-endian bytes at 'out',
+ * producing the canonical (fully reduced mod 2^255-19) representation.
+ *
+ * Branch-free reduction: compute n + 19 and take bit 255 of the result;
+ * that bit is set exactly when n >= 2^255-19, in which case 19 is added
+ * to n and bit 255 is masked off (equivalent to subtracting the modulus).
+ *
+ * Fix: the loads referenced %[in], but the input operand is bound as [n]
+ * below — GCC rejects that ("undefined named operand 'in'"); use %[n]. */
+void fe_tobytes(unsigned char* out, const fe n)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[n]]\n\t"
+ "ldrd r12, lr, [%[n], #8]\n\t"
+ "ldrd r4, r5, [%[n], #16]\n\t"
+ "ldrd r6, r7, [%[n], #24]\n\t"
+ /* Carry chain of n + 19; final r8 top bit = bit 255 of n + 19. */
+ "adds r8, r2, #19\n\t"
+ "adcs r8, r3, #0\n\t"
+ "adcs r8, r12, #0\n\t"
+ "adcs r8, lr, #0\n\t"
+ "adcs r8, r4, #0\n\t"
+ "adcs r8, r5, #0\n\t"
+ "adcs r8, r6, #0\n\t"
+ "adc r8, r7, #0\n\t"
+ /* r8 = 19 when n >= 2^255-19, else 0 (branch-free select). */
+ "asr r8, r8, #31\n\t"
+ "and r8, r8, #19\n\t"
+ "adds r2, r2, r8\n\t"
+ "adcs r3, r3, #0\n\t"
+ "adcs r12, r12, #0\n\t"
+ "adcs lr, lr, #0\n\t"
+ "adcs r4, r4, #0\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adc r7, r7, #0\n\t"
+ /* Drop bit 255 to complete the conditional modulus subtraction. */
+ "and r7, r7, #0x7fffffff\n\t"
+ "strd r2, r3, [%[out]]\n\t"
+ "strd r12, lr, [%[out], #8]\n\t"
+ "strd r4, r5, [%[out], #16]\n\t"
+ "strd r6, r7, [%[out], #24]\n\t"
+ : [out] "+r" (out), [n] "+r" (n)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+ );
+}
+
+/* Set field element n to the value 1: limb 0 = 1, remaining seven limbs
+ * zero. */
+void fe_1(fe n)
+{
+ __asm__ __volatile__ (
+ /* Set one */
+ "mov r2, #1\n\t"
+ "mov r1, #0\n\t"
+ "strd r2, r1, [%[n]]\n\t"
+ "strd r1, r1, [%[n], #8]\n\t"
+ "strd r1, r1, [%[n], #16]\n\t"
+ "strd r1, r1, [%[n], #24]\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "r1", "r2"
+ );
+}
+
+/* Set field element n to zero (all eight 32-bit limbs cleared). */
+void fe_0(fe n)
+{
+ __asm__ __volatile__ (
+ /* Set zero */
+ "mov r1, #0\n\t"
+ "strd r1, r1, [%[n]]\n\t"
+ "strd r1, r1, [%[n], #8]\n\t"
+ "strd r1, r1, [%[n], #16]\n\t"
+ "strd r1, r1, [%[n], #24]\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "r1"
+ );
+}
+
+/* Copy field element a into r: eight 32-bit limbs moved two doublewords
+ * at a time with ldrd/strd. */
+void fe_copy(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ /* Copy */
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "strd r2, r3, [%[r]]\n\t"
+ "strd r12, lr, [%[r], #8]\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "strd r2, r3, [%[r], #16]\n\t"
+ "strd r12, lr, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr"
+ );
+}
+
+/* r = a - b mod 2^255-19.
+ * Limb-wise subtract with borrow; the low half is stored to r and then
+ * re-loaded so the borrow chain can continue through the high half.  The
+ * arithmetic shift of the top limb yields an all-ones/all-zeros mask used
+ * to add the modulus back when the subtraction underflowed — straight-line
+ * code with no data-dependent branches (constant time). */
+void fe_sub(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ /* Sub */
+ "ldrd r12, lr, [%[a]]\n\t"
+ "ldrd r4, r5, [%[a], #8]\n\t"
+ "ldrd r6, r7, [%[b]]\n\t"
+ "ldrd r8, r9, [%[b], #8]\n\t"
+ "subs r6, r12, r6\n\t"
+ "sbcs r7, lr, r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "strd r6, r7, [%[r]]\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ "ldrd r12, lr, [%[a], #16]\n\t"
+ "ldrd r4, r5, [%[a], #24]\n\t"
+ "ldrd r6, r7, [%[b], #16]\n\t"
+ "ldrd r8, r9, [%[b], #24]\n\t"
+ "sbcs r6, r12, r6\n\t"
+ "sbcs r7, lr, r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbc r9, r5, r9\n\t"
+ "mov r10, #-19\n\t"
+ /* r3 = 0 or -1 from the sign of the top limb (underflow indicator). */
+ "asr r3, r9, #31\n\t"
+ /* Mask the modulus */
+ "and r10, r3, r10\n\t"
+ "and r11, r3, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r12, lr, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "adds r12, r12, r10\n\t"
+ "adcs lr, lr, r3\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adcs r7, r7, r3\n\t"
+ "adcs r8, r8, r3\n\t"
+ "adc r9, r9, r11\n\t"
+ "strd r12, lr, [%[r]]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [%[r], #16]\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+/* r = a + b mod 2^255-19.
+ * Limb-wise add with carry (low half stored and re-loaded so the carry
+ * chain continues through the high half), then the sign of the top limb
+ * forms an all-ones/all-zeros mask used to conditionally subtract the
+ * modulus — branch-free, mirroring fe_sub's correction step. */
+void fe_add(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ /* Add */
+ "ldrd r12, lr, [%[a]]\n\t"
+ "ldrd r4, r5, [%[a], #8]\n\t"
+ "ldrd r6, r7, [%[b]]\n\t"
+ "ldrd r8, r9, [%[b], #8]\n\t"
+ "adds r6, r12, r6\n\t"
+ "adcs r7, lr, r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "strd r6, r7, [%[r]]\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ "ldrd r12, lr, [%[a], #16]\n\t"
+ "ldrd r4, r5, [%[a], #24]\n\t"
+ "ldrd r6, r7, [%[b], #16]\n\t"
+ "ldrd r8, r9, [%[b], #24]\n\t"
+ "adcs r6, r12, r6\n\t"
+ "adcs r7, lr, r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adc r9, r5, r9\n\t"
+ "mov r10, #-19\n\t"
+ /* r3 = 0 or -1 from the sign of the top limb (overflow indicator). */
+ "asr r3, r9, #31\n\t"
+ /* Mask the modulus */
+ "and r10, r3, r10\n\t"
+ "and r11, r3, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r12, lr, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "subs r12, r12, r10\n\t"
+ "sbcs lr, lr, r3\n\t"
+ "sbcs r4, r4, r3\n\t"
+ "sbcs r5, r5, r3\n\t"
+ "sbcs r6, r6, r3\n\t"
+ "sbcs r7, r7, r3\n\t"
+ "sbcs r8, r8, r3\n\t"
+ "sbc r9, r9, r11\n\t"
+ "strd r12, lr, [%[r]]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [%[r], #16]\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+/* r = -a mod 2^255-19, computed as (2^255-19) - a: the subtrahend limbs
+ * are the modulus (-19, -1 x6, 0x7fffffff) with the borrow propagated
+ * across all eight limbs.
+ * NOTE(review): no reduction of the result is performed, which looks like
+ * it assumes a is already reduced below the modulus — confirm callers. */
+void fe_neg(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "mov r5, #-1\n\t"
+ "mov r4, #-19\n\t"
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "subs r2, r4, r2\n\t"
+ "sbcs r3, r5, r3\n\t"
+ "sbcs r12, r5, r12\n\t"
+ "sbcs lr, r5, lr\n\t"
+ "strd r2, r3, [%[r]]\n\t"
+ "strd r12, lr, [%[r], #8]\n\t"
+ /* Top limb of the modulus is 0x7fffffff. */
+ "mov r4, #0x7fffffff\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "sbcs r2, r5, r2\n\t"
+ "sbcs r3, r5, r3\n\t"
+ "sbcs r12, r5, r12\n\t"
+ "sbc lr, r4, lr\n\t"
+ "strd r2, r3, [%[r], #16]\n\t"
+ "strd r12, lr, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5"
+ );
+}
+
+/* Return non-zero iff a is non-zero mod 2^255-19.
+ * First canonicalises a with the same add-19 / mask-bit-255 trick as
+ * fe_tobytes, then ORs all eight reduced limbs together; the combined
+ * value is returned through the repurposed 'a' operand register.
+ * NOTE(review): r8 is in the clobber list but never written — harmless. */
+int fe_isnonzero(const fe a)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "ldrd r4, r5, [%[a], #16]\n\t"
+ "ldrd r6, r7, [%[a], #24]\n\t"
+ /* Carry chain of a + 19; r1 top bit = bit 255 of a + 19. */
+ "adds r1, r2, #19\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ "adcs r1, lr, #0\n\t"
+ "adcs r1, r4, #0\n\t"
+ "adcs r1, r5, #0\n\t"
+ "adcs r1, r6, #0\n\t"
+ "adc r1, r7, #0\n\t"
+ /* r1 = 19 when a >= 2^255-19, else 0. */
+ "asr r1, r1, #31\n\t"
+ "and r1, r1, #19\n\t"
+ "adds r2, r2, r1\n\t"
+ "adcs r3, r3, #0\n\t"
+ "adcs r12, r12, #0\n\t"
+ "adcs lr, lr, #0\n\t"
+ "adcs r4, r4, #0\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adc r7, r7, #0\n\t"
+ "and r7, r7, #0x7fffffff\n\t"
+ /* OR-reduce the canonical limbs into a single word. */
+ "orr r2, r2, r3\n\t"
+ "orr r12, r12, lr\n\t"
+ "orr r4, r4, r5\n\t"
+ "orr r6, r6, r7\n\t"
+ "orr r12, r12, r4\n\t"
+ "orr r2, r2, r6\n\t"
+ "orr %[a], r2, r12\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "r1", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+ );
+ return (uint32_t)(size_t)a;
+}
+
+/* Return the "sign" of a: the least-significant bit of its canonical
+ * (fully reduced mod 2^255-19) form.
+ * r1 accumulates the carry chain of a + 19, so r1 >> 31 is bit 255 of
+ * a + 19, i.e. 1 exactly when a >= the modulus.  Since the modulus is
+ * odd, reduction flips parity, so the result is (a[0] & 1) ^ (a >= p). */
+int fe_isnegative(const fe a)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "adds r1, r2, #19\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ "adcs r1, lr, #0\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "adcs r1, r2, #0\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ /* Re-load a[0] (r2 was overwritten by the high limbs above). */
+ "ldr r2, [%[a]]\n\t"
+ "adc r1, lr, #0\n\t"
+ "and %[a], r2, #1\n\t"
+ "lsr r1, r1, #31\n\t"
+ "eor %[a], %[a], r1\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "r1", "r2", "r3", "r12", "lr"
+ );
+ return (uint32_t)(size_t)a;
+}
+
+void fe_cmov_table(fe* r, fe* base, signed char b)
+{
+ __asm__ __volatile__ (
+ "sxtb %[b], %[b]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #1\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #1\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-19\n\t"
+ "mov r9, #-1\n\t"
+ "subs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r]]\n\t"
+ "strd lr, r4, [%[r], #32]\n\t"
+ "strd r5, r6, [%[r], #64]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #-1\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #8]\n\t"
+ "strd lr, r4, [%[r], #40]\n\t"
+ "strd r5, r6, [%[r], #72]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #-1\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #16]\n\t"
+ "strd lr, r4, [%[r], #48]\n\t"
+ "strd r5, r6, [%[r], #80]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #0x7fffffff\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbc r9, r9, r6\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #24]\n\t"
+ "strd lr, r4, [%[r], #56]\n\t"
+ "strd r5, r6, [%[r], #88]\n\t"
+ : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+/* Multiply two field elements modulo 2^255 - 19: r = (a * b) mod p.
+ *
+ * ARMv8-32 inline assembly in two phases:
+ *  1. A schoolbook 8x8 multiply of the 32-bit limbs of a and b
+ *     (umull + adds/adcs carry chains, one "diagonal" per result word),
+ *     spilling the full 16-word (512-bit) product to a 64-byte stack
+ *     buffer. The "A[i] * B[j] = k" comments give the diagonal index k
+ *     of each partial product.
+ *  2. Reduction: each high word (sp+#32..#60), shifted left by one bit
+ *     so the fold is aligned below bit 255, is multiplied by 19 and
+ *     added into the corresponding low word; the leftover overflow word
+ *     is folded the same way ("Overflow"), and finally 19 is added once
+ *     more if bit 255 is still set ("Reduce if top bit set"), masking
+ *     the top limb to 31 bits each time.
+ * The eight result words are stored at r. The r/a/b operands are
+ * declared "+r" because the asm clobbers the registers holding them
+ * (a and b are reused as scratch during the reduction).
+ */
+void fe_mul(fe r, const fe a, const fe b)
+{
+    __asm__ __volatile__ (
+        "sub	sp, sp, #0x40\n\t"
+        /* Multiply */
+        "ldr	r7, [%[a]]\n\t"
+        "ldr	r8, [%[a], #4]\n\t"
+        "ldr	r9, [%[b]]\n\t"
+        "ldr	lr, [%[b], #4]\n\t"
+        /* A[0] * B[0] = 0 */
+        "umull	r4, r5, r7, r9\n\t"
+        "str	r4, [sp]\n\t"
+        /* A[0] * B[1] = 1 */
+        "umull	r3, r6, r7, lr\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[1] * B[0] = 1 */
+        "umull	r3, r12, r8, r9\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #4]\n\t"
+        /* A[2] * B[0] = 2 */
+        "ldr	r10, [%[a], #8]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adc	r4, r4, r12\n\t"
+        /* A[1] * B[1] = 2 */
+        "umull	r3, r12, r8, lr\n\t"
+        "adds	r6, r6, r3\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[0] * B[2] = 2 */
+        "ldr	r11, [%[b], #8]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #8]\n\t"
+        /* A[0] * B[3] = 3 */
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[1] * B[2] = 3 */
+        "ldr	r11, [%[b], #8]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[2] * B[1] = 3 */
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[3] * B[0] = 3 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #12]\n\t"
+        /* A[4] * B[0] = 4 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[3] * B[1] = 4 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[2] * B[2] = 4 */
+        "ldr	r10, [%[a], #8]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[1] * B[3] = 4 */
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[0] * B[4] = 4 */
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #16]\n\t"
+        /* A[0] * B[5] = 5 */
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[1] * B[4] = 5 */
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[2] * B[3] = 5 */
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[3] * B[2] = 5 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "ldr	r11, [%[b], #8]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[4] * B[1] = 5 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[5] * B[0] = 5 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #20]\n\t"
+        /* A[6] * B[0] = 6 */
+        "ldr	r10, [%[a], #24]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r4, r4, r3\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[5] * B[1] = 6 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[4] * B[2] = 6 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[3] * B[3] = 6 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[2] * B[4] = 6 */
+        "ldr	r10, [%[a], #8]\n\t"
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[1] * B[5] = 6 */
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[0] * B[6] = 6 */
+        "ldr	r11, [%[b], #24]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #24]\n\t"
+        /* A[0] * B[7] = 7 */
+        "ldr	r11, [%[b], #28]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[1] * B[6] = 7 */
+        "ldr	r11, [%[b], #24]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[2] * B[5] = 7 */
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[3] * B[4] = 7 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[4] * B[3] = 7 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[5] * B[2] = 7 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "ldr	r11, [%[b], #8]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[6] * B[1] = 7 */
+        "ldr	r10, [%[a], #24]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[7] * B[0] = 7 */
+        "ldr	r10, [%[a], #28]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #28]\n\t"
+        "ldr	r7, [%[a], #24]\n\t"
+        "ldr	r9, [%[b], #24]\n\t"
+        /* A[7] * B[1] = 8 */
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r6, r6, r3\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[6] * B[2] = 8 */
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[5] * B[3] = 8 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[4] * B[4] = 8 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[3] * B[5] = 8 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[2] * B[6] = 8 */
+        "ldr	r10, [%[a], #8]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[1] * B[7] = 8 */
+        "ldr	r11, [%[b], #28]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #32]\n\t"
+        "ldr	r8, [%[a], #28]\n\t"
+        "mov	lr, r11\n\t"
+        /* A[2] * B[7] = 9 */
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r4, r4, r3\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[3] * B[6] = 9 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[4] * B[5] = 9 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[5] * B[4] = 9 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[6] * B[3] = 9 */
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[7] * B[2] = 9 */
+        "ldr	r11, [%[b], #8]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #36]\n\t"
+        /* A[7] * B[3] = 10 */
+        "ldr	r11, [%[b], #12]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[6] * B[4] = 10 */
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[5] * B[5] = 10 */
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r10, r11\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[4] * B[6] = 10 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[3] * B[7] = 10 */
+        "ldr	r10, [%[a], #12]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #40]\n\t"
+        /* A[4] * B[7] = 11 */
+        "ldr	r10, [%[a], #16]\n\t"
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r6, r6, r3\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[5] * B[6] = 11 */
+        "ldr	r10, [%[a], #20]\n\t"
+        "umull	r3, r12, r10, r9\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[6] * B[5] = 11 */
+        "umull	r3, r12, r7, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[7] * B[4] = 11 */
+        "ldr	r11, [%[b], #16]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adcs	r4, r4, r12\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #44]\n\t"
+        /* A[7] * B[5] = 12 */
+        "ldr	r11, [%[b], #20]\n\t"
+        "umull	r3, r12, r8, r11\n\t"
+        "adds	r4, r4, r3\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[6] * B[6] = 12 */
+        "umull	r3, r12, r7, r9\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[5] * B[7] = 12 */
+        "umull	r3, r12, r10, lr\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #48]\n\t"
+        /* A[6] * B[7] = 13 */
+        "umull	r3, r12, r7, lr\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[7] * B[6] = 13 */
+        "umull	r3, r12, r8, r9\n\t"
+        "adds	r5, r5, r3\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #52]\n\t"
+        /* A[7] * B[7] = 14 */
+        "umull	r3, r12, r8, lr\n\t"
+        "adds	r6, r6, r3\n\t"
+        "adc	r4, r4, r12\n\t"
+        "str	r6, [sp, #56]\n\t"
+        "str	r4, [sp, #60]\n\t"
+        /* Reduce */
+        /* Load bottom half */
+        "ldrd	r4, r5, [sp]\n\t"
+        "ldrd	r6, r7, [sp, #8]\n\t"
+        "ldrd	r8, r9, [sp, #16]\n\t"
+        "ldrd	r10, r11, [sp, #24]\n\t"
+        "lsr	r3, r11, #31\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "mov	lr, #19\n\t"
+        "ldr	%[a], [sp, #32]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "adds	r4, r4, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r5, r5, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #36]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r5, r5, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r6, r6, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #40]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r6, r6, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r7, r7, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #44]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r7, r7, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r8, r8, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #48]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r8, r8, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r9, r9, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #52]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r9, r9, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r10, r10, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #56]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "add	r12, r12, %[b]\n\t"
+        "adds	r10, r10, r3\n\t"
+        "mov	%[b], #0\n\t"
+        "adcs	r11, r11, r12\n\t"
+        "adc	%[b], %[b], #0\n\t"
+        "lsr	r3, %[a], #31\n\t"
+        "ldr	%[a], [sp, #60]\n\t"
+        "orr	r3, r3, %[a], lsl #1\n\t"
+        "umull	r3, r12, lr, r3\n\t"
+        "adds	r11, r11, r3\n\t"
+        "adc	r3, r12, %[b]\n\t"
+        /* Overflow */
+        "lsl	r3, r3, #1\n\t"
+        "orr	r3, r3, r11, lsr #31\n\t"
+        "mul	r3, r3, lr\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, #0\n\t"
+        "adcs	r6, r6, #0\n\t"
+        "adcs	r7, r7, #0\n\t"
+        "adcs	r8, r8, #0\n\t"
+        "adcs	r9, r9, #0\n\t"
+        "adcs	r10, r10, #0\n\t"
+        "adc	r11, r11, #0\n\t"
+        /* Reduce if top bit set */
+        "asr	r3, r11, #31\n\t"
+        "and	r3, r3, lr\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "adds	r4, r4, r3\n\t"
+        "adcs	r5, r5, #0\n\t"
+        "adcs	r6, r6, #0\n\t"
+        "adcs	r7, r7, #0\n\t"
+        "adcs	r8, r8, #0\n\t"
+        "adcs	r9, r9, #0\n\t"
+        "adcs	r10, r10, #0\n\t"
+        "adc	r11, r11, #0\n\t"
+        /* Store */
+        "strd	r4, r5, [%[r]]\n\t"
+        "strd	r6, r7, [%[r], #8]\n\t"
+        "strd	r8, r9, [%[r], #16]\n\t"
+        "strd	r10, r11, [%[r], #24]\n\t"
+        "add	sp, sp, #0x40\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+}
+
+/* Square a field element modulo 2^255 - 19: r = (a * a) mod p.
+ *
+ * ARMv8-32 inline assembly. Like fe_mul but exploits symmetry: each
+ * off-diagonal partial product A[i]*A[j] (i != j) is computed once and
+ * added twice (the repeated adds/adcs pair after each umull), while the
+ * diagonal terms A[i]*A[i] are added once. The 512-bit intermediate is
+ * accumulated into a 64-byte stack buffer, then reduced exactly as in
+ * fe_mul: high words (shifted up one bit) are multiplied by 19 and
+ * folded into the low half, the leftover overflow word is folded the
+ * same way, and 19 is added once more if bit 255 remains set, masking
+ * the top limb to 31 bits each time. The eight result words are stored
+ * at r. r and a are "+r" because the asm clobbers their registers.
+ */
+void fe_sq(fe r, const fe a)
+{
+    __asm__ __volatile__ (
+        "sub	sp, sp, #0x40\n\t"
+        /* Square */
+        "ldr	r7, [%[a]]\n\t"
+        "ldr	r8, [%[a], #4]\n\t"
+        "ldr	r9, [%[a], #8]\n\t"
+        "ldr	r10, [%[a], #12]\n\t"
+        "ldr	r12, [%[a], #16]\n\t"
+        /* A[0] * A[0] = 0 */
+        "umull	r4, r5, r7, r7\n\t"
+        "str	r4, [sp]\n\t"
+        /* A[0] * A[1] = 1 */
+        "umull	r2, r3, r7, r8\n\t"
+        "mov	r6, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adc	r6, r6, r3\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #4]\n\t"
+        /* A[1] * A[1] = 2 */
+        "umull	r2, r3, r8, r8\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adc	r4, r4, r3\n\t"
+        /* A[0] * A[2] = 2 */
+        "umull	r2, r3, r7, r9\n\t"
+        "adds	r6, r6, r2\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #8]\n\t"
+        /* A[0] * A[3] = 3 */
+        "umull	r2, r3, r7, r10\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adc	r5, r5, r3\n\t"
+        "adds	r4, r4, r2\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[1] * A[2] = 3 */
+        "umull	r2, r3, r8, r9\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #12]\n\t"
+        /* A[2] * A[2] = 4 */
+        "umull	r2, r3, r9, r9\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[1] * A[3] = 4 */
+        "umull	r2, r3, r8, r10\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[0] * A[4] = 4 */
+        "umull	r2, r3, r7, r12\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #16]\n\t"
+        /* A[0] * A[5] = 5 */
+        "ldr	r11, [%[a], #20]\n\t"
+        "umull	r2, r3, r7, r11\n\t"
+        "adds	r6, r6, r2\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[1] * A[4] = 5 */
+        "umull	r2, r3, r8, r12\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[2] * A[3] = 5 */
+        "umull	r2, r3, r9, r10\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #20]\n\t"
+        /* A[3] * A[3] = 6 */
+        "umull	r2, r3, r10, r10\n\t"
+        "adds	r4, r4, r2\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[2] * A[4] = 6 */
+        "umull	r2, r3, r9, r12\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[1] * A[5] = 6 */
+        "umull	r2, r3, r8, r11\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[0] * A[6] = 6 */
+        "ldr	r11, [%[a], #24]\n\t"
+        "umull	r2, r3, r7, r11\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #24]\n\t"
+        /* A[0] * A[7] = 7 */
+        "ldr	r11, [%[a], #28]\n\t"
+        "umull	r2, r3, r7, r11\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[1] * A[6] = 7 */
+        "ldr	r11, [%[a], #24]\n\t"
+        "umull	r2, r3, r8, r11\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[2] * A[5] = 7 */
+        "ldr	r11, [%[a], #20]\n\t"
+        "umull	r2, r3, r9, r11\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[3] * A[4] = 7 */
+        "umull	r2, r3, r10, r12\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #28]\n\t"
+        /* A[4] * A[4] = 8 */
+        "umull	r2, r3, r12, r12\n\t"
+        "adds	r6, r6, r2\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[3] * A[5] = 8 */
+        "umull	r2, r3, r10, r11\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[2] * A[6] = 8 */
+        "ldr	r11, [%[a], #24]\n\t"
+        "umull	r2, r3, r9, r11\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[1] * A[7] = 8 */
+        "ldr	r11, [%[a], #28]\n\t"
+        "umull	r2, r3, r8, r11\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #32]\n\t"
+        "ldr	r7, [%[a], #20]\n\t"
+        /* A[2] * A[7] = 9 */
+        "umull	r2, r3, r9, r11\n\t"
+        "adds	r4, r4, r2\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[3] * A[6] = 9 */
+        "ldr	r11, [%[a], #24]\n\t"
+        "umull	r2, r3, r10, r11\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[4] * A[5] = 9 */
+        "umull	r2, r3, r12, r7\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #36]\n\t"
+        "mov	r8, r11\n\t"
+        /* A[5] * A[5] = 10 */
+        "umull	r2, r3, r7, r7\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[4] * A[6] = 10 */
+        "umull	r2, r3, r12, r8\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        /* A[3] * A[7] = 10 */
+        "ldr	r11, [%[a], #28]\n\t"
+        "umull	r2, r3, r10, r11\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #40]\n\t"
+        "mov	r9, r11\n\t"
+        /* A[4] * A[7] = 11 */
+        "umull	r2, r3, r12, r9\n\t"
+        "adds	r6, r6, r2\n\t"
+        "mov	r5, #0\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        /* A[5] * A[6] = 11 */
+        "umull	r2, r3, r7, r8\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adcs	r4, r4, r3\n\t"
+        "adc	r5, r5, #0\n\t"
+        "str	r6, [sp, #44]\n\t"
+        /* A[6] * A[6] = 12 */
+        "umull	r2, r3, r8, r8\n\t"
+        "adds	r4, r4, r2\n\t"
+        "mov	r6, #0\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        /* A[5] * A[7] = 12 */
+        "umull	r2, r3, r7, r9\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	r6, r6, #0\n\t"
+        "str	r4, [sp, #48]\n\t"
+        /* A[6] * A[7] = 13 */
+        "umull	r2, r3, r8, r9\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	r4, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "adds	r5, r5, r2\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	r4, r4, #0\n\t"
+        "str	r5, [sp, #52]\n\t"
+        /* A[7] * A[7] = 14 */
+        "umull	r2, r3, r9, r9\n\t"
+        "adds	r6, r6, r2\n\t"
+        "adc	r4, r4, r3\n\t"
+        "str	r6, [sp, #56]\n\t"
+        "str	r4, [sp, #60]\n\t"
+        /* Reduce */
+        /* Load bottom half */
+        "ldrd	r4, r5, [sp]\n\t"
+        "ldrd	r6, r7, [sp, #8]\n\t"
+        "ldrd	r8, r9, [sp, #16]\n\t"
+        "ldrd	r10, r11, [sp, #24]\n\t"
+        "lsr	r2, r11, #31\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "mov	r12, #19\n\t"
+        "ldr	%[a], [sp, #32]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "adds	r4, r4, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r5, r5, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #36]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r5, r5, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r6, r6, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #40]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r6, r6, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r7, r7, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #44]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r7, r7, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r8, r8, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #48]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r8, r8, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r9, r9, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #52]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r9, r9, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r10, r10, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #56]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "add	r3, r3, lr\n\t"
+        "adds	r10, r10, r2\n\t"
+        "mov	lr, #0\n\t"
+        "adcs	r11, r11, r3\n\t"
+        "adc	lr, lr, #0\n\t"
+        "lsr	r2, %[a], #31\n\t"
+        "ldr	%[a], [sp, #60]\n\t"
+        "orr	r2, r2, %[a], lsl #1\n\t"
+        "umull	r2, r3, r12, r2\n\t"
+        "adds	r11, r11, r2\n\t"
+        "adc	r2, r3, lr\n\t"
+        /* Overflow */
+        "lsl	r2, r2, #1\n\t"
+        "orr	r2, r2, r11, lsr #31\n\t"
+        "mul	r2, r2, r12\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, #0\n\t"
+        "adcs	r6, r6, #0\n\t"
+        "adcs	r7, r7, #0\n\t"
+        "adcs	r8, r8, #0\n\t"
+        "adcs	r9, r9, #0\n\t"
+        "adcs	r10, r10, #0\n\t"
+        "adc	r11, r11, #0\n\t"
+        /* Reduce if top bit set */
+        "asr	r2, r11, #31\n\t"
+        "and	r2, r2, r12\n\t"
+        "and	r11, r11, #0x7fffffff\n\t"
+        "adds	r4, r4, r2\n\t"
+        "adcs	r5, r5, #0\n\t"
+        "adcs	r6, r6, #0\n\t"
+        "adcs	r7, r7, #0\n\t"
+        "adcs	r8, r8, #0\n\t"
+        "adcs	r9, r9, #0\n\t"
+        "adcs	r10, r10, #0\n\t"
+        "adc	r11, r11, #0\n\t"
+        /* Store */
+        "strd	r4, r5, [%[r]]\n\t"
+        "strd	r6, r7, [%[r], #8]\n\t"
+        "strd	r8, r9, [%[r], #16]\n\t"
+        "strd	r10, r11, [%[r], #24]\n\t"
+        "add	sp, sp, #0x40\n\t"
+        : [r] "+r" (r), [a] "+r" (a)
+        :
+        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+}
+
+/* Multiply a field element by the curve constant 121666 (0x1db42,
+ * loaded via movw/movt): r = (121666 * a) mod (2^255 - 19).
+ *
+ * ARMv8-32 inline assembly: each 32-bit limb of a is multiplied by
+ * 121666 with umull, propagating the high half into the next limb as a
+ * carry word (r10). The final carry, combined with bit 255 of the
+ * result (lsl #1 / orr lsr #31), is multiplied by 19 and added back
+ * into the low limbs, with the top limb masked to 31 bits, before the
+ * eight result words are stored at r.
+ */
+void fe_mul121666(fe r, fe a)
+{
+    __asm__ __volatile__ (
+        /* Multiply by 121666 */
+        "ldrd	r2, r3, [%[a]]\n\t"
+        "ldrd	r4, r5, [%[a], #8]\n\t"
+        "ldrd	r6, r7, [%[a], #16]\n\t"
+        "ldrd	r8, r9, [%[a], #24]\n\t"
+        "movw	lr, #0xdb42\n\t"
+        "movt	lr, #1\n\t"
+        "umull	r2, r10, r2, lr\n\t"
+        "umull	r3, r12, r3, lr\n\t"
+        "adds	r3, r3, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r4, r12, r4, lr\n\t"
+        "adds	r4, r4, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r5, r12, r5, lr\n\t"
+        "adds	r5, r5, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r6, r12, r6, lr\n\t"
+        "adds	r6, r6, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r7, r12, r7, lr\n\t"
+        "adds	r7, r7, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r8, r12, r8, lr\n\t"
+        "adds	r8, r8, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "umull	r9, r12, r9, lr\n\t"
+        "adds	r9, r9, r10\n\t"
+        "adc	r10, r12, #0\n\t"
+        "mov	lr, #19\n\t"
+        "lsl	r10, r10, #1\n\t"
+        "orr	r10, r10, r9, lsr #31\n\t"
+        "mul	r10, r10, lr\n\t"
+        "and	r9, r9, #0x7fffffff\n\t"
+        "adds	r2, r2, r10\n\t"
+        "adcs	r3, r3, #0\n\t"
+        "adcs	r4, r4, #0\n\t"
+        "adcs	r5, r5, #0\n\t"
+        "adcs	r6, r6, #0\n\t"
+        "adcs	r7, r7, #0\n\t"
+        "adcs	r8, r8, #0\n\t"
+        "adc	r9, r9, #0\n\t"
+        "strd	r2, r3, [%[r]]\n\t"
+        "strd	r4, r5, [%[r], #8]\n\t"
+        "strd	r6, r7, [%[r], #16]\n\t"
+        "strd	r8, r9, [%[r], #24]\n\t"
+        : [r] "+r" (r), [a] "+r" (a)
+        :
+        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+    );
+}
+
+void fe_sq2(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x40\n\t"
+ /* Square * 2 */
+ "ldr r7, [%[a]]\n\t"
+ "ldr r8, [%[a], #4]\n\t"
+ "ldr r9, [%[a], #8]\n\t"
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r12, [%[a], #16]\n\t"
+ /* A[0] * A[0] = 0 */
+ "umull r4, r5, r7, r7\n\t"
+ "str r4, [sp]\n\t"
+ /* A[0] * A[1] = 1 */
+ "umull r2, r3, r7, r8\n\t"
+ "mov r6, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adc r6, r6, r3\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #4]\n\t"
+ /* A[1] * A[1] = 2 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ /* A[0] * A[2] = 2 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #8]\n\t"
+ /* A[0] * A[3] = 3 */
+ "umull r2, r3, r7, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "adc r5, r5, r3\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[2] = 3 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #12]\n\t"
+ /* A[2] * A[2] = 4 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[3] = 4 */
+ "umull r2, r3, r8, r10\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[0] * A[4] = 4 */
+ "umull r2, r3, r7, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #16]\n\t"
+ /* A[0] * A[5] = 5 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[4] = 5 */
+ "umull r2, r3, r8, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[3] = 5 */
+ "umull r2, r3, r9, r10\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #20]\n\t"
+ /* A[3] * A[3] = 6 */
+ "umull r2, r3, r10, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[2] * A[4] = 6 */
+ "umull r2, r3, r9, r12\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[5] = 6 */
+ "umull r2, r3, r8, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[0] * A[6] = 6 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #24]\n\t"
+ /* A[0] * A[7] = 7 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[6] = 7 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[2] * A[5] = 7 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[4] = 7 */
+ "umull r2, r3, r10, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #28]\n\t"
+ /* A[4] * A[4] = 8 */
+ "umull r2, r3, r12, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[3] * A[5] = 8 */
+ "umull r2, r3, r10, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[6] = 8 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[7] = 8 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #32]\n\t"
+ "ldr r7, [%[a], #20]\n\t"
+ /* A[2] * A[7] = 9 */
+ "umull r2, r3, r9, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * A[6] = 9 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[4] * A[5] = 9 */
+ "umull r2, r3, r12, r7\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #36]\n\t"
+ "mov r8, r11\n\t"
+ /* A[5] * A[5] = 10 */
+ "umull r2, r3, r7, r7\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[4] * A[6] = 10 */
+ "umull r2, r3, r12, r8\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[7] = 10 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #40]\n\t"
+ "mov r9, r11\n\t"
+ /* A[4] * A[7] = 11 */
+ "umull r2, r3, r12, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * A[6] = 11 */
+ "umull r2, r3, r7, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #44]\n\t"
+ /* A[6] * A[6] = 12 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * A[7] = 12 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #48]\n\t"
+ /* A[6] * A[7] = 13 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #52]\n\t"
+ /* A[7] * A[7] = 14 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ "str r6, [sp, #56]\n\t"
+ "str r4, [sp, #60]\n\t"
+ /* Double and Reduce */
+ /* Load bottom half */
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "ldrd r8, r9, [sp, #16]\n\t"
+ "ldrd r10, r11, [sp, #24]\n\t"
+ "lsr r2, r11, #30\n\t"
+ "lsl r11, r11, #1\n\t"
+ "orr r11, r11, r10, lsr #31\n\t"
+ "lsl r10, r10, #1\n\t"
+ "orr r10, r10, r9, lsr #31\n\t"
+ "lsl r9, r9, #1\n\t"
+ "orr r9, r9, r8, lsr #31\n\t"
+ "lsl r8, r8, #1\n\t"
+ "orr r8, r8, r7, lsr #31\n\t"
+ "lsl r7, r7, #1\n\t"
+ "orr r7, r7, r6, lsr #31\n\t"
+ "lsl r6, r6, #1\n\t"
+ "orr r6, r6, r5, lsr #31\n\t"
+ "lsl r5, r5, #1\n\t"
+ "orr r5, r5, r4, lsr #31\n\t"
+ "lsl r4, r4, #1\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "mov r12, #19\n\t"
+ "ldr %[a], [sp, #32]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #36]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #40]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r7, r7, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #44]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r7, r7, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r8, r8, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #48]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r8, r8, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r9, r9, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #52]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r9, r9, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r10, r10, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #56]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r10, r10, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r11, r11, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #60]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r11, r11, r2\n\t"
+ "adc r2, r3, lr\n\t"
+ /* Overflow */
+ "lsl r2, r2, #1\n\t"
+ "orr r2, r2, r11, lsr #31\n\t"
+ "mul r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Reduce if top bit set */
+ "asr r2, r11, #31\n\t"
+ "and r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Store */
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [%[r], #8]\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ "strd r10, r11, [%[r], #24]\n\t"
+ "add sp, sp, #0x40\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+void fe_invert(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x88\n\t"
+ /* Invert */
+ "str %[r], [sp, #128]\n\t"
+ "str %[a], [sp, #132]\n\t"
+ "mov r0, sp\n\t"
+ "ldr r1, [sp, #132]\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "ldr r1, [sp, #132]\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_fe_invert1_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert1_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_fe_invert2_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert2_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_fe_invert3_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert3_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_fe_invert4_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert4_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_fe_invert5_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert5_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_fe_invert6_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert6_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_fe_invert7_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert7_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #5\n\t"
+ "\n"
+ "L_fe_invert8_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert8_%=\n\t"
+ "ldr r0, [sp, #128]\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #132]\n\t"
+ "ldr %[r], [sp, #128]\n\t"
+ "add sp, sp, #0x88\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "lr", "r4"
+ );
+}
+
+int curve25519(byte* r, byte* n, byte* a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0xbc\n\t"
+ "str %[r], [sp, #160]\n\t"
+ "str %[n], [sp, #164]\n\t"
+ "str %[a], [sp, #168]\n\t"
+ "mov %[n], #0\n\t"
+ "str %[n], [sp, #172]\n\t"
+ /* Set one */
+ "mov r11, #1\n\t"
+ "mov r10, #0\n\t"
+ "strd r11, r10, [%[r]]\n\t"
+ "strd r10, r10, [%[r], #8]\n\t"
+ "strd r10, r10, [%[r], #16]\n\t"
+ "strd r10, r10, [%[r], #24]\n\t"
+ /* Set zero */
+ "mov r10, #0\n\t"
+ "strd r10, r10, [sp]\n\t"
+ "strd r10, r10, [sp, #8]\n\t"
+ "strd r10, r10, [sp, #16]\n\t"
+ "strd r10, r10, [sp, #24]\n\t"
+ /* Set one */
+ "mov r11, #1\n\t"
+ "mov r10, #0\n\t"
+ "strd r11, r10, [sp, #32]\n\t"
+ "strd r10, r10, [sp, #40]\n\t"
+ "strd r10, r10, [sp, #48]\n\t"
+ "strd r10, r10, [sp, #56]\n\t"
+ /* Copy */
+ "ldrd r4, r5, [%[a]]\n\t"
+ "ldrd r6, r7, [%[a], #8]\n\t"
+ "strd r4, r5, [sp, #64]\n\t"
+ "strd r6, r7, [sp, #72]\n\t"
+ "ldrd r4, r5, [%[a], #16]\n\t"
+ "ldrd r6, r7, [%[a], #24]\n\t"
+ "strd r4, r5, [sp, #80]\n\t"
+ "strd r6, r7, [sp, #88]\n\t"
+ "mov %[n], #30\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "mov %[a], #28\n\t"
+ "str %[a], [sp, #176]\n\t"
+ "\n"
+ "L_curve25519_words_%=: \n\t"
+ "\n"
+ "L_curve25519_bits_%=: \n\t"
+ "ldr %[n], [sp, #164]\n\t"
+ "ldr %[a], [%[n], r2]\n\t"
+ "ldr %[n], [sp, #180]\n\t"
+ "lsr %[a], %[a], %[n]\n\t"
+ "and %[a], %[a], #1\n\t"
+ "str %[a], [sp, #184]\n\t"
+ "ldr %[n], [sp, #172]\n\t"
+ "eor %[n], %[n], %[a]\n\t"
+ "str %[n], [sp, #172]\n\t"
+ "ldr %[r], [sp, #160]\n\t"
+ /* Conditional Swap */
+ "neg %[n], %[n]\n\t"
+ "ldrd r4, r5, [%[r]]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [sp, #64]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [sp, #72]\n\t"
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #16]\n\t"
+ "strd r6, r7, [sp, #80]\n\t"
+ "ldrd r4, r5, [%[r], #24]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #24]\n\t"
+ "strd r6, r7, [sp, #88]\n\t"
+ "ldr %[n], [sp, #172]\n\t"
+ /* Conditional Swap */
+ "neg %[n], %[n]\n\t"
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "strd r6, r7, [sp, #32]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "strd r6, r7, [sp, #40]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "strd r6, r7, [sp, #48]\n\t"
+ "ldrd r4, r5, [sp, #24]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #24]\n\t"
+ "strd r6, r7, [sp, #56]\n\t"
+ "ldr %[n], [sp, #184]\n\t"
+ "str %[n], [sp, #172]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [%[r]]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r]]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #128]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #136]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #144]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #24]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [%[r]]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r], #16]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #128]\n\t"
+ "ldrd r4, r5, [sp, #136]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "ldrd r4, r5, [sp, #144]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #144]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #152]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [sp, #64]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #96]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #72]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #8]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #104]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #80]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #16]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #112]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #88]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [sp, #24]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #96]\n\t"
+ "ldrd r4, r5, [sp, #104]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #104]\n\t"
+ "ldrd r4, r5, [sp, #112]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #112]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #120]\n\t"
+ "ldr r2, [sp, #160]\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r0, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0x80\n\t"
+ "add r1, sp, #0\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_mul\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r0, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "ldr r1, [sp, #160]\n\t"
+ "add r0, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [sp, #32]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #64]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #40]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #72]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #8]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #48]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #80]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #16]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #56]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp, #64]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #64]\n\t"
+ "ldrd r4, r5, [sp, #72]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #72]\n\t"
+ "ldrd r4, r5, [sp, #80]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #80]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [sp, #88]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #24]\n\t"
+ "add r2, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "ldr r0, [sp, #160]\n\t"
+ "bl fe_mul\n\t"
+ /* Sub */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "ldrd r8, r9, [sp, #96]\n\t"
+ "ldrd r10, r11, [sp, #104]\n\t"
+ "subs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "sbcs r11, r7, r11\n\t"
+ "strd r8, r9, [sp, #128]\n\t"
+ "strd r10, r11, [sp, #136]\n\t"
+ "ldrd r4, r5, [sp, #144]\n\t"
+ "ldrd r6, r7, [sp, #152]\n\t"
+ "ldrd r8, r9, [sp, #112]\n\t"
+ "ldrd r10, r11, [sp, #120]\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "sbc r11, r7, r11\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "adcs r6, r6, %[a]\n\t"
+ "adcs r7, r7, %[a]\n\t"
+ "adcs r8, r8, %[a]\n\t"
+ "adcs r9, r9, %[a]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r4, r5, [sp, #128]\n\t"
+ "strd r6, r7, [sp, #136]\n\t"
+ "strd r8, r9, [sp, #144]\n\t"
+ "strd r10, r11, [sp, #152]\n\t"
+ "add r1, sp, #0\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_sq\n\t"
+ /* Multiply by 121666 */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "ldrd r8, r9, [sp, #144]\n\t"
+ "ldrd r10, r11, [sp, #152]\n\t"
+ "movw r12, #0xdb42\n\t"
+ "movt r12, #1\n\t"
+ "umull r4, %[a], r4, r12\n\t"
+ "umull r5, r3, r5, r12\n\t"
+ "adds r5, r5, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r6, r3, r6, r12\n\t"
+ "adds r6, r6, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r7, r3, r7, r12\n\t"
+ "adds r7, r7, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r8, r3, r8, r12\n\t"
+ "adds r8, r8, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r9, r3, r9, r12\n\t"
+ "adds r9, r9, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r10, r3, r10, r12\n\t"
+ "adds r10, r10, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r11, r3, r11, r12\n\t"
+ "adds r11, r11, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "mov r12, #19\n\t"
+ "lsl %[a], %[a], #1\n\t"
+ "orr %[a], %[a], r11, lsr #31\n\t"
+ "mul %[a], %[a], r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, %[a]\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ "strd r4, r5, [sp, #32]\n\t"
+ "strd r6, r7, [sp, #40]\n\t"
+ "strd r8, r9, [sp, #48]\n\t"
+ "strd r10, r11, [sp, #56]\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r0, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "ldrd r8, r9, [sp, #32]\n\t"
+ "ldrd r10, r11, [sp, #40]\n\t"
+ "adds r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "adcs r11, r7, r11\n\t"
+ "strd r8, r9, [sp, #96]\n\t"
+ "strd r10, r11, [sp, #104]\n\t"
+ "ldrd r4, r5, [sp, #112]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "ldrd r8, r9, [sp, #48]\n\t"
+ "ldrd r10, r11, [sp, #56]\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "adc r11, r7, r11\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "sbcs r6, r6, %[a]\n\t"
+ "sbcs r7, r7, %[a]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbcs r9, r9, %[a]\n\t"
+ "sbcs r10, r10, %[a]\n\t"
+ "sbc r11, r11, r12\n\t"
+ "strd r4, r5, [sp, #96]\n\t"
+ "strd r6, r7, [sp, #104]\n\t"
+ "strd r8, r9, [sp, #112]\n\t"
+ "strd r10, r11, [sp, #120]\n\t"
+ "add r2, sp, #0\n\t"
+ "ldr r1, [sp, #168]\n\t"
+ "add r0, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #176]\n\t"
+ "ldr %[n], [sp, #180]\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "bge L_curve25519_bits_%=\n\t"
+ "mov %[n], #31\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "subs %[a], %[a], #4\n\t"
+ "str %[a], [sp, #176]\n\t"
+ "bge L_curve25519_words_%=\n\t"
+ /* Invert */
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_curve25519_inv_1_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_1_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_curve25519_inv_2_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_2_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_curve25519_inv_3_%=: \n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_3_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_curve25519_inv_4_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_4_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_curve25519_inv_5_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_5_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_curve25519_inv_6_%=: \n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_6_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_curve25519_inv_7_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_7_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #5\n\t"
+ "\n"
+ "L_curve25519_inv_8_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_8_%=\n\t"
+ "add r0, sp, #0\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0\n\t"
+ "ldr r1, [sp, #160]\n\t"
+ "ldr r0, [sp, #160]\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, #0\n\t"
+ "add sp, sp, #0xbc\n\t"
+ : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ return (uint32_t)(size_t)r;
+}
+
+void fe_pow22523(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x68\n\t"
+ /* pow22523 */
+ "str %[r], [sp, #96]\n\t"
+ "str %[a], [sp, #100]\n\t"
+ "mov r0, sp\n\t"
+ "ldr r1, [sp, #100]\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "ldr r1, [sp, #100]\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_fe_pow22523_1_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_1_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_fe_pow22523_2_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_2_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_fe_pow22523_3_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_3_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_fe_pow22523_4_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_4_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_fe_pow22523_5_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_5_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_fe_pow22523_6_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_6_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_fe_pow22523_7_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_7_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #2\n\t"
+ "\n"
+ "L_fe_pow22523_8_%=: \n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_8_%=\n\t"
+ "ldr r0, [sp, #96]\n\t"
+ "mov r1, sp\n\t"
+ "ldr r2, [sp, #100]\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #100]\n\t"
+ "ldr %[r], [sp, #96]\n\t"
+ "add sp, sp, #0x68\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "lr", "r4"
+ );
+}
+
+/* Convert a point from extended (X:Y:Z:T) coordinates to projective
+ * (X:Y:Z) coordinates by three field multiplications:
+ *   rx = px * pt,  ry = py * pz,  rz = pz * pt.
+ * The first four pointer arguments arrive in r0-r3 and are spilled to a
+ * 16-byte scratch area so they survive the fe_mul calls (which clobber
+ * r0-r3 and lr).
+ * NOTE(review): the [sp, #24..#32] loads presumably reach the
+ * caller-stacked arguments py/pz/pt past the compiler-saved registers of
+ * the prologue -- confirm against the generated prologue for the target
+ * toolchain.
+ */
+void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #16\n\t"
+        /* Spill the register arguments so fe_mul cannot clobber them. */
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[px], [sp, #12]\n\t"
+        /* rx = px * pt */
+        "ldr r2, [sp, #32]\n\t"
+        "ldr r1, [sp, #12]\n\t"
+        "ldr r0, [sp]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = py * pz */
+        "ldr r2, [sp, #28]\n\t"
+        "ldr r1, [sp, #24]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "bl fe_mul\n\t"
+        /* rz = pz * pt */
+        "ldr r2, [sp, #32]\n\t"
+        "ldr r1, [sp, #28]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_mul\n\t"
+        "add sp, sp, #16\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        :
+        : "memory", "lr"
+    );
+}
+
+/* Convert a point to extended (X:Y:Z:T) coordinates by four field
+ * multiplications:
+ *   rx = px * pt,  ry = py * pz,  rz = pz * pt,  rt = px * py.
+ * The four result pointers arrive in r0-r3 and are spilled to a 16-byte
+ * scratch area so they survive the fe_mul calls.
+ * NOTE(review): the [sp, #24..#36] loads presumably reach the
+ * caller-stacked arguments px/py/pz/pt past the compiler-saved registers
+ * of the prologue -- confirm against the generated prologue for the
+ * target toolchain.
+ */
+void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #16\n\t"
+        /* Spill the register arguments so fe_mul cannot clobber them. */
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[rt], [sp, #12]\n\t"
+        /* rx = px * pt */
+        "ldr r2, [sp, #36]\n\t"
+        "ldr r1, [sp, #24]\n\t"
+        "ldr r0, [sp]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = py * pz */
+        "ldr r2, [sp, #32]\n\t"
+        "ldr r1, [sp, #28]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "bl fe_mul\n\t"
+        /* rz = pz * pt */
+        "ldr r2, [sp, #36]\n\t"
+        "ldr r1, [sp, #32]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_mul\n\t"
+        /* rt = px * py */
+        "ldr r2, [sp, #28]\n\t"
+        "ldr r1, [sp, #24]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_mul\n\t"
+        "add sp, sp, #16\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        :
+        :  "memory", "lr"
+    );
+}
+
+/* Point doubling into extended coordinates: computes
+ *   rx = px^2
+ *   rz = py^2
+ *   ry = px + py;          rt = ry^2
+ *   ry = rz + rx;          rz = rz - rx
+ *   rx = rt - ry
+ *   rt = 2 * pz^2          (fe_sq2)
+ *   rt = rt - rz
+ * All additions/subtractions are over GF(2^255-19): a 256-bit carry chain
+ * followed by a conditional add/sub of the modulus masked by the top
+ * limb's sign (r11 = sign mask, r12/lr = masked modulus words).
+ * The inline add/sub sequences use %[rt] purely as a scratch register --
+ * its incoming pointer value was spilled to [sp, #12] first.
+ * NOTE(review): the [sp, #88..#96] loads presumably reach the
+ * caller-stacked arguments px/py/pz past the compiler-saved registers of
+ * the prologue -- confirm against the generated prologue for the target
+ * toolchain.
+ */
+void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #16\n\t"
+        /* Spill the register arguments so fe_sq/fe_mul cannot clobber them. */
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[rt], [sp, #12]\n\t"
+        /* rx = px^2 */
+        "ldr r1, [sp, #88]\n\t"
+        "ldr r0, [sp]\n\t"
+        "bl fe_sq\n\t"
+        /* rz = py^2 */
+        "ldr r1, [sp, #92]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_sq\n\t"
+        /* ry = px + py (mod 2^255-19) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp, #88]\n\t"
+        "ldr r2, [sp, #92]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "adds r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "adcs r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rt = ry^2 = (px + py)^2 */
+        "ldr r1, [sp, #4]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_sq\n\t"
+        /* ry = rz + rx; rz = rz - rx (fused add/sub, shared loads) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp, #8]\n\t"
+        "ldr r2, [sp]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r2]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "ldrd r5, r6, [r2, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r2, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #24]\n\t"
+        "ldrd r5, r6, [r2, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        /* rx = rt - ry (mod 2^255-19) */
+        "ldr r0, [sp]\n\t"
+        "ldr r1, [sp, #12]\n\t"
+        "ldr r2, [sp, #4]\n\t"
+        /* Sub */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "subs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "sbcs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "adcs r5, r5, r11\n\t"
+        "adcs r6, r6, r11\n\t"
+        "adcs r7, r7, r11\n\t"
+        "adcs r8, r8, r11\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rt = 2 * pz^2 (fe_sq2) */
+        "ldr r1, [sp, #96]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_sq2\n\t"
+        /* rt = rt - rz (mod 2^255-19) */
+        "ldr r0, [sp, #12]\n\t"
+        "ldr r1, [sp, #8]\n\t"
+        /* Sub */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "ldrd r7, r8, [r1]\n\t"
+        "ldrd r9, r10, [r1, #8]\n\t"
+        "subs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "ldrd r7, r8, [r1, #16]\n\t"
+        "ldrd r9, r10, [r1, #24]\n\t"
+        "sbcs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "adcs r5, r5, r11\n\t"
+        "adcs r6, r6, r11\n\t"
+        "adcs r7, r7, r11\n\t"
+        "adcs r8, r8, r11\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        "add sp, sp, #16\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
+        :
+        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+}
+
+/* Mixed point addition (precomputed-table operand): computes
+ *   rx = py + px;  ry = py - px
+ *   rz = rx * qyplusx
+ *   ry = ry * qyminusx
+ *   rt = qxy2d * pt
+ *   ry = rz + ry;  rx = rz - ry
+ *   rz = 2 * pz
+ *   rz = rz + rt;  rt = rz - rt
+ * All field add/sub sequences carry across eight 32-bit limbs and then
+ * conditionally add/subtract the modulus 2^255-19, selected by the sign
+ * of the top limb (r11 = sign mask, r12/lr = masked modulus words).
+ * %[rt] is reused as a scratch register after its pointer value is
+ * spilled to [sp, #12].
+ * NOTE(review): the [sp, #104..#128] loads presumably reach the
+ * caller-stacked arguments px..qyminusx past the compiler-saved
+ * registers of the prologue -- confirm against the generated prologue
+ * for the target toolchain.
+ */
+void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #32\n\t"
+        /* Spill the register arguments so fe_mul cannot clobber them. */
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[rt], [sp, #12]\n\t"
+        /* rx = py + px (mod 2^255-19) */
+        "ldr r0, [sp]\n\t"
+        "ldr r1, [sp, #108]\n\t"
+        "ldr r2, [sp, #104]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "adds r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "adcs r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* ry = py - px (mod 2^255-19) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp, #108]\n\t"
+        "ldr r2, [sp, #104]\n\t"
+        /* Sub */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "subs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "sbcs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "adcs r5, r5, r11\n\t"
+        "adcs r6, r6, r11\n\t"
+        "adcs r7, r7, r11\n\t"
+        "adcs r8, r8, r11\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rz = rx * qyplusx */
+        "ldr r2, [sp, #124]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = ry * qyminusx */
+        "ldr r2, [sp, #128]\n\t"
+        "ldr r1, [sp, #4]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "bl fe_mul\n\t"
+        /* rt = qxy2d * pt */
+        "ldr r2, [sp, #116]\n\t"
+        "ldr r1, [sp, #120]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = rz + ry; rx = rz - ry (fused add/sub, shared loads) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r2, [sp, #8]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r2]\n\t"
+        "ldrd r5, r6, [r0]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #8]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #16]\n\t"
+        "ldrd r5, r6, [r0, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #24]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        /* rz = 2 * pz (mod 2^255-19) */
+        "ldr r0, [sp, #8]\n\t"
+        "ldr r1, [sp, #112]\n\t"
+        /* Double */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r1, #16]\n\t"
+        "ldrd r9, r10, [r1, #24]\n\t"
+        "adds %[rt], %[rt], %[rt]\n\t"
+        "adcs r4, r4, r4\n\t"
+        "adcs r5, r5, r5\n\t"
+        "adcs r6, r6, r6\n\t"
+        "adcs r7, r7, r7\n\t"
+        "adcs r8, r8, r8\n\t"
+        "adcs r9, r9, r9\n\t"
+        "adc r10, r10, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rz = rz + rt; rt = rz - rt (fused add/sub, shared loads) */
+        "ldr r0, [sp, #8]\n\t"
+        "ldr r1, [sp, #12]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r1]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "ldrd r5, r6, [r1, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r0, #24]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        "add sp, sp, #32\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        :
+        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+    /* Stack-passed args; referenced only from inside the asm block. */
+    (void)qxy2d;
+    (void)qyplusx;
+    (void)qyminusx;
+}
+
+/* Mixed point subtraction (precomputed-table operand). Identical
+ * structure to fe_ge_madd but with qyplusx/qyminusx swapped in the two
+ * multiplications and the final add/sub destinations exchanged:
+ *   rx = py + px;  ry = py - px
+ *   rz = rx * qyminusx
+ *   ry = ry * qyplusx
+ *   rt = qxy2d * pt
+ *   ry = rz + ry;  rx = rz - ry
+ *   rz = 2 * pz
+ *   rt = rz + rt;  rz = rz - rt
+ * Field add/sub sequences carry across eight 32-bit limbs and then
+ * conditionally add/subtract the modulus 2^255-19 selected by the sign
+ * of the top limb. %[rt] is reused as a scratch register after its
+ * pointer value is spilled to [sp, #12].
+ * NOTE(review): the [sp, #104..#128] loads presumably reach the
+ * caller-stacked arguments px..qyminusx past the compiler-saved
+ * registers of the prologue -- confirm against the generated prologue
+ * for the target toolchain.
+ */
+void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #32\n\t"
+        /* Spill the register arguments so fe_mul cannot clobber them. */
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[rt], [sp, #12]\n\t"
+        /* rx = py + px (mod 2^255-19) */
+        "ldr r0, [sp]\n\t"
+        "ldr r1, [sp, #108]\n\t"
+        "ldr r2, [sp, #104]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "adds r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "adcs r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* ry = py - px (mod 2^255-19) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp, #108]\n\t"
+        "ldr r2, [sp, #104]\n\t"
+        /* Sub */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "subs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "sbcs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "adcs r5, r5, r11\n\t"
+        "adcs r6, r6, r11\n\t"
+        "adcs r7, r7, r11\n\t"
+        "adcs r8, r8, r11\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rz = rx * qyminusx (note: swapped vs. fe_ge_madd) */
+        "ldr r2, [sp, #128]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = ry * qyplusx (note: swapped vs. fe_ge_madd) */
+        "ldr r2, [sp, #124]\n\t"
+        "ldr r1, [sp, #4]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "bl fe_mul\n\t"
+        /* rt = qxy2d * pt */
+        "ldr r2, [sp, #116]\n\t"
+        "ldr r1, [sp, #120]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_mul\n\t"
+        /* ry = rz + ry; rx = rz - ry (fused add/sub, shared loads) */
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r2, [sp, #8]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r2]\n\t"
+        "ldrd r5, r6, [r0]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #8]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #16]\n\t"
+        "ldrd r5, r6, [r0, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #24]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        /* rz = 2 * pz (mod 2^255-19) */
+        "ldr r0, [sp, #8]\n\t"
+        "ldr r1, [sp, #112]\n\t"
+        /* Double */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r1, #16]\n\t"
+        "ldrd r9, r10, [r1, #24]\n\t"
+        "adds %[rt], %[rt], %[rt]\n\t"
+        "adcs r4, r4, r4\n\t"
+        "adcs r5, r5, r5\n\t"
+        "adcs r6, r6, r6\n\t"
+        "adcs r7, r7, r7\n\t"
+        "adcs r8, r8, r8\n\t"
+        "adcs r9, r9, r9\n\t"
+        "adc r10, r10, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        /* rt = rz + rt; rz = rz - rt (destinations swapped vs. fe_ge_madd) */
+        "ldr r0, [sp, #12]\n\t"
+        "ldr r1, [sp, #8]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r0]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r0, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1, #24]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        "add sp, sp, #32\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        :
+        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+    /* Stack-passed args; referenced only from inside the asm block. */
+    (void)qxy2d;
+    (void)qyplusx;
+    (void)qyminusx;
+}
+
+void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x60\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r2, [sp, #192]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #196]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #180]\n\t"
+ "ldr r1, [sp, #188]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #184]\n\t"
+ "ldr r1, [sp, #176]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #16\n\t"
+ "ldr r1, [sp]\n\t"
+ /* Double */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "ldr r1, [sp, #12]\n\t"
+ "add r2, sp, #16\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r1]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r1, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #0x60\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
+/* fe_ge_sub: group-element subtraction for Ed25519 points,
+ * (rx, ry, rz, rt) = p - q, where q is supplied in precomputed form
+ * (qz, qt2d, qyplusx, qyminusx).  Field add/sub/double mod 2^255-19 are
+ * inlined below; the four field products are done by "bl fe_mul".
+ *
+ * NOTE(review): this looks auto-generated (wolfSSL ARMv8-32 assembly
+ * generator) -- prefer regenerating over hand-editing instructions, since
+ * carry/borrow flags are live across many loads and stores.
+ *
+ * Frame layout: sp+0..15 save the four result pointers rx..rt; sp+16..47 is
+ * a scratch field element.  Offsets sp+168..196 presumably reach the
+ * caller-stacked pointer arguments (pz, pt, q*) beyond this 0x60-byte
+ * frame plus fe_mul spill area -- TODO confirm against the matching
+ * fe_ge_add in this file. */
+void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x60\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Add */
+ /* 256-bit add of [r1] + [r2] into [r0]; carry stays live across the
+ * ldrd/strd pairs (they do not touch flags). */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ /* Conditional reduction: r11 = r10 >> 31 (all-ones iff top bit set),
+ * used as mask; r12/lr become the low/high words of the masked
+ * modulus 2^255-19. */
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Sub */
+ /* 256-bit subtract of [r2] from [r1] into [r0]; borrow chained the
+ * same way as the carry above. */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ /* Four field multiplications: r0 = dst, r1/r2 = src pointers reloaded
+ * from the saved frame / stacked arguments. */
+ "ldr r2, [sp, #196]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #192]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #180]\n\t"
+ "ldr r1, [sp, #188]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #184]\n\t"
+ "ldr r1, [sp, #176]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #16\n\t"
+ "ldr r1, [sp]\n\t"
+ /* Double */
+ /* scratch fe at sp+16 = 2 * [r1], with conditional reduction. */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Interleaved 256-bit add and sub of the same operands, 64 bits at a
+ * time: r12 holds the add carry and lr the sub borrow between chunks;
+ * "adds rX, rX, #-1" restores the flag before the next adcs/sbcs. */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ /* Reduce the add result (top words r7:r8 still in registers, lower
+ * words reloaded from [r0]; borrow chained across strd/ldrd). */
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ /* Reduce the sub result (top words r9:r10 still in registers). */
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "ldr r1, [sp, #8]\n\t"
+ "add r2, sp, #16\n\t"
+ /* Add-Sub */
+ /* Second interleaved add/sub pass using the doubled scratch fe. */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #0x60\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ /* Quiet unused-parameter warnings: the q* pointers are consumed via
+ * stack offsets inside the asm block, not as C expressions. */
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
+#endif /* WOLFSSL_ARMASM */
+#endif /* !__aarch64__ */