author     auth12 <[email protected]>  2020-07-22 08:34:12 -0700
committer  auth12 <[email protected]>  2020-07-22 08:34:12 -0700
commit     5015ddb9b1eee748efc24056e46f81888c975f7a (patch)
tree       a810f6ee90f8bfe0e934fdd9142198e6b3862957 /wolfcrypt/src/port/arm/armv8-curve25519.c
download   wolfssl_windows-5015ddb9b1eee748efc24056e46f81888c975f7a.tar.xz
           wolfssl_windows-5015ddb9b1eee748efc24056e46f81888c975f7a.zip
Initial commit
Diffstat (limited to 'wolfcrypt/src/port/arm/armv8-curve25519.c')
-rw-r--r--  wolfcrypt/src/port/arm/armv8-curve25519.c  6725
1 file changed, 6725 insertions, 0 deletions
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c
new file mode 100644
index 0000000..d1ab4c8
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.c
@@ -0,0 +1,6725 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c
+ */
+#ifdef __aarch64__
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/fe_operations.h>
+#include <stdint.h>
+
+void fe_init()
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ :
+ :
+ : "memory"
+ );
+}
+
+void fe_frombytes(fe out, const unsigned char* in)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "ldp x2, x3, [%x[in]]\n\t"
+ "ldp x4, x5, [%x[in], #16]\n\t"
+ "and x5, x5, #0x7fffffffffffffff\n\t"
+ "stp x2, x3, [%x[out]]\n\t"
+ "stp x4, x5, [%x[out], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [out] "+r" (out), [in] "+r" (in)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6"
+ );
+}
+
+void fe_tobytes(unsigned char* out, const fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x7, #19\n\t"
+ "ldp x2, x3, [%x[n]]\n\t"
+ "ldp x4, x5, [%x[n], #16]\n\t"
+ "adds x6, x2, x7\n\t"
+ "adcs x6, x3, xzr\n\t"
+ "adcs x6, x4, xzr\n\t"
+ "adc x6, x5, xzr\n\t"
+ "and x6, x7, x6, asr 63\n\t"
+ "adds x2, x2, x6\n\t"
+ "adcs x3, x3, xzr\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adc x5, x5, xzr\n\t"
+ "and x5, x5, #0x7fffffffffffffff\n\t"
+ "stp x2, x3, [%x[out]]\n\t"
+ "stp x4, x5, [%x[out], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [out] "+r" (out), [n] "+r" (n)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7"
+ );
+}
+
+void fe_1(fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Set one */
+ "mov x1, #1\n\t"
+ "stp x1, xzr, [%x[n]]\n\t"
+ "stp xzr, xzr, [%x[n], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "x1"
+ );
+}
+
+void fe_0(fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Set zero */
+ "stp xzr, xzr, [%x[n]]\n\t"
+ "stp xzr, xzr, [%x[n], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory"
+ );
+}
+
+void fe_copy(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Copy */
+ "ldp x2, x3, [%x[a]]\n\t"
+ "ldp x4, x5, [%x[a], #16]\n\t"
+ "stp x2, x3, [%x[r]]\n\t"
+ "stp x4, x5, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5"
+ );
+}
+
+void fe_sub(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Sub */
+ "ldp x3, x4, [%x[a]]\n\t"
+ "ldp x5, x6, [%x[a], #16]\n\t"
+ "ldp x7, x8, [%x[b]]\n\t"
+ "ldp x9, x10, [%x[b], #16]\n\t"
+ "subs x3, x3, x7\n\t"
+ "sbcs x4, x4, x8\n\t"
+ "sbcs x5, x5, x9\n\t"
+ "sbcs x6, x6, x10\n\t"
+ "mov x12, #-19\n\t"
+ "csetm x11, cc\n\t"
+ /* Mask the modulus */
+ "and x12, x11, x12\n\t"
+ "and x13, x11, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x3, x3, x12\n\t"
+ "adcs x4, x4, x11\n\t"
+ "adcs x5, x5, x11\n\t"
+ "adc x6, x6, x13\n\t"
+ "stp x3, x4, [%x[r]]\n\t"
+ "stp x5, x6, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
+ );
+}
+
+void fe_add(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Add */
+ "ldp x3, x4, [%x[a]]\n\t"
+ "ldp x5, x6, [%x[a], #16]\n\t"
+ "ldp x7, x8, [%x[b]]\n\t"
+ "ldp x9, x10, [%x[b], #16]\n\t"
+ "adds x3, x3, x7\n\t"
+ "adcs x4, x4, x8\n\t"
+ "adcs x5, x5, x9\n\t"
+ "adc x6, x6, x10\n\t"
+ "mov x12, #-19\n\t"
+ "asr x11, x6, #63\n\t"
+ /* Mask the modulus */
+ "and x12, x11, x12\n\t"
+ "and x13, x11, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x3, x3, x12\n\t"
+ "sbcs x4, x4, x11\n\t"
+ "sbcs x5, x5, x11\n\t"
+ "sbc x6, x6, x13\n\t"
+ "stp x3, x4, [%x[r]]\n\t"
+ "stp x5, x6, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
+ );
+}
+
+void fe_neg(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "ldp x2, x3, [%x[a]]\n\t"
+ "ldp x4, x5, [%x[a], #16]\n\t"
+ "mov x6, #-19\n\t"
+ "mov x7, #-1\n\t"
+ "mov x8, #-1\n\t"
+ "mov x9, #0x7fffffffffffffff\n\t"
+ "subs x6, x6, x2\n\t"
+ "sbcs x7, x7, x3\n\t"
+ "sbcs x8, x8, x4\n\t"
+ "sbc x9, x9, x5\n\t"
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9"
+ );
+}
+
+int fe_isnonzero(const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x6, #19\n\t"
+ "ldp x1, x2, [%x[a]]\n\t"
+ "ldp x3, x4, [%x[a], #16]\n\t"
+ "adds x5, x1, x6\n\t"
+ "adcs x5, x2, xzr\n\t"
+ "adcs x5, x3, xzr\n\t"
+ "adc x5, x4, xzr\n\t"
+ "and x5, x6, x5, asr 63\n\t"
+ "adds x1, x1, x5\n\t"
+ "adcs x2, x2, xzr\n\t"
+ "adcs x3, x3, xzr\n\t"
+ "adc x4, x4, xzr\n\t"
+ "and x4, x4, #0x7fffffffffffffff\n\t"
+ "orr %x[a], x1, x2\n\t"
+ "orr x3, x3, x4\n\t"
+ "orr %x[a], %x[a], x3\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
+ );
+ return (uint32_t)(size_t)a;
+}
+
+int fe_isnegative(const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x6, #19\n\t"
+ "ldp x1, x2, [%x[a]]\n\t"
+ "ldp x3, x4, [%x[a], #16]\n\t"
+ "adds x5, x1, x6\n\t"
+ "adcs x5, x2, xzr\n\t"
+ "adcs x5, x3, xzr\n\t"
+ "adc x5, x4, xzr\n\t"
+ "and %x[a], x1, #1\n\t"
+ "eor %x[a], %x[a], x5, lsr 63\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
+ );
+ return (uint32_t)(size_t)a;
+}
+
+void fe_cmov_table(fe* r, fe* base, signed char b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-32]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[r], [x29, #16]\n\t"
+ "sxtb %x[b], %w[b]\n\t"
+ "sbfx x3, %x[b], #7, #1\n\t"
+ "eor %x[r], %x[b], x3\n\t"
+ "sub %x[r], %x[r], x3\n\t"
+ "mov x4, #1\n\t"
+ "mov x5, xzr\n\t"
+ "mov x6, xzr\n\t"
+ "mov x7, xzr\n\t"
+ "mov x8, #1\n\t"
+ "mov x9, xzr\n\t"
+ "mov x10, xzr\n\t"
+ "mov x11, xzr\n\t"
+ "mov x12, xzr\n\t"
+ "mov x13, xzr\n\t"
+ "mov x14, xzr\n\t"
+ "mov x15, xzr\n\t"
+ "cmp %x[r], #1\n\t"
+ "ldp x16, x17, [%x[base]]\n\t"
+ "ldp x19, x20, [%x[base], #16]\n\t"
+ "ldp x21, x22, [%x[base], #32]\n\t"
+ "ldp x23, x24, [%x[base], #48]\n\t"
+ "ldp x25, x26, [%x[base], #64]\n\t"
+ "ldp x27, x28, [%x[base], #80]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #2\n\t"
+ "ldp x16, x17, [%x[base], #96]\n\t"
+ "ldp x19, x20, [%x[base], #112]\n\t"
+ "ldp x21, x22, [%x[base], #128]\n\t"
+ "ldp x23, x24, [%x[base], #144]\n\t"
+ "ldp x25, x26, [%x[base], #160]\n\t"
+ "ldp x27, x28, [%x[base], #176]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #3\n\t"
+ "ldp x16, x17, [%x[base], #192]\n\t"
+ "ldp x19, x20, [%x[base], #208]\n\t"
+ "ldp x21, x22, [%x[base], #224]\n\t"
+ "ldp x23, x24, [%x[base], #240]\n\t"
+ "ldp x25, x26, [%x[base], #256]\n\t"
+ "ldp x27, x28, [%x[base], #272]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #4\n\t"
+ "ldp x16, x17, [%x[base], #288]\n\t"
+ "ldp x19, x20, [%x[base], #304]\n\t"
+ "ldp x21, x22, [%x[base], #320]\n\t"
+ "ldp x23, x24, [%x[base], #336]\n\t"
+ "ldp x25, x26, [%x[base], #352]\n\t"
+ "ldp x27, x28, [%x[base], #368]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "add %x[base], %x[base], #0x180\n\t"
+ "cmp %x[r], #5\n\t"
+ "ldp x16, x17, [%x[base]]\n\t"
+ "ldp x19, x20, [%x[base], #16]\n\t"
+ "ldp x21, x22, [%x[base], #32]\n\t"
+ "ldp x23, x24, [%x[base], #48]\n\t"
+ "ldp x25, x26, [%x[base], #64]\n\t"
+ "ldp x27, x28, [%x[base], #80]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #6\n\t"
+ "ldp x16, x17, [%x[base], #96]\n\t"
+ "ldp x19, x20, [%x[base], #112]\n\t"
+ "ldp x21, x22, [%x[base], #128]\n\t"
+ "ldp x23, x24, [%x[base], #144]\n\t"
+ "ldp x25, x26, [%x[base], #160]\n\t"
+ "ldp x27, x28, [%x[base], #176]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #7\n\t"
+ "ldp x16, x17, [%x[base], #192]\n\t"
+ "ldp x19, x20, [%x[base], #208]\n\t"
+ "ldp x21, x22, [%x[base], #224]\n\t"
+ "ldp x23, x24, [%x[base], #240]\n\t"
+ "ldp x25, x26, [%x[base], #256]\n\t"
+ "ldp x27, x28, [%x[base], #272]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #8\n\t"
+ "ldp x16, x17, [%x[base], #288]\n\t"
+ "ldp x19, x20, [%x[base], #304]\n\t"
+ "ldp x21, x22, [%x[base], #320]\n\t"
+ "ldp x23, x24, [%x[base], #336]\n\t"
+ "ldp x25, x26, [%x[base], #352]\n\t"
+ "ldp x27, x28, [%x[base], #368]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "mov x16, #-19\n\t"
+ "mov x17, #-1\n\t"
+ "mov x19, #-1\n\t"
+ "mov x20, #0x7fffffffffffffff\n\t"
+ "subs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "sbcs x19, x19, x14\n\t"
+ "sbc x20, x20, x15\n\t"
+ "cmp %x[b], #0\n\t"
+ "mov x3, x4\n\t"
+ "csel x4, x8, x4, lt\n\t"
+ "csel x8, x3, x8, lt\n\t"
+ "mov x3, x5\n\t"
+ "csel x5, x9, x5, lt\n\t"
+ "csel x9, x3, x9, lt\n\t"
+ "mov x3, x6\n\t"
+ "csel x6, x10, x6, lt\n\t"
+ "csel x10, x3, x10, lt\n\t"
+ "mov x3, x7\n\t"
+ "csel x7, x11, x7, lt\n\t"
+ "csel x11, x3, x11, lt\n\t"
+ "csel x12, x16, x12, lt\n\t"
+ "csel x13, x17, x13, lt\n\t"
+ "csel x14, x19, x14, lt\n\t"
+ "csel x15, x20, x15, lt\n\t"
+ "ldr %x[r], [x29, #16]\n\t"
+ "stp x4, x5, [%x[r]]\n\t"
+ "stp x6, x7, [%x[r], #16]\n\t"
+ "stp x8, x9, [%x[r], #32]\n\t"
+ "stp x10, x11, [%x[r], #48]\n\t"
+ "stp x12, x13, [%x[r], #64]\n\t"
+ "stp x14, x15, [%x[r], #80]\n\t"
+ "ldp x29, x30, [sp], #32\n\t"
+ : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+void fe_mul(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Multiply */
+ "ldp x14, x15, [%x[a]]\n\t"
+ "ldp x16, x17, [%x[a], #16]\n\t"
+ "ldp x19, x20, [%x[b]]\n\t"
+ "ldp x21, x22, [%x[b], #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x6, x14, x19\n\t"
+ "umulh x7, x14, x19\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x20\n\t"
+ "umulh x8, x14, x20\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x19\n\t"
+ "umulh x4, x15, x19\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x21\n\t"
+ "umulh x4, x14, x21\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x20\n\t"
+ "umulh x4, x15, x20\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x19\n\t"
+ "umulh x4, x16, x19\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x22\n\t"
+ "umulh x4, x14, x22\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x21\n\t"
+ "umulh x4, x15, x21\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x20\n\t"
+ "umulh x4, x16, x20\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x19\n\t"
+ "umulh x4, x17, x19\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x22\n\t"
+ "umulh x4, x15, x22\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x21\n\t"
+ "umulh x4, x16, x21\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x20\n\t"
+ "umulh x4, x17, x20\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x22\n\t"
+ "umulh x4, x16, x22\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x21\n\t"
+ "umulh x4, x17, x21\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x22\n\t"
+ "umulh x4, x17, x22\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x10\n\t"
+ "umulh x10, x3, x10\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x11\n\t"
+ "umulh x11, x3, x11\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x12\n\t"
+ "umulh x12, x3, x12\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x13\n\t"
+ "umulh x5, x3, x13\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, x10\n\t"
+ "adcs x8, x8, x11\n\t"
+ "adcs x9, x9, x12\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22"
+ );
+}
+
+void fe_sq(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Square */
+ "ldp x13, x14, [%x[a]]\n\t"
+ "ldp x15, x16, [%x[a], #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x6, x13, x14\n\t"
+ "umulh x7, x13, x14\n\t"
+ /* A[0] * A[2] */
+ "mul x2, x13, x15\n\t"
+ "umulh x8, x13, x15\n\t"
+ "adds x7, x7, x2\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x2, x13, x16\n\t"
+ "umulh x9, x13, x16\n\t"
+ "adds x8, x8, x2\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x2, x14, x15\n\t"
+ "umulh x3, x14, x15\n\t"
+ "adds x8, x8, x2\n\t"
+ "adcs x9, x9, x3\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x2, x14, x16\n\t"
+ "umulh x3, x14, x16\n\t"
+ "adds x9, x9, x2\n\t"
+ "adc x10, x10, x3\n\t"
+ /* A[2] * A[3] */
+ "mul x2, x15, x16\n\t"
+ "umulh x11, x15, x16\n\t"
+ "adds x10, x10, x2\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Double */
+ "adds x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adcs x11, x11, x11\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x5, x13, x13\n\t"
+ "umulh x4, x13, x13\n\t"
+ /* A[1] * A[1] */
+ "mul x2, x14, x14\n\t"
+ "umulh x3, x14, x14\n\t"
+ "adds x6, x6, x4\n\t"
+ "adcs x7, x7, x2\n\t"
+ "adc x4, x3, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x2, x15, x15\n\t"
+ "umulh x3, x15, x15\n\t"
+ "adds x8, x8, x4\n\t"
+ "adcs x9, x9, x2\n\t"
+ "adc x4, x3, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x2, x16, x16\n\t"
+ "umulh x3, x16, x16\n\t"
+ "adds x10, x10, x4\n\t"
+ "adcs x11, x11, x2\n\t"
+ "adc x12, x12, x3\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x12, x12, x11, #63\n\t"
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x2, #19\n\t"
+ "mul x3, x2, x9\n\t"
+ "umulh x9, x2, x9\n\t"
+ "adds x5, x5, x3\n\t"
+ "mul x3, x2, x10\n\t"
+ "umulh x10, x2, x10\n\t"
+ "adcs x6, x6, x3\n\t"
+ "mul x3, x2, x11\n\t"
+ "umulh x11, x2, x11\n\t"
+ "adcs x7, x7, x3\n\t"
+ "mul x3, x2, x12\n\t"
+ "umulh x4, x2, x12\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x4, x4, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adcs x8, x8, x11\n\t"
+ "adc x4, x4, xzr\n\t"
+ /* Overflow */
+ "extr x4, x4, x8, #63\n\t"
+ "mul x4, x4, x2\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ "adds x5, x5, x4\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x4, x2, x8, asr 63\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ "adds x5, x5, x4\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* Store */
+ "stp x5, x6, [%x[r]]\n\t"
+ "stp x7, x8, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16"
+ );
+}
+
+void fe_invert(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-160]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Invert */
+ "str %x[r], [x29, #144]\n\t"
+ "str %x[a], [x29, #152]\n\t"
+ "add x0, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "ldr x1, [x29, #152]\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #16\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #4\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert1_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #9\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert2_%=\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #19\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_fe_invert3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert3_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #10\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert4_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #49\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert5_%=\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #0x63\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_fe_invert6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert6_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #50\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert7_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #5\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_invert8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert8_%=\n\t"
+ "ldr x0, [x29, #144]\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "ldp x29, x30, [sp], #0xa0\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x20"
+ );
+}
+
+int curve25519(byte* r, byte* n, byte* a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-192]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x23, xzr\n\t"
+ "str %x[r], [x29, #176]\n\t"
+ "str %x[a], [x29, #184]\n\t"
+ /* Copy */
+ "ldp x6, x7, [%x[a]]\n\t"
+ "ldp x8, x9, [%x[a], #16]\n\t"
+ "stp x6, x7, [x29, #80]\n\t"
+ "stp x8, x9, [x29, #96]\n\t"
+ /* Set one */
+ "mov %x[a], #1\n\t"
+ "stp %x[a], xzr, [%x[r]]\n\t"
+ "stp xzr, xzr, [%x[r], #16]\n\t"
+ /* Set zero */
+ "stp xzr, xzr, [x29, #16]\n\t"
+ "stp xzr, xzr, [x29, #32]\n\t"
+ /* Set one */
+ "mov %x[a], #1\n\t"
+ "stp %x[a], xzr, [x29, #48]\n\t"
+ "stp xzr, xzr, [x29, #64]\n\t"
+ "mov x25, #62\n\t"
+ "mov x24, #24\n\t"
+ "\n"
+ "L_curve25519_words_%=: \n\t"
+ "\n"
+ "L_curve25519_bits_%=: \n\t"
+ "ldr %x[a], [%x[n], x24]\n\t"
+ "lsr %x[a], %x[a], x25\n\t"
+ "and %x[a], %x[a], #1\n\t"
+ "eor x23, x23, %x[a]\n\t"
+ /* Conditional Swap */
+ "cmp x23, #1\n\t"
+ "ldp x10, x11, [%x[r]]\n\t"
+ "ldp x12, x13, [%x[r], #16]\n\t"
+ "ldp x6, x7, [x29, #80]\n\t"
+ "ldp x8, x9, [x29, #96]\n\t"
+ "csel x14, x10, x6, eq\n\t"
+ "csel x10, x6, x10, eq\n\t"
+ "csel x15, x11, x7, eq\n\t"
+ "csel x11, x7, x11, eq\n\t"
+ "csel x16, x12, x8, eq\n\t"
+ "csel x12, x8, x12, eq\n\t"
+ "csel x17, x13, x9, eq\n\t"
+ "csel x13, x9, x13, eq\n\t"
+ /* Conditional Swap */
+ "cmp x23, #1\n\t"
+ "ldp x19, x20, [x29, #16]\n\t"
+ "ldp x21, x22, [x29, #32]\n\t"
+ "ldp x6, x7, [x29, #48]\n\t"
+ "ldp x8, x9, [x29, #64]\n\t"
+ "csel x5, x19, x6, eq\n\t"
+ "csel x19, x6, x19, eq\n\t"
+ "csel x26, x20, x7, eq\n\t"
+ "csel x20, x7, x20, eq\n\t"
+ "csel x27, x21, x8, eq\n\t"
+ "csel x21, x8, x21, eq\n\t"
+ "csel x28, x22, x9, eq\n\t"
+ "csel x22, x9, x22, eq\n\t"
+ "mov x23, %x[a]\n\t"
+ /* Add */
+ "adds x6, x10, x19\n\t"
+ "adcs x7, x11, x20\n\t"
+ "adcs x8, x12, x21\n\t"
+ "adc x9, x13, x22\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x9, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x6, x6, x3\n\t"
+ "sbcs x7, x7, %x[a]\n\t"
+ "sbcs x8, x8, %x[a]\n\t"
+ "sbc x9, x9, x4\n\t"
+ /* Sub */
+ "subs x19, x10, x19\n\t"
+ "sbcs x20, x11, x20\n\t"
+ "sbcs x21, x12, x21\n\t"
+ "sbcs x22, x13, x22\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, %x[a]\n\t"
+ "adcs x21, x21, %x[a]\n\t"
+ "adc x22, x22, x4\n\t"
+ "stp x19, x20, [x29, #144]\n\t"
+ "stp x21, x22, [x29, #160]\n\t"
+ /* Add */
+ "adds x10, x14, x5\n\t"
+ "adcs x11, x15, x26\n\t"
+ "adcs x12, x16, x27\n\t"
+ "adc x13, x17, x28\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Sub */
+ "subs x14, x14, x5\n\t"
+ "sbcs x15, x15, x26\n\t"
+ "sbcs x16, x16, x27\n\t"
+ "sbcs x17, x17, x28\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, %x[a]\n\t"
+ "adcs x16, x16, %x[a]\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x19, x14, x6\n\t"
+ "umulh x20, x14, x6\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x7\n\t"
+ "umulh x21, x14, x7\n\t"
+ "adds x20, x20, x3\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x6\n\t"
+ "umulh x4, x15, x6\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x8\n\t"
+ "umulh x4, x14, x8\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x7\n\t"
+ "umulh x4, x15, x7\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x6\n\t"
+ "umulh x4, x16, x6\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x9\n\t"
+ "umulh x4, x14, x9\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x8\n\t"
+ "umulh x4, x15, x8\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x7\n\t"
+ "umulh x4, x16, x7\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x6\n\t"
+ "umulh x4, x17, x6\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x9\n\t"
+ "umulh x4, x15, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x8\n\t"
+ "umulh x4, x16, x8\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x7\n\t"
+ "umulh x4, x17, x7\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x9\n\t"
+ "umulh x4, x16, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x8\n\t"
+ "umulh x4, x17, x8\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x9\n\t"
+ "umulh x4, x17, x9\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x22, #63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x19, x19, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x20, x20, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x21, x21, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x20, x20, %x[a]\n\t"
+ "adcs x21, x21, x26\n\t"
+ "adcs x22, x22, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x22, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x22, asr 63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Store */
+ "stp x19, x20, [x29, #112]\n\t"
+ "stp x21, x22, [x29, #128]\n\t"
+ /* Multiply */
+ "ldp %x[a], x26, [x29, #144]\n\t"
+ "ldp x27, x28, [x29, #160]\n\t"
+ /* A[0] * B[0] */
+ "mul x19, x10, %x[a]\n\t"
+ "umulh x20, x10, %x[a]\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x10, x26\n\t"
+ "umulh x21, x10, x26\n\t"
+ "adds x20, x20, x3\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x11, %x[a]\n\t"
+ "umulh x4, x11, %x[a]\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x10, x27\n\t"
+ "umulh x4, x10, x27\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x11, x26\n\t"
+ "umulh x4, x11, x26\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x12, %x[a]\n\t"
+ "umulh x4, x12, %x[a]\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x10, x28\n\t"
+ "umulh x4, x10, x28\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x11, x27\n\t"
+ "umulh x4, x11, x27\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x12, x26\n\t"
+ "umulh x4, x12, x26\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x13, %x[a]\n\t"
+ "umulh x4, x13, %x[a]\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x11, x28\n\t"
+ "umulh x4, x11, x28\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x12, x27\n\t"
+ "umulh x4, x12, x27\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x13, x26\n\t"
+ "umulh x4, x13, x26\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x12, x28\n\t"
+ "umulh x4, x12, x28\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x13, x27\n\t"
+ "umulh x4, x13, x27\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x13, x28\n\t"
+ "umulh x4, x13, x28\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x22, #63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x14\n\t"
+ "umulh x14, x3, x14\n\t"
+ "adds x19, x19, x4\n\t"
+ "mul x4, x3, x15\n\t"
+ "umulh x15, x3, x15\n\t"
+ "adcs x20, x20, x4\n\t"
+ "mul x4, x3, x16\n\t"
+ "umulh x16, x3, x16\n\t"
+ "adcs x21, x21, x4\n\t"
+ "mul x4, x3, x17\n\t"
+ "umulh x5, x3, x17\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x20, x20, x14\n\t"
+ "adcs x21, x21, x15\n\t"
+ "adcs x22, x22, x16\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x22, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x22, asr 63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Store */
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x11, %x[a], x26\n\t"
+ "umulh x12, %x[a], x26\n\t"
+ /* A[0] * A[2] */
+ "mul x3, %x[a], x27\n\t"
+ "umulh x13, %x[a], x27\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, %x[a], x28\n\t"
+ "umulh x14, %x[a], x28\n\t"
+ "adds x13, x13, x3\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x26, x27\n\t"
+ "umulh x4, x26, x27\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x26, x28\n\t"
+ "umulh x4, x26, x28\n\t"
+ "adds x14, x14, x3\n\t"
+ "adc x15, x15, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x27, x28\n\t"
+ "umulh x16, x27, x28\n\t"
+ "adds x15, x15, x3\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* Double */
+ "adds x11, x11, x11\n\t"
+ "adcs x12, x12, x12\n\t"
+ "adcs x13, x13, x13\n\t"
+ "adcs x14, x14, x14\n\t"
+ "adcs x15, x15, x15\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x10, %x[a], %x[a]\n\t"
+ "umulh x5, %x[a], %x[a]\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x26, x26\n\t"
+ "umulh x4, x26, x26\n\t"
+ "adds x11, x11, x5\n\t"
+ "adcs x12, x12, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x27, x27\n\t"
+ "umulh x4, x27, x27\n\t"
+ "adds x13, x13, x5\n\t"
+ "adcs x14, x14, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x28, x28\n\t"
+ "umulh x4, x28, x28\n\t"
+ "adds x15, x15, x5\n\t"
+ "adcs x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x14\n\t"
+ "umulh x14, x3, x14\n\t"
+ "adds x10, x10, x4\n\t"
+ "mul x4, x3, x15\n\t"
+ "umulh x15, x3, x15\n\t"
+ "adcs x11, x11, x4\n\t"
+ "mul x4, x3, x16\n\t"
+ "umulh x16, x3, x16\n\t"
+ "adcs x12, x12, x4\n\t"
+ "mul x4, x3, x17\n\t"
+ "umulh x5, x3, x17\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x11, x11, x14\n\t"
+ "adcs x12, x12, x15\n\t"
+ "adcs x13, x13, x16\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x13, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x13, asr 63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Store */
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x15, x6, x7\n\t"
+ "umulh x16, x6, x7\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x6, x8\n\t"
+ "umulh x17, x6, x8\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x6, x9\n\t"
+ "umulh %x[a], x6, x9\n\t"
+ "adds x17, x17, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x7, x8\n\t"
+ "umulh x4, x7, x8\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x7, x9\n\t"
+ "umulh x4, x7, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x8, x9\n\t"
+ "umulh x27, x8, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x15, x15, x15\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adcs x17, x17, x17\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x14, x6, x6\n\t"
+ "umulh x5, x6, x6\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x7, x7\n\t"
+ "umulh x4, x7, x7\n\t"
+ "adds x15, x15, x5\n\t"
+ "adcs x16, x16, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x8, x8\n\t"
+ "umulh x4, x8, x8\n\t"
+ "adds x17, x17, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x9, x9\n\t"
+ "umulh x4, x9, x9\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x17, #63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x14, x14, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x15, x15, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x16, x16, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x15, x15, %x[a]\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adcs x17, x17, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x17, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x17, asr 63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Store */
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x6, x14, x10\n\t"
+ "umulh x7, x14, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x11\n\t"
+ "umulh x8, x14, x11\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x10\n\t"
+ "umulh x4, x15, x10\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x12\n\t"
+ "umulh x4, x14, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x10\n\t"
+ "umulh x4, x16, x10\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x13\n\t"
+ "umulh x4, x14, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x12\n\t"
+ "umulh x4, x15, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x11\n\t"
+ "umulh x4, x16, x11\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x10\n\t"
+ "umulh x4, x17, x10\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x13\n\t"
+ "umulh x4, x15, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x12\n\t"
+ "umulh x4, x16, x12\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x11\n\t"
+ "umulh x4, x17, x11\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x13\n\t"
+ "umulh x4, x16, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x12\n\t"
+ "umulh x4, x17, x12\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x13\n\t"
+ "umulh x4, x17, x13\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ /* Sub */
+ "subs x14, x14, x10\n\t"
+ "sbcs x15, x15, x11\n\t"
+ "sbcs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, %x[a]\n\t"
+ "adcs x16, x16, %x[a]\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Multiply by 121666 */
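+ /* 121666 = (A + 2) / 4 for the curve constant A = 486662, as used in the x-only doubling formula. */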
+ "mov x5, #0xdb42\n\t"
+ "movk x5, #1, lsl 16\n\t"
+ "mul x6, x14, x5\n\t"
+ "umulh x7, x14, x5\n\t"
+ "mul x3, x15, x5\n\t"
+ "umulh x4, x15, x5\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, xzr, x4\n\t"
+ "mul x3, x16, x5\n\t"
+ "umulh x4, x16, x5\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, xzr, x4\n\t"
+ "mul x3, x17, x5\n\t"
+ "umulh x4, x17, x5\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc x4, xzr, x4\n\t"
+ "mov x5, #19\n\t"
+ "extr x4, x4, x9, #63\n\t"
+ "mul x4, x4, x5\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x4\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Add */
+ "adds x10, x10, x6\n\t"
+ "adcs x11, x11, x7\n\t"
+ "adcs x12, x12, x8\n\t"
+ "adc x13, x13, x9\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x6, x14, x10\n\t"
+ "umulh x7, x14, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x11\n\t"
+ "umulh x8, x14, x11\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x10\n\t"
+ "umulh x4, x15, x10\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x12\n\t"
+ "umulh x4, x14, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x10\n\t"
+ "umulh x4, x16, x10\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x13\n\t"
+ "umulh x4, x14, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x12\n\t"
+ "umulh x4, x15, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x11\n\t"
+ "umulh x4, x16, x11\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x10\n\t"
+ "umulh x4, x17, x10\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x13\n\t"
+ "umulh x4, x15, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x12\n\t"
+ "umulh x4, x16, x12\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x11\n\t"
+ "umulh x4, x17, x11\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x13\n\t"
+ "umulh x4, x16, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x12\n\t"
+ "umulh x4, x17, x12\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x13\n\t"
+ "umulh x4, x17, x13\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [x29, #16]\n\t"
+ "stp x8, x9, [x29, #32]\n\t"
+ /* Add */
+ "ldp x6, x7, [x29, #112]\n\t"
+ "ldp x8, x9, [x29, #128]\n\t"
+ "adds x10, x6, x19\n\t"
+ "adcs x11, x7, x20\n\t"
+ "adcs x12, x8, x21\n\t"
+ "adc x13, x9, x22\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Sub */
+ "subs x19, x6, x19\n\t"
+ "sbcs x20, x7, x20\n\t"
+ "sbcs x21, x8, x21\n\t"
+ "sbcs x22, x9, x22\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, %x[a]\n\t"
+ "adcs x21, x21, %x[a]\n\t"
+ "adc x22, x22, x4\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x7, x10, x11\n\t"
+ "umulh x8, x10, x11\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x10, x12\n\t"
+ "umulh x9, x10, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x10, x13\n\t"
+ "umulh %x[a], x10, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x11, x12\n\t"
+ "umulh x4, x11, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x11, x13\n\t"
+ "umulh x4, x11, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x12, x13\n\t"
+ "umulh x27, x12, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x6, x10, x10\n\t"
+ "umulh x5, x10, x10\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x11, x11\n\t"
+ "umulh x4, x11, x11\n\t"
+ "adds x7, x7, x5\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x12, x12\n\t"
+ "umulh x4, x12, x12\n\t"
+ "adds x9, x9, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x13, x13\n\t"
+ "umulh x4, x13, x13\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [x29, #80]\n\t"
+ "stp x8, x9, [x29, #96]\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x7, x19, x20\n\t"
+ "umulh x8, x19, x20\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x19, x21\n\t"
+ "umulh x9, x19, x21\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x19, x22\n\t"
+ "umulh %x[a], x19, x22\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x20, x21\n\t"
+ "umulh x4, x20, x21\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x20, x22\n\t"
+ "umulh x4, x20, x22\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x21, x22\n\t"
+ "umulh x27, x21, x22\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x6, x19, x19\n\t"
+ "umulh x5, x19, x19\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x20, x20\n\t"
+ "umulh x4, x20, x20\n\t"
+ "adds x7, x7, x5\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x21, x21\n\t"
+ "umulh x4, x21, x21\n\t"
+ "adds x9, x9, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x22, x22\n\t"
+ "umulh x4, x22, x22\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "ldr %x[a], [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x14, x15, [%x[a]]\n\t"
+ "ldp x16, x17, [%x[a], #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x10, x14, x6\n\t"
+ "umulh x11, x14, x6\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x7\n\t"
+ "umulh x12, x14, x7\n\t"
+ "adds x11, x11, x3\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x6\n\t"
+ "umulh x4, x15, x6\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x8\n\t"
+ "umulh x4, x14, x8\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x7\n\t"
+ "umulh x4, x15, x7\n\t"
+ "adds x12, x12, x3\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x6\n\t"
+ "umulh x4, x16, x6\n\t"
+ "adds x12, x12, x3\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x9\n\t"
+ "umulh x4, x14, x9\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x8\n\t"
+ "umulh x4, x15, x8\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x7\n\t"
+ "umulh x4, x16, x7\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x6\n\t"
+ "umulh x4, x17, x6\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x9\n\t"
+ "umulh x4, x15, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x8\n\t"
+ "umulh x4, x16, x8\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x7\n\t"
+ "umulh x4, x17, x7\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x9\n\t"
+ "umulh x4, x16, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x8\n\t"
+ "umulh x4, x17, x8\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x9\n\t"
+ "umulh x4, x17, x9\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x13, #63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x10, x10, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x11, x11, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x12, x12, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x11, x11, %x[a]\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adcs x13, x13, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x13, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x13, asr 63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Store */
+ "stp x10, x11, [x29, #48]\n\t"
+ "stp x12, x13, [x29, #64]\n\t"
+ "sub x25, x25, #1\n\t"
+ "cmp x25, #0\n\t"
+ "bge L_curve25519_bits_%=\n\t"
+ "mov x25, #63\n\t"
+ "sub x24, x24, #8\n\t"
+ "cmp x24, #0\n\t"
+ "bge L_curve25519_words_%=\n\t"
+ /* Invert */
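+ /* z^-1 = z^(p-2) (Fermat), computed with the standard fixed fe_sq/fe_mul chain; the multiply below then gives the affine result X/Z */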
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #0x50\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #4\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_1_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #9\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_2_%=\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x90\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #19\n\t"
+ "add x1, x29, #0x90\n\t"
+ "\n"
+ "L_curve25519_inv_3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_3_%=\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #10\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_4_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #49\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_5_%=\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x90\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #0x63\n\t"
+ "add x1, x29, #0x90\n\t"
+ "\n"
+ "L_curve25519_inv_6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_6_%=\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #50\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_7_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #5\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_curve25519_inv_8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_8_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "ldr %x[r], [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x6, x7, [%x[r]]\n\t"
+ "ldp x8, x9, [%x[r], #16]\n\t"
+ "ldp x10, x11, [x29, #16]\n\t"
+ "ldp x12, x13, [x29, #32]\n\t"
+ /* A[0] * B[0] */
+ "mul x14, x6, x10\n\t"
+ "umulh x15, x6, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x6, x11\n\t"
+ "umulh x16, x6, x11\n\t"
+ "adds x15, x15, x3\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x7, x10\n\t"
+ "umulh x4, x7, x10\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x6, x12\n\t"
+ "umulh x4, x6, x12\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x7, x11\n\t"
+ "umulh x4, x7, x11\n\t"
+ "adds x16, x16, x3\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x8, x10\n\t"
+ "umulh x4, x8, x10\n\t"
+ "adds x16, x16, x3\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x6, x13\n\t"
+ "umulh x4, x6, x13\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x7, x12\n\t"
+ "umulh x4, x7, x12\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x8, x11\n\t"
+ "umulh x4, x8, x11\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x9, x10\n\t"
+ "umulh x4, x9, x10\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x7, x13\n\t"
+ "umulh x4, x7, x13\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x8, x12\n\t"
+ "umulh x4, x8, x12\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x9, x11\n\t"
+ "umulh x4, x9, x11\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x8, x13\n\t"
+ "umulh x4, x8, x13\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x9, x12\n\t"
+ "umulh x4, x9, x12\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x9, x13\n\t"
+ "umulh x4, x9, x13\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x22, x22, x21, #63\n\t"
+ "extr x21, x21, x20, #63\n\t"
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x19\n\t"
+ "umulh x19, x3, x19\n\t"
+ "adds x14, x14, x4\n\t"
+ "mul x4, x3, x20\n\t"
+ "umulh x20, x3, x20\n\t"
+ "adcs x15, x15, x4\n\t"
+ "mul x4, x3, x21\n\t"
+ "umulh x21, x3, x21\n\t"
+ "adcs x16, x16, x4\n\t"
+ "mul x4, x3, x22\n\t"
+ "umulh x5, x3, x22\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x15, x15, x19\n\t"
+ "adcs x16, x16, x20\n\t"
+ "adcs x17, x17, x21\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x17, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x17, asr 63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Store */
+ "stp x14, x15, [%x[r]]\n\t"
+ "stp x16, x17, [%x[r], #16]\n\t"
+ "mov x0, xzr\n\t"
+ "ldp x29, x30, [sp], #0xc0\n\t"
+ : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ return (uint32_t)(size_t)r;
+}
+
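+/* r = a^((p-5)/8) = a^(2^252-3) mod p, where p = 2^255-19, computed with a
+ * fixed chain of fe_sq and fe_mul calls; used when taking the square roots
+ * needed for point decompression.
+ */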
+void fe_pow22523(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-128]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* pow22523 */
+ "str %x[r], [x29, #112]\n\t"
+ "str %x[a], [x29, #120]\n\t"
+ "add x0, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "ldr x1, [x29, #120]\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #16\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #4\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_1_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #9\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_2_%=\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #19\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_pow22523_3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_3_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #10\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_4_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #49\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_5_%=\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #0x63\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_pow22523_6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_6_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #50\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_7_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #2\n\t"
+ "add x1, x29, #16\n\t"
+ "\n"
+ "L_fe_pow22523_8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_8_%=\n\t"
+ "ldr x0, [x29, #112]\n\t"
+ "ldr x2, [x29, #120]\n\t"
+ "bl fe_mul\n\t"
+ "ldp x29, x30, [sp], #0x80\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x21"
+ );
+}
+
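+/* Convert a point from completed (P1P1) to projective (P2) coordinates.
+ * In effect: rx = px*pt, ry = py*pz, rz = pz*pt (mod 2^255-19), each product
+ * being a 4x4 64-bit limb multiply followed by reduction mod p.
+ */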
+void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-64]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[ry], [x29, #16]\n\t"
+ "str %x[rz], [x29, #24]\n\t"
+ "str %x[px], [x29, #32]\n\t"
+ "str %x[py], [x29, #40]\n\t"
+ "str %x[pz], [x29, #48]\n\t"
+ "str %x[pt], [x29, #56]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x11, x17\n\t"
+ "umulh x21, x11, x17\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x12, x16\n\t"
+ "umulh x21, x12, x16\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x13, x15\n\t"
+ "umulh x21, x13, x15\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x11, x19\n\t"
+ "umulh x21, x11, x19\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x12, x17\n\t"
+ "umulh x21, x12, x17\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x13, x16\n\t"
+ "umulh x21, x13, x16\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x14, x15\n\t"
+ "umulh x21, x14, x15\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x12, x19\n\t"
+ "umulh x21, x12, x19\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x13, x17\n\t"
+ "umulh x21, x13, x17\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x14, x16\n\t"
+ "umulh x21, x14, x16\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x13, x19\n\t"
+ "umulh x21, x13, x19\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x14, x17\n\t"
+ "umulh x21, x14, x17\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x14, x19\n\t"
+ "umulh x21, x14, x19\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ "ldr x2, [x29, #48]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x11, x17\n\t"
+ "umulh x21, x11, x17\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x12, x16\n\t"
+ "umulh x21, x12, x16\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x13, x15\n\t"
+ "umulh x21, x13, x15\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x11, x19\n\t"
+ "umulh x21, x11, x19\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x12, x17\n\t"
+ "umulh x21, x12, x17\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x13, x16\n\t"
+ "umulh x21, x13, x16\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x14, x15\n\t"
+ "umulh x21, x14, x15\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x12, x19\n\t"
+ "umulh x21, x12, x19\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x13, x17\n\t"
+ "umulh x21, x13, x17\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x14, x16\n\t"
+ "umulh x21, x14, x16\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x13, x19\n\t"
+ "umulh x21, x13, x19\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x14, x17\n\t"
+ "umulh x21, x14, x17\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x14, x19\n\t"
+ "umulh x21, x14, x19\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x2]\n\t"
+ "ldp x13, x14, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x15, x12\n\t"
+ "umulh x5, x15, x12\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x16, x11\n\t"
+ "umulh x21, x16, x11\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x15, x13\n\t"
+ "umulh x21, x15, x13\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x16, x12\n\t"
+ "umulh x21, x16, x12\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x17, x11\n\t"
+ "umulh x21, x17, x11\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x15, x14\n\t"
+ "umulh x21, x15, x14\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x16, x13\n\t"
+ "umulh x21, x16, x13\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x17, x12\n\t"
+ "umulh x21, x17, x12\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x19, x11\n\t"
+ "umulh x21, x19, x11\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x16, x14\n\t"
+ "umulh x21, x16, x14\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x17, x13\n\t"
+ "umulh x21, x17, x13\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x19, x12\n\t"
+ "umulh x21, x19, x12\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x17, x14\n\t"
+ "umulh x21, x17, x14\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x19, x13\n\t"
+ "umulh x21, x19, x13\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x19, x14\n\t"
+ "umulh x21, x19, x14\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x40\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22"
+ );
+}
+
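+/* Convert a point from completed (P1P1) to extended (P3) coordinates.
+ * In effect: rx = px*pt, rt = px*py, ry = py*pz, rz = pz*pt (mod 2^255-19).
+ */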
+void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[ry], [x29, #16]\n\t"
+ "str %x[rz], [x29, #24]\n\t"
+ "str %x[rt], [x29, #32]\n\t"
+ "str %x[px], [x29, #40]\n\t"
+ "str %x[py], [x29, #48]\n\t"
+ "str %x[pz], [x29, #56]\n\t"
+ "str %x[pt], [x29, #64]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ "ldr x2, [x29, #64]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x15\n\t"
+ "umulh x25, x12, x15\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x17\n\t"
+ "umulh x25, x11, x17\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x16\n\t"
+ "umulh x25, x12, x16\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x15\n\t"
+ "umulh x25, x13, x15\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x19\n\t"
+ "umulh x25, x11, x19\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x17\n\t"
+ "umulh x25, x12, x17\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x16\n\t"
+ "umulh x25, x13, x16\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x15\n\t"
+ "umulh x25, x14, x15\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x19\n\t"
+ "umulh x25, x12, x19\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x17\n\t"
+ "umulh x25, x13, x17\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x16\n\t"
+ "umulh x25, x14, x16\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x19\n\t"
+ "umulh x25, x13, x19\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x17\n\t"
+ "umulh x25, x14, x17\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x19\n\t"
+ "umulh x25, x14, x19\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #48]\n\t"
+ /* Multiply */
+ "ldp x20, x21, [x2]\n\t"
+ "ldp x22, x23, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x20\n\t"
+ "umulh x4, x11, x20\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x21\n\t"
+ "umulh x5, x11, x21\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x20\n\t"
+ "umulh x25, x12, x20\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x22\n\t"
+ "umulh x25, x11, x22\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x21\n\t"
+ "umulh x25, x12, x21\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x20\n\t"
+ "umulh x25, x13, x20\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x23\n\t"
+ "umulh x25, x11, x23\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x22\n\t"
+ "umulh x25, x12, x22\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x21\n\t"
+ "umulh x25, x13, x21\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x20\n\t"
+ "umulh x25, x14, x20\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x23\n\t"
+ "umulh x25, x12, x23\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x22\n\t"
+ "umulh x25, x13, x22\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x21\n\t"
+ "umulh x25, x14, x21\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x23\n\t"
+ "umulh x25, x13, x23\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x22\n\t"
+ "umulh x25, x14, x22\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x23\n\t"
+ "umulh x25, x14, x23\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x2]\n\t"
+ "ldp x13, x14, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x20, x11\n\t"
+ "umulh x4, x20, x11\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x20, x12\n\t"
+ "umulh x5, x20, x12\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x21, x11\n\t"
+ "umulh x25, x21, x11\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x20, x13\n\t"
+ "umulh x25, x20, x13\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x21, x12\n\t"
+ "umulh x25, x21, x12\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x22, x11\n\t"
+ "umulh x25, x22, x11\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x20, x14\n\t"
+ "umulh x25, x20, x14\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x21, x13\n\t"
+ "umulh x25, x21, x13\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x22, x12\n\t"
+ "umulh x25, x22, x12\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x23, x11\n\t"
+ "umulh x25, x23, x11\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x21, x14\n\t"
+ "umulh x25, x21, x14\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x22, x13\n\t"
+ "umulh x25, x22, x13\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x23, x12\n\t"
+ "umulh x25, x23, x12\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x22, x14\n\t"
+ "umulh x25, x22, x14\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x23, x13\n\t"
+ "umulh x25, x23, x13\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x23, x14\n\t"
+ "umulh x25, x23, x14\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x15\n\t"
+ "umulh x25, x12, x15\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x17\n\t"
+ "umulh x25, x11, x17\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x16\n\t"
+ "umulh x25, x12, x16\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x15\n\t"
+ "umulh x25, x13, x15\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x19\n\t"
+ "umulh x25, x11, x19\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x17\n\t"
+ "umulh x25, x12, x17\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x16\n\t"
+ "umulh x25, x13, x16\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x15\n\t"
+ "umulh x25, x14, x15\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x19\n\t"
+ "umulh x25, x12, x19\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x17\n\t"
+ "umulh x25, x13, x17\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x16\n\t"
+ "umulh x25, x14, x16\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x19\n\t"
+ "umulh x25, x13, x19\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x17\n\t"
+ "umulh x25, x14, x17\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x19\n\t"
+ "umulh x25, x14, x19\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ );
+}
+
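+/* Double a point given in projective (P2) coordinates, producing a completed
+ * (P1P1) result (the standard ge_p2_dbl formulas):
+ *   ry = px^2 + py^2, rz = py^2 - px^2,
+ *   rx = (px+py)^2 - ry, rt = 2*pz^2 - rz.
+ */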
+void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "ldr x1, [x29, #48]\n\t"
+ /* Square */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x5, x12, x13\n\t"
+ "umulh x6, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x7, x12, x14\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x8, x12, x15\n\t"
+ "adds x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x8, x8, x25\n\t"
+ "adc x9, x9, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x10, x14, x15\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* Double */
+ "adds x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x4, x12, x12\n\t"
+ "umulh x27, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x5, x5, x27\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x7, x7, x27\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x9, x9, x27\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "stp x4, x5, [x0]\n\t"
+ "stp x6, x7, [x0, #16]\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #56]\n\t"
+ /* Square */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x9, x21, x22\n\t"
+ "umulh x10, x21, x22\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x21, x23\n\t"
+ "umulh x11, x21, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x21, x24\n\t"
+ "umulh x16, x21, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x22, x23\n\t"
+ "umulh x26, x22, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x22, x24\n\t"
+ "umulh x26, x22, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adc x17, x17, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x23, x24\n\t"
+ "umulh x19, x23, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* Double */
+ "adds x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adcs x11, x11, x11\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adcs x17, x17, x17\n\t"
+ "adcs x19, x19, x19\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x8, x21, x21\n\t"
+ "umulh x27, x21, x21\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x22, x22\n\t"
+ "umulh x26, x22, x22\n\t"
+ "adds x9, x9, x27\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x23, x23\n\t"
+ "umulh x26, x23, x23\n\t"
+ "adds x11, x11, x27\n\t"
+ "adcs x16, x16, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x24, x24\n\t"
+ "umulh x26, x24, x24\n\t"
+ "adds x17, x17, x27\n\t"
+ "adcs x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x16\n\t"
+ "adcs x10, x10, x17\n\t"
+ "adcs x11, x11, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ /* Add */
+ "adds x12, x12, x21\n\t"
+ "adcs x13, x13, x22\n\t"
+ "adcs x14, x14, x23\n\t"
+ "adc x15, x15, x24\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x17, x12, x13\n\t"
+ "umulh x19, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x20, x12, x14\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x20, x20, x25\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x20, x20, x25\n\t"
+ "adcs x21, x21, x26\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x21, x21, x25\n\t"
+ "adc x22, x22, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x23, x14, x15\n\t"
+ "adds x22, x22, x25\n\t"
+ "adc x23, x23, xzr\n\t"
+ /* Double */
+ "adds x17, x17, x17\n\t"
+ "adcs x19, x19, x19\n\t"
+ "adcs x20, x20, x20\n\t"
+ "adcs x21, x21, x21\n\t"
+ "adcs x22, x22, x22\n\t"
+ "adcs x23, x23, x23\n\t"
+ "adc x24, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x16, x12, x12\n\t"
+ "umulh x27, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x17, x17, x27\n\t"
+ "adcs x19, x19, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x20, x20, x27\n\t"
+ "adcs x21, x21, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x22, x22, x27\n\t"
+ "adcs x23, x23, x25\n\t"
+ "adc x24, x24, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x24, x24, x23, #63\n\t"
+ "extr x23, x23, x22, #63\n\t"
+ "extr x22, x22, x21, #63\n\t"
+ "extr x21, x21, x20, #63\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x21\n\t"
+ "umulh x21, x25, x21\n\t"
+ "adds x16, x16, x26\n\t"
+ "mul x26, x25, x22\n\t"
+ "umulh x22, x25, x22\n\t"
+ "adcs x17, x17, x26\n\t"
+ "mul x26, x25, x23\n\t"
+ "umulh x23, x25, x23\n\t"
+ "adcs x19, x19, x26\n\t"
+ "mul x26, x25, x24\n\t"
+ "umulh x27, x25, x24\n\t"
+ "adcs x20, x20, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x17, x17, x21\n\t"
+ "adcs x19, x19, x22\n\t"
+ "adcs x20, x20, x23\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x20, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ "adds x16, x16, x27\n\t"
+ "adcs x17, x17, xzr\n\t"
+ "adcs x19, x19, xzr\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x20, asr 63\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ "adds x16, x16, x27\n\t"
+ "adcs x17, x17, xzr\n\t"
+ "adcs x19, x19, xzr\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* Store */
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x21, x8, x4\n\t"
+ "sbcs x22, x9, x5\n\t"
+ "sbcs x23, x10, x6\n\t"
+ "sbcs x24, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x21, x21, x25\n\t"
+ "adcs x22, x22, x28\n\t"
+ "adcs x23, x23, x28\n\t"
+ "adc x24, x24, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x21, x22, [x1]\n\t"
+ "stp x23, x24, [x1, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ /* Sub */
+ "subs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "sbcs x19, x19, x14\n\t"
+ "sbcs x20, x20, x15\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Square * 2 */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x5, x12, x13\n\t"
+ "umulh x6, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x7, x12, x14\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x8, x12, x15\n\t"
+ "adds x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x8, x8, x25\n\t"
+ "adc x9, x9, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x10, x14, x15\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* Double */
+ "adds x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x4, x12, x12\n\t"
+ "umulh x28, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x5, x5, x28\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x28, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x7, x7, x28\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x28, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x9, x9, x28\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Double and Reduce */
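+ /* The doubled square can carry into bit 510; 2^510 = 19*19 = 0x169 (mod p), hence the extra constant */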
+ "mov x25, #0x169\n\t"
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "lsr x28, x11, #61\n\t"
+ "extr x11, x11, x10, #62\n\t"
+ "extr x10, x10, x9, #62\n\t"
+ "extr x9, x9, x8, #62\n\t"
+ "extr x8, x8, x7, #62\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "extr x6, x6, x5, #63\n\t"
+ "extr x5, x5, x4, #63\n\t"
+ "lsl x4, x4, #1\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Two left, only one right */
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top bits by 19*19 */
+ "mul x28, x28, x25\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x28\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #40]\n\t"
+ /* Sub */
+ "subs x4, x4, x21\n\t"
+ "sbcs x5, x5, x22\n\t"
+ "sbcs x6, x6, x23\n\t"
+ "sbcs x7, x7, x24\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x4, x4, x25\n\t"
+ "adcs x5, x5, x28\n\t"
+ "adcs x6, x6, x28\n\t"
+ "adc x7, x7, x26\n\t"
+ "stp x4, x5, [x0]\n\t"
+ "stp x6, x7, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
+ :
+ : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
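+/* Mixed addition (standard ge_madd): add a point in extended (P3) coordinates
+ * to a precomputed point given as (qyplusx, qyminusx, qxy2d) = (Y+X, Y-X,
+ * 2*d*X*Y), producing a completed (P1P1) result. The three q* pointers are
+ * passed on the stack and loaded from hard-coded frame offsets below.
+ */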
+void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #168]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #160]\n\t"
+ "ldr x3, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x3]\n\t"
+ "ldp x23, x24, [x3, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x16, x21\n\t"
+ "umulh x5, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x6, x16, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Double */
+ "ldp x8, x9, [x1]\n\t"
+ "ldp x10, x11, [x1, #16]\n\t"
+ "adds x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, x11, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x8, x4\n\t"
+ "sbcs x17, x9, x5\n\t"
+ "sbcs x19, x10, x6\n\t"
+ "sbcs x20, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
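+ /* The q* parameters are stack-passed arguments (9th and later) that the */
+ /* asm block reads through x29 offsets, so they are referenced here only */
+ /* to silence unused-parameter warnings. */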
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
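+
+#if 0
+/* Illustrative sketch only, kept out of the build: a portable-C rendering of
+ * the "Reduce" step that follows each 4x4-limb multiply in the functions
+ * above and below.  t[0..7] is the 512-bit product (little-endian 64-bit
+ * limbs, operands assumed below 2^255) and r[0..3] receives a value
+ * congruent to it mod p = 2^255 - 19, weakly reduced to 255 bits.  The
+ * helper name and the unsigned __int128 extension are choices made for this
+ * sketch; they are not part of the generated code.
+ */
+static void fe_reduce_p25519_sketch(uint64_t r[4], const uint64_t t[8])
+{
+ const uint64_t mask63 = 0x7fffffffffffffffULL;
+ unsigned __int128 acc;
+ uint64_t hi[4];
+ uint64_t over;
+ int i;
+
+ /* hi = product >> 255 (the extr/and sequence in the assembly). */
+ hi[0] = (t[4] << 1) | (t[3] >> 63);
+ hi[1] = (t[5] << 1) | (t[4] >> 63);
+ hi[2] = (t[6] << 1) | (t[5] >> 63);
+ hi[3] = (t[7] << 1) | (t[6] >> 63);
+ /* 2^255 = p + 19, so product = lo + hi*2^255 == lo + 19*hi (mod p). */
+ acc = (unsigned __int128)t[0] + (unsigned __int128)19 * hi[0];
+ r[0] = (uint64_t)acc; acc >>= 64;
+ acc += (unsigned __int128)t[1] + (unsigned __int128)19 * hi[1];
+ r[1] = (uint64_t)acc; acc >>= 64;
+ acc += (unsigned __int128)t[2] + (unsigned __int128)19 * hi[2];
+ r[2] = (uint64_t)acc; acc >>= 64;
+ acc += (unsigned __int128)(t[3] & mask63) + (unsigned __int128)19 * hi[3];
+ r[3] = (uint64_t)acc; acc >>= 64;
+ /* Two small folds mirror the "Overflow" and "Reduce if top bit set" */
+ /* steps: anything at or above bit 255 becomes a *19 add-back. */
+ for (i = 0; i < 2; i++) {
+  over = ((uint64_t)acc << 1) | (r[3] >> 63);
+  r[3] &= mask63;
+  acc = (unsigned __int128)r[0] + (unsigned __int128)19 * over;
+  r[0] = (uint64_t)acc;
+  acc = (acc >> 64) + r[1]; r[1] = (uint64_t)acc;
+  acc = (acc >> 64) + r[2]; r[2] = (uint64_t)acc;
+  acc = (acc >> 64) + r[3]; r[3] = (uint64_t)acc;
+  acc >>= 64;
+ }
+}
+#endif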
+
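+/* Field-level core of a mixed point subtraction (projective point P minus a
+ * precomputed point Q), producing the P1P1 coordinates rx, ry, rz, rt.  A
+ * hedged reading of the assembly below gives the sequence
+ *   A = (py + px) * qyminusx
+ *   B = (py - px) * qyplusx
+ *   rx = A - B, ry = A + B
+ *   D = qxy2d * pt, C = 2 * pz
+ *   rz = C - D, rt = C + D
+ * all mod p = 2^255 - 19; qxy2d, qyplusx and qyminusx are stack-passed
+ * arguments read inside the asm block.
+ */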
+void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #160]\n\t"
+ "ldr x3, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x3]\n\t"
+ "ldp x23, x24, [x3, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x16, x21\n\t"
+ "umulh x5, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x6, x16, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Double */
+ "ldp x8, x9, [x1]\n\t"
+ "ldp x10, x11, [x1, #16]\n\t"
+ "adds x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, x11, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x8, x4\n\t"
+ "sbcs x17, x9, x5\n\t"
+ "sbcs x19, x10, x6\n\t"
+ "sbcs x20, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x1]\n\t"
+ "stp x14, x15, [x1, #16]\n\t"
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
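+/* Field-level core of point addition with a cached point (qz, qt2d, qyplusx,
+ * qyminusx), producing the P1P1 coordinates rx, ry, rz, rt.  A hedged
+ * reading of the assembly below gives the sequence
+ *   A = (py + px) * qyplusx
+ *   B = (py - px) * qyminusx
+ *   rx = A - B, ry = A + B
+ *   C = 2 * pz * qz, D = qt2d * pt
+ *   rz = C + D, rt = C - D
+ * all mod p = 2^255 - 19; the q* values are stack-passed arguments read
+ * inside the asm block.
+ */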
+void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #48]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ "ldr x2, [x29, #160]\n\t"
+ /* Multiply */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ "ldp x16, x17, [x2]\n\t"
+ "ldp x19, x20, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x12, x16\n\t"
+ "umulh x5, x12, x16\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x12, x17\n\t"
+ "umulh x6, x12, x17\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x13, x16\n\t"
+ "umulh x26, x13, x16\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x12, x19\n\t"
+ "umulh x26, x12, x19\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x13, x17\n\t"
+ "umulh x26, x13, x17\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x14, x16\n\t"
+ "umulh x26, x14, x16\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x12, x20\n\t"
+ "umulh x26, x12, x20\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x13, x19\n\t"
+ "umulh x26, x13, x19\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x14, x17\n\t"
+ "umulh x26, x14, x17\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x15, x16\n\t"
+ "umulh x26, x15, x16\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x13, x20\n\t"
+ "umulh x26, x13, x20\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x14, x19\n\t"
+ "umulh x26, x14, x19\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x15, x17\n\t"
+ "umulh x26, x15, x17\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x14, x20\n\t"
+ "umulh x26, x14, x20\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x15, x19\n\t"
+ "umulh x26, x15, x19\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x15, x20\n\t"
+ "umulh x26, x15, x20\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #48]\n\t"
+ /* Double */
+ "adds x4, x4, x4\n\t"
+ "adcs x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adc x7, x7, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ "ldr x2, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x8, x16, x21\n\t"
+ "umulh x9, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x10, x16, x22\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x12\n\t"
+ "umulh x12, x25, x12\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x13\n\t"
+ "umulh x13, x25, x13\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x14\n\t"
+ "umulh x14, x25, x14\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x15\n\t"
+ "umulh x27, x25, x15\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x12\n\t"
+ "adcs x10, x10, x13\n\t"
+ "adcs x11, x11, x14\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x4, x8\n\t"
+ "adcs x13, x5, x9\n\t"
+ "adcs x14, x6, x10\n\t"
+ "adc x15, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x4, x8\n\t"
+ "sbcs x17, x5, x9\n\t"
+ "sbcs x19, x6, x10\n\t"
+ "sbcs x20, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
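+/* Field-level core of point subtraction with a cached point.  The assembly
+ * below mirrors fe_ge_add with qyplusx and qyminusx exchanged:
+ *   A = (py + px) * qyminusx
+ *   B = (py - px) * qyplusx
+ *   rx = A - B, ry = A + B
+ *   C = 2 * pz * qz, D = qt2d * pt
+ * and, by analogy with fe_ge_msub, rz = C - D and rt = C + D, all mod
+ * p = 2^255 - 19.
+ */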
+void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #48]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ "ldr x2, [x29, #160]\n\t"
+ /* Multiply */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ "ldp x16, x17, [x2]\n\t"
+ "ldp x19, x20, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x12, x16\n\t"
+ "umulh x5, x12, x16\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x12, x17\n\t"
+ "umulh x6, x12, x17\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x13, x16\n\t"
+ "umulh x26, x13, x16\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x12, x19\n\t"
+ "umulh x26, x12, x19\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x13, x17\n\t"
+ "umulh x26, x13, x17\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x14, x16\n\t"
+ "umulh x26, x14, x16\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x12, x20\n\t"
+ "umulh x26, x12, x20\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x13, x19\n\t"
+ "umulh x26, x13, x19\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x14, x17\n\t"
+ "umulh x26, x14, x17\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x15, x16\n\t"
+ "umulh x26, x15, x16\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x13, x20\n\t"
+ "umulh x26, x13, x20\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x14, x19\n\t"
+ "umulh x26, x14, x19\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x15, x17\n\t"
+ "umulh x26, x15, x17\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x14, x20\n\t"
+ "umulh x26, x14, x20\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x15, x19\n\t"
+ "umulh x26, x15, x19\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x15, x20\n\t"
+ "umulh x26, x15, x20\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
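+ /* p = 2^255 - 19, so 2^255 == 19 (mod p): bits 255 and above fold back in multiplied by 19 */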
+ /* Move top half of the product into x8-x11 and remove the top bit from x7 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
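+ /* Combine the carry word with the top bit of x7 to get everything above bit 255, then fold it in as overflow * 19 */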
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
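+ /* The fold above may set bit 255 again: clear it and add 19 one more time */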
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #48]\n\t"
+ /* Double */
+ "adds x4, x4, x4\n\t"
+ "adcs x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adc x7, x7, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ "ldr x2, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x8, x16, x21\n\t"
+ "umulh x9, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x10, x16, x22\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* Reduce */
+ /* Move top half of the product into x12-x15 and remove the top bit from x11 */
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x12\n\t"
+ "umulh x12, x25, x12\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x13\n\t"
+ "umulh x13, x25, x13\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x14\n\t"
+ "umulh x14, x25, x14\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x15\n\t"
+ "umulh x27, x25, x15\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x12\n\t"
+ "adcs x10, x10, x13\n\t"
+ "adcs x11, x11, x14\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ /* Add */
+ "adds x12, x4, x8\n\t"
+ "adcs x13, x5, x9\n\t"
+ "adcs x14, x6, x10\n\t"
+ "adc x15, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x4, x8\n\t"
+ "sbcs x17, x5, x9\n\t"
+ "sbcs x19, x6, x10\n\t"
+ "sbcs x20, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
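+ /* The remaining point coordinates are read through frame-pointer offsets inside the asm rather than as named operands; the casts below only silence unused-parameter warnings */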
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
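+
+/* Editor's sketch (illustrative only, not generated and not called anywhere):
+ * a plain-C version of the "Reduce if top bit set" step performed after each
+ * multiplication above. Since p = 2^255 - 19, 2^255 == 19 (mod p), so
+ * clearing bit 255 of the top limb and adding 19 when that bit was set keeps
+ * the value congruent mod p. The 4 x 64-bit little-endian limb layout matches
+ * the assembly; the function name is hypothetical. */
+static void fe_reduce_top_bit_sketch(uint64_t r[4])
+{
+ /* Arithmetic shift of the top limb gives an all-ones or all-zeros mask,
+ * so c is 19 when bit 255 is set and 0 otherwise. */
+ uint64_t c = (uint64_t)19 & (uint64_t)((int64_t)r[3] >> 63);
+
+ r[3] &= 0x7fffffffffffffffULL; /* clear bit 255 */
+ r[0] += c; c = (r[0] < c); /* add 19 and propagate the carry */
+ r[1] += c; c = (r[1] < c);
+ r[2] += c; c = (r[2] < c);
+ r[3] += c; /* cannot overflow: r[3] < 2^63 here */
+}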
+
+#endif /* WOLFSSL_ARMASM */
+#endif /* __aarch64__ */