Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S')
-rw-r--r--  client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S  6715
1 files changed, 6715 insertions, 0 deletions
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
new file mode 100644
index 0000000..891c6d8
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -0,0 +1,6715 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
+ */
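+/* The field element routines below work on four 64-bit little-endian
+ * limbs modulo p = 2^255 - 19.  Intermediate values are kept below
+ * 2^255 but are not fully reduced until encoded.
+ */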
+#ifdef __aarch64__
+ .text
+ .align 2
+ .globl fe_init
+ .type fe_init, %function
+fe_init:
+ ret
+ .size fe_init,.-fe_init
+ .text
+ .align 2
+ .globl fe_frombytes
+ .type fe_frombytes, %function
+fe_frombytes:
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ and x5, x5, #0x7fffffffffffffff
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_frombytes,.-fe_frombytes
+ .text
+ .align 2
+ .globl fe_tobytes
+ .type fe_tobytes, %function
+fe_tobytes:
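+ # Bit 255 of a + 19 is set exactly when a >= p, so 19 is
+ # conditionally added and bit 255 cleared, giving the canonical
+ # value a mod p.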
+ mov x7, #19
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ adds x6, x2, x7
+ adcs x6, x3, xzr
+ adcs x6, x4, xzr
+ adc x6, x5, xzr
+ and x6, x7, x6, asr 63
+ adds x2, x2, x6
+ adcs x3, x3, xzr
+ adcs x4, x4, xzr
+ adc x5, x5, xzr
+ and x5, x5, #0x7fffffffffffffff
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_tobytes,.-fe_tobytes
+ .text
+ .align 2
+ .globl fe_1
+ .type fe_1, %function
+fe_1:
+ # Set one
+ mov x1, #1
+ stp x1, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ ret
+ .size fe_1,.-fe_1
+ .text
+ .align 2
+ .globl fe_0
+ .type fe_0, %function
+fe_0:
+ # Set zero
+ stp xzr, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ ret
+ .size fe_0,.-fe_0
+ .text
+ .align 2
+ .globl fe_copy
+ .type fe_copy, %function
+fe_copy:
+ # Copy
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_copy,.-fe_copy
+ .text
+ .align 2
+ .globl fe_sub
+ .type fe_sub, %function
+fe_sub:
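+ # If a - b borrows, csetm produces an all-ones mask and the masked
+ # modulus p = 2^255 - 19 is added back in.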
+ # Sub
+ ldp x3, x4, [x1]
+ ldp x5, x6, [x1, #16]
+ ldp x7, x8, [x2]
+ ldp x9, x10, [x2, #16]
+ subs x3, x3, x7
+ sbcs x4, x4, x8
+ sbcs x5, x5, x9
+ sbcs x6, x6, x10
+ mov x12, #-19
+ csetm x11, cc
+ # Mask the modulus
+ and x12, x11, x12
+ and x13, x11, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x3, x3, x12
+ adcs x4, x4, x11
+ adcs x5, x5, x11
+ adc x6, x6, x13
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ret
+ .size fe_sub,.-fe_sub
+ .text
+ .align 2
+ .globl fe_add
+ .type fe_add, %function
+fe_add:
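+ # If the sum reaches 2^255 (bit 255 set), p is subtracted so the
+ # result stays below 2^255.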
+ # Add
+ ldp x3, x4, [x1]
+ ldp x5, x6, [x1, #16]
+ ldp x7, x8, [x2]
+ ldp x9, x10, [x2, #16]
+ adds x3, x3, x7
+ adcs x4, x4, x8
+ adcs x5, x5, x9
+ adc x6, x6, x10
+ mov x12, #-19
+ asr x11, x6, #63
+ # Mask the modulus
+ and x12, x11, x12
+ and x13, x11, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x3, x3, x12
+ sbcs x4, x4, x11
+ sbcs x5, x5, x11
+ sbc x6, x6, x13
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ret
+ .size fe_add,.-fe_add
+ .text
+ .align 2
+ .globl fe_neg
+ .type fe_neg, %function
+fe_neg:
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ mov x6, #-19
+ mov x7, #-1
+ mov x8, #-1
+ mov x9, #0x7fffffffffffffff
+ subs x6, x6, x2
+ sbcs x7, x7, x3
+ sbcs x8, x8, x4
+ sbc x9, x9, x5
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ ret
+ .size fe_neg,.-fe_neg
+ .text
+ .align 2
+ .globl fe_isnonzero
+ .type fe_isnonzero, %function
+fe_isnonzero:
+ mov x6, #19
+ ldp x1, x2, [x0]
+ ldp x3, x4, [x0, #16]
+ adds x5, x1, x6
+ adcs x5, x2, xzr
+ adcs x5, x3, xzr
+ adc x5, x4, xzr
+ and x5, x6, x5, asr 63
+ adds x1, x1, x5
+ adcs x2, x2, xzr
+ adcs x3, x3, xzr
+ adc x4, x4, xzr
+ and x4, x4, #0x7fffffffffffffff
+ orr x0, x1, x2
+ orr x3, x3, x4
+ orr x0, x0, x3
+ ret
+ .size fe_isnonzero,.-fe_isnonzero
+ .text
+ .align 2
+ .globl fe_isnegative
+ .type fe_isnegative, %function
+fe_isnegative:
+ mov x6, #19
+ ldp x1, x2, [x0]
+ ldp x3, x4, [x0, #16]
+ adds x5, x1, x6
+ adcs x5, x2, xzr
+ adcs x5, x3, xzr
+ adc x5, x4, xzr
+ and x0, x1, #1
+ eor x0, x0, x5, lsr 63
+ ret
+ .size fe_isnegative,.-fe_isnegative
+ .text
+ .align 2
+ .globl fe_cmov_table
+ .type fe_cmov_table, %function
+fe_cmov_table:
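+ # Constant-time lookup: all eight 96-byte table entries are loaded
+ # and csel keeps the entry matching |b| (falling back to 1, 1, 0 for
+ # b = 0); for negative b the first two field elements are swapped and
+ # the third negated.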
+ stp x29, x30, [sp, #-128]!
+ add x29, sp, #0
+ str x17, [x29, #40]
+ str x19, [x29, #48]
+ stp x20, x21, [x29, #56]
+ stp x22, x23, [x29, #72]
+ stp x24, x25, [x29, #88]
+ stp x26, x27, [x29, #104]
+ str x28, [x29, #120]
+ str x0, [x29, #16]
+ sxtb x2, w2
+ sbfx x3, x2, #7, #1
+ eor x0, x2, x3
+ sub x0, x0, x3
+ mov x4, #1
+ mov x5, xzr
+ mov x6, xzr
+ mov x7, xzr
+ mov x8, #1
+ mov x9, xzr
+ mov x10, xzr
+ mov x11, xzr
+ mov x12, xzr
+ mov x13, xzr
+ mov x14, xzr
+ mov x15, xzr
+ cmp x0, #1
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x1, #32]
+ ldp x23, x24, [x1, #48]
+ ldp x25, x26, [x1, #64]
+ ldp x27, x28, [x1, #80]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #2
+ ldp x16, x17, [x1, #96]
+ ldp x19, x20, [x1, #112]
+ ldp x21, x22, [x1, #128]
+ ldp x23, x24, [x1, #144]
+ ldp x25, x26, [x1, #160]
+ ldp x27, x28, [x1, #176]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #3
+ ldp x16, x17, [x1, #192]
+ ldp x19, x20, [x1, #208]
+ ldp x21, x22, [x1, #224]
+ ldp x23, x24, [x1, #240]
+ ldp x25, x26, [x1, #256]
+ ldp x27, x28, [x1, #272]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #4
+ ldp x16, x17, [x1, #288]
+ ldp x19, x20, [x1, #304]
+ ldp x21, x22, [x1, #320]
+ ldp x23, x24, [x1, #336]
+ ldp x25, x26, [x1, #352]
+ ldp x27, x28, [x1, #368]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ add x1, x1, #0x180
+ cmp x0, #5
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x1, #32]
+ ldp x23, x24, [x1, #48]
+ ldp x25, x26, [x1, #64]
+ ldp x27, x28, [x1, #80]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #6
+ ldp x16, x17, [x1, #96]
+ ldp x19, x20, [x1, #112]
+ ldp x21, x22, [x1, #128]
+ ldp x23, x24, [x1, #144]
+ ldp x25, x26, [x1, #160]
+ ldp x27, x28, [x1, #176]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #7
+ ldp x16, x17, [x1, #192]
+ ldp x19, x20, [x1, #208]
+ ldp x21, x22, [x1, #224]
+ ldp x23, x24, [x1, #240]
+ ldp x25, x26, [x1, #256]
+ ldp x27, x28, [x1, #272]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #8
+ ldp x16, x17, [x1, #288]
+ ldp x19, x20, [x1, #304]
+ ldp x21, x22, [x1, #320]
+ ldp x23, x24, [x1, #336]
+ ldp x25, x26, [x1, #352]
+ ldp x27, x28, [x1, #368]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ mov x16, #-19
+ mov x17, #-1
+ mov x19, #-1
+ mov x20, #0x7fffffffffffffff
+ subs x16, x16, x12
+ sbcs x17, x17, x13
+ sbcs x19, x19, x14
+ sbc x20, x20, x15
+ cmp x2, #0
+ mov x3, x4
+ csel x4, x8, x4, lt
+ csel x8, x3, x8, lt
+ mov x3, x5
+ csel x5, x9, x5, lt
+ csel x9, x3, x9, lt
+ mov x3, x6
+ csel x6, x10, x6, lt
+ csel x10, x3, x10, lt
+ mov x3, x7
+ csel x7, x11, x7, lt
+ csel x11, x3, x11, lt
+ csel x12, x16, x12, lt
+ csel x13, x17, x13, lt
+ csel x14, x19, x14, lt
+ csel x15, x20, x15, lt
+ ldr x0, [x29, #16]
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ stp x8, x9, [x0, #32]
+ stp x10, x11, [x0, #48]
+ stp x12, x13, [x0, #64]
+ stp x14, x15, [x0, #80]
+ ldr x17, [x29, #40]
+ ldr x19, [x29, #48]
+ ldp x20, x21, [x29, #56]
+ ldp x22, x23, [x29, #72]
+ ldp x24, x25, [x29, #88]
+ ldp x26, x27, [x29, #104]
+ ldr x28, [x29, #120]
+ ldp x29, x30, [sp], #0x80
+ ret
+ .size fe_cmov_table,.-fe_cmov_table
+ .text
+ .align 2
+ .globl fe_mul
+ .type fe_mul, %function
+fe_mul:
+ stp x29, x30, [sp, #-64]!
+ add x29, sp, #0
+ str x17, [x29, #24]
+ str x19, [x29, #32]
+ stp x20, x21, [x29, #40]
+ str x22, [x29, #56]
+ # Multiply
+ ldp x14, x15, [x1]
+ ldp x16, x17, [x1, #16]
+ ldp x19, x20, [x2]
+ ldp x21, x22, [x2, #16]
+ # A[0] * B[0]
+ mul x6, x14, x19
+ umulh x7, x14, x19
+ # A[0] * B[1]
+ mul x3, x14, x20
+ umulh x8, x14, x20
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x19
+ umulh x4, x15, x19
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x21
+ umulh x4, x14, x21
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x20
+ umulh x4, x15, x20
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x10, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x19
+ umulh x4, x16, x19
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x10, x10, xzr
+ # A[0] * B[3]
+ mul x3, x14, x22
+ umulh x4, x14, x22
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x21
+ umulh x4, x15, x21
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[2] * B[1]
+ mul x3, x16, x20
+ umulh x4, x16, x20
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[3] * B[0]
+ mul x3, x17, x19
+ umulh x4, x17, x19
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[1] * B[3]
+ mul x3, x15, x22
+ umulh x4, x15, x22
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x21
+ umulh x4, x16, x21
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, x12, xzr
+ # A[3] * B[1]
+ mul x3, x17, x20
+ umulh x4, x17, x20
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, x12, xzr
+ # A[2] * B[3]
+ mul x3, x16, x22
+ umulh x4, x16, x22
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x21
+ umulh x4, x17, x21
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, x13, xzr
+ # A[3] * B[3]
+ mul x3, x17, x22
+ umulh x4, x17, x22
+ adds x12, x12, x3
+ adc x13, x13, x4
+ # Reduce
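+ # 2^255 = 19 (mod p), so the bits of the product above bit 254 are
+ # multiplied by 19 and folded back into the low 255 bits.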
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x10
+ umulh x10, x3, x10
+ adds x6, x6, x4
+ mul x4, x3, x11
+ umulh x11, x3, x11
+ adcs x7, x7, x4
+ mul x4, x3, x12
+ umulh x12, x3, x12
+ adcs x8, x8, x4
+ mul x4, x3, x13
+ umulh x5, x3, x13
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x10
+ adcs x8, x8, x11
+ adcs x9, x9, x12
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ ldr x17, [x29, #24]
+ ldr x19, [x29, #32]
+ ldp x20, x21, [x29, #40]
+ ldr x22, [x29, #56]
+ ldp x29, x30, [sp], #0x40
+ ret
+ .size fe_mul,.-fe_mul
+ .text
+ .align 2
+ .globl fe_sq
+ .type fe_sq, %function
+fe_sq:
+ # Square
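+ # Off-diagonal products A[i]*A[j] (i < j) are computed once and
+ # doubled, then the diagonal squares A[i]*A[i] are added in before
+ # the same reduction as fe_mul.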
+ ldp x13, x14, [x1]
+ ldp x15, x16, [x1, #16]
+ # A[0] * A[1]
+ mul x6, x13, x14
+ umulh x7, x13, x14
+ # A[0] * A[2]
+ mul x2, x13, x15
+ umulh x8, x13, x15
+ adds x7, x7, x2
+ adc x8, x8, xzr
+ # A[0] * A[3]
+ mul x2, x13, x16
+ umulh x9, x13, x16
+ adds x8, x8, x2
+ adc x9, x9, xzr
+ # A[1] * A[2]
+ mul x2, x14, x15
+ umulh x3, x14, x15
+ adds x8, x8, x2
+ adcs x9, x9, x3
+ adc x10, xzr, xzr
+ # A[1] * A[3]
+ mul x2, x14, x16
+ umulh x3, x14, x16
+ adds x9, x9, x2
+ adc x10, x10, x3
+ # A[2] * A[3]
+ mul x2, x15, x16
+ umulh x11, x15, x16
+ adds x10, x10, x2
+ adc x11, x11, xzr
+ # Double
+ adds x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adcs x11, x11, x11
+ adc x12, xzr, xzr
+ # A[0] * A[0]
+ mul x5, x13, x13
+ umulh x4, x13, x13
+ # A[1] * A[1]
+ mul x2, x14, x14
+ umulh x3, x14, x14
+ adds x6, x6, x4
+ adcs x7, x7, x2
+ adc x4, x3, xzr
+ # A[2] * A[2]
+ mul x2, x15, x15
+ umulh x3, x15, x15
+ adds x8, x8, x4
+ adcs x9, x9, x2
+ adc x4, x3, xzr
+ # A[3] * A[3]
+ mul x2, x16, x16
+ umulh x3, x16, x16
+ adds x10, x10, x4
+ adcs x11, x11, x2
+ adc x12, x12, x3
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x12, x12, x11, #63
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ and x8, x8, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x2, #19
+ mul x3, x2, x9
+ umulh x9, x2, x9
+ adds x5, x5, x3
+ mul x3, x2, x10
+ umulh x10, x2, x10
+ adcs x6, x6, x3
+ mul x3, x2, x11
+ umulh x11, x2, x11
+ adcs x7, x7, x3
+ mul x3, x2, x12
+ umulh x4, x2, x12
+ adcs x8, x8, x3
+ adc x4, x4, xzr
+ # Add remaining product results in
+ adds x6, x6, x9
+ adcs x7, x7, x10
+ adcs x8, x8, x11
+ adc x4, x4, xzr
+ # Overflow
+ extr x4, x4, x8, #63
+ mul x4, x4, x2
+ and x8, x8, #0x7fffffffffffffff
+ adds x5, x5, x4
+ adcs x6, x6, xzr
+ adcs x7, x7, xzr
+ adc x8, x8, xzr
+ # Reduce if top bit set
+ and x4, x2, x8, asr 63
+ and x8, x8, #0x7fffffffffffffff
+ adds x5, x5, x4
+ adcs x6, x6, xzr
+ adcs x7, x7, xzr
+ adc x8, x8, xzr
+ # Store
+ stp x5, x6, [x0]
+ stp x7, x8, [x0, #16]
+ ret
+ .size fe_sq,.-fe_sq
+ .text
+ .align 2
+ .globl fe_invert
+ .type fe_invert, %function
+fe_invert:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x20, [x29, #168]
+ # Invert
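+ # Computes z^(p-2) = z^(2^255 - 21) with a fixed square-and-multiply
+ # chain (Fermat's little theorem), so the sequence of operations does
+ # not depend on the value being inverted.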
+ str x0, [x29, #144]
+ str x1, [x29, #152]
+ add x0, x29, #16
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x1, x29, #48
+ bl fe_sq
+ ldr x1, [x29, #152]
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #16
+ add x1, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #48
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x20, #4
+ add x1, x29, #0x50
+L_fe_invert1:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert1
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ mov x20, #9
+ add x1, x29, #0x50
+L_fe_invert2:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert2
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x20, #19
+ add x1, x29, #0x70
+L_fe_invert3:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert3
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x20, #10
+ add x1, x29, #0x50
+L_fe_invert4:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert4
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ mov x20, #49
+ add x1, x29, #0x50
+L_fe_invert5:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert5
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x20, #0x63
+ add x1, x29, #0x70
+L_fe_invert6:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert6
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x20, #50
+ add x1, x29, #0x50
+L_fe_invert7:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert7
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x20, #5
+ add x1, x29, #48
+L_fe_invert8:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert8
+ ldr x0, [x29, #144]
+ add x2, x29, #16
+ bl fe_mul
+ ldr x20, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_invert,.-fe_invert
+ .text
+ .align 2
+ .globl curve25519
+ .type curve25519, %function
+curve25519:
+ stp x29, x30, [sp, #-288]!
+ add x29, sp, #0
+ str x17, [x29, #200]
+ str x19, [x29, #208]
+ stp x20, x21, [x29, #216]
+ stp x22, x23, [x29, #232]
+ stp x24, x25, [x29, #248]
+ stp x26, x27, [x29, #264]
+ str x28, [x29, #280]
+ mov x23, xzr
+ str x0, [x29, #176]
+ str x2, [x29, #184]
+ # Copy
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x29, #80]
+ stp x8, x9, [x29, #96]
+ # Set one
+ mov x2, #1
+ stp x2, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ # Set zero
+ stp xzr, xzr, [x29, #16]
+ stp xzr, xzr, [x29, #32]
+ # Set one
+ mov x2, #1
+ stp x2, xzr, [x29, #48]
+ stp xzr, xzr, [x29, #64]
+ mov x25, #62
+ mov x24, #24
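+ # Montgomery ladder over scalar bits 254 down to 0: x24 is the byte
+ # offset of the current scalar word, x25 the bit index within it, and
+ # x23 the previous bit, so each conditional swap fires only when
+ # consecutive bits differ.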
+L_curve25519_words:
+L_curve25519_bits:
+ ldr x2, [x1, x24]
+ lsr x2, x2, x25
+ and x2, x2, #1
+ eor x23, x23, x2
+ # Conditional Swap
+ cmp x23, #1
+ ldp x10, x11, [x0]
+ ldp x12, x13, [x0, #16]
+ ldp x6, x7, [x29, #80]
+ ldp x8, x9, [x29, #96]
+ csel x14, x10, x6, eq
+ csel x10, x6, x10, eq
+ csel x15, x11, x7, eq
+ csel x11, x7, x11, eq
+ csel x16, x12, x8, eq
+ csel x12, x8, x12, eq
+ csel x17, x13, x9, eq
+ csel x13, x9, x13, eq
+ # Conditional Swap
+ cmp x23, #1
+ ldp x19, x20, [x29, #16]
+ ldp x21, x22, [x29, #32]
+ ldp x6, x7, [x29, #48]
+ ldp x8, x9, [x29, #64]
+ csel x5, x19, x6, eq
+ csel x19, x6, x19, eq
+ csel x26, x20, x7, eq
+ csel x20, x7, x20, eq
+ csel x27, x21, x8, eq
+ csel x21, x8, x21, eq
+ csel x28, x22, x9, eq
+ csel x22, x9, x22, eq
+ mov x23, x2
+ # Add
+ adds x6, x10, x19
+ adcs x7, x11, x20
+ adcs x8, x12, x21
+ adc x9, x13, x22
+ mov x3, #-19
+ asr x2, x9, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x6, x6, x3
+ sbcs x7, x7, x2
+ sbcs x8, x8, x2
+ sbc x9, x9, x4
+ # Sub
+ subs x19, x10, x19
+ sbcs x20, x11, x20
+ sbcs x21, x12, x21
+ sbcs x22, x13, x22
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x19, x19, x3
+ adcs x20, x20, x2
+ adcs x21, x21, x2
+ adc x22, x22, x4
+ stp x19, x20, [x29, #144]
+ stp x21, x22, [x29, #160]
+ # Add
+ adds x10, x14, x5
+ adcs x11, x15, x26
+ adcs x12, x16, x27
+ adc x13, x17, x28
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Sub
+ subs x14, x14, x5
+ sbcs x15, x15, x26
+ sbcs x16, x16, x27
+ sbcs x17, x17, x28
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x14, x14, x3
+ adcs x15, x15, x2
+ adcs x16, x16, x2
+ adc x17, x17, x4
+ # Multiply
+ # A[0] * B[0]
+ mul x19, x14, x6
+ umulh x20, x14, x6
+ # A[0] * B[1]
+ mul x3, x14, x7
+ umulh x21, x14, x7
+ adds x20, x20, x3
+ adc x21, x21, xzr
+ # A[1] * B[0]
+ mul x3, x15, x6
+ umulh x4, x15, x6
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x8
+ umulh x4, x14, x8
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # A[1] * B[1]
+ mul x3, x15, x7
+ umulh x4, x15, x7
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x6
+ umulh x4, x16, x6
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x9
+ umulh x4, x14, x9
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x8
+ umulh x4, x15, x8
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x7
+ umulh x4, x16, x7
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x6
+ umulh x4, x17, x6
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x9
+ umulh x4, x15, x9
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x8
+ umulh x4, x16, x8
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x7
+ umulh x4, x17, x7
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x9
+ umulh x4, x16, x9
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x8
+ umulh x4, x17, x8
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x9
+ umulh x4, x17, x9
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x22, #63
+ and x22, x22, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x19, x19, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x20, x20, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x21, x21, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x22, x22, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x20, x20, x2
+ adcs x21, x21, x26
+ adcs x22, x22, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x22, #63
+ mul x5, x5, x3
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Reduce if top bit set
+ and x5, x3, x22, asr 63
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Store
+ stp x19, x20, [x29, #112]
+ stp x21, x22, [x29, #128]
+ # Multiply
+ ldp x2, x26, [x29, #144]
+ ldp x27, x28, [x29, #160]
+ # A[0] * B[0]
+ mul x19, x10, x2
+ umulh x20, x10, x2
+ # A[0] * B[1]
+ mul x3, x10, x26
+ umulh x21, x10, x26
+ adds x20, x20, x3
+ adc x21, x21, xzr
+ # A[1] * B[0]
+ mul x3, x11, x2
+ umulh x4, x11, x2
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x10, x27
+ umulh x4, x10, x27
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # A[1] * B[1]
+ mul x3, x11, x26
+ umulh x4, x11, x26
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x14, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x12, x2
+ umulh x4, x12, x2
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x14, x14, xzr
+ # A[0] * B[3]
+ mul x3, x10, x28
+ umulh x4, x10, x28
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x11, x27
+ umulh x4, x11, x27
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[2] * B[1]
+ mul x3, x12, x26
+ umulh x4, x12, x26
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[3] * B[0]
+ mul x3, x13, x2
+ umulh x4, x13, x2
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[1] * B[3]
+ mul x3, x11, x28
+ umulh x4, x11, x28
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x12, x27
+ umulh x4, x12, x27
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, x16, xzr
+ # A[3] * B[1]
+ mul x3, x13, x26
+ umulh x4, x13, x26
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, x16, xzr
+ # A[2] * B[3]
+ mul x3, x12, x28
+ umulh x4, x12, x28
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x13, x27
+ umulh x4, x13, x27
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, x17, xzr
+ # A[3] * B[3]
+ mul x3, x13, x28
+ umulh x4, x13, x28
+ adds x16, x16, x3
+ adc x17, x17, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ extr x15, x15, x14, #63
+ extr x14, x14, x22, #63
+ and x22, x22, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x14
+ umulh x14, x3, x14
+ adds x19, x19, x4
+ mul x4, x3, x15
+ umulh x15, x3, x15
+ adcs x20, x20, x4
+ mul x4, x3, x16
+ umulh x16, x3, x16
+ adcs x21, x21, x4
+ mul x4, x3, x17
+ umulh x5, x3, x17
+ adcs x22, x22, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x20, x20, x14
+ adcs x21, x21, x15
+ adcs x22, x22, x16
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x22, #63
+ mul x5, x5, x3
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Reduce if top bit set
+ and x5, x3, x22, asr 63
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Store
+ # Square
+ # A[0] * A[1]
+ mul x11, x2, x26
+ umulh x12, x2, x26
+ # A[0] * A[2]
+ mul x3, x2, x27
+ umulh x13, x2, x27
+ adds x12, x12, x3
+ adc x13, x13, xzr
+ # A[0] * A[3]
+ mul x3, x2, x28
+ umulh x14, x2, x28
+ adds x13, x13, x3
+ adc x14, x14, xzr
+ # A[1] * A[2]
+ mul x3, x26, x27
+ umulh x4, x26, x27
+ adds x13, x13, x3
+ adcs x14, x14, x4
+ adc x15, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x26, x28
+ umulh x4, x26, x28
+ adds x14, x14, x3
+ adc x15, x15, x4
+ # A[2] * A[3]
+ mul x3, x27, x28
+ umulh x16, x27, x28
+ adds x15, x15, x3
+ adc x16, x16, xzr
+ # Double
+ adds x11, x11, x11
+ adcs x12, x12, x12
+ adcs x13, x13, x13
+ adcs x14, x14, x14
+ adcs x15, x15, x15
+ adcs x16, x16, x16
+ adc x17, xzr, xzr
+ # A[0] * A[0]
+ mul x10, x2, x2
+ umulh x5, x2, x2
+ # A[1] * A[1]
+ mul x3, x26, x26
+ umulh x4, x26, x26
+ adds x11, x11, x5
+ adcs x12, x12, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x27, x27
+ umulh x4, x27, x27
+ adds x13, x13, x5
+ adcs x14, x14, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x28, x28
+ umulh x4, x28, x28
+ adds x15, x15, x5
+ adcs x16, x16, x3
+ adc x17, x17, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ and x13, x13, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x14
+ umulh x14, x3, x14
+ adds x10, x10, x4
+ mul x4, x3, x15
+ umulh x15, x3, x15
+ adcs x11, x11, x4
+ mul x4, x3, x16
+ umulh x16, x3, x16
+ adcs x12, x12, x4
+ mul x4, x3, x17
+ umulh x5, x3, x17
+ adcs x13, x13, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x11, x11, x14
+ adcs x12, x12, x15
+ adcs x13, x13, x16
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x13, #63
+ mul x5, x5, x3
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Reduce if top bit set
+ and x5, x3, x13, asr 63
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Store
+ # Square
+ # A[0] * A[1]
+ mul x15, x6, x7
+ umulh x16, x6, x7
+ # A[0] * A[2]
+ mul x3, x6, x8
+ umulh x17, x6, x8
+ adds x16, x16, x3
+ adc x17, x17, xzr
+ # A[0] * A[3]
+ mul x3, x6, x9
+ umulh x2, x6, x9
+ adds x17, x17, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x7, x8
+ umulh x4, x7, x8
+ adds x17, x17, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x7, x9
+ umulh x4, x7, x9
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x8, x9
+ umulh x27, x8, x9
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x15, x15, x15
+ adcs x16, x16, x16
+ adcs x17, x17, x17
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x14, x6, x6
+ umulh x5, x6, x6
+ # A[1] * A[1]
+ mul x3, x7, x7
+ umulh x4, x7, x7
+ adds x15, x15, x5
+ adcs x16, x16, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x8, x8
+ umulh x4, x8, x8
+ adds x17, x17, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x9, x9
+ umulh x4, x9, x9
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x17, #63
+ and x17, x17, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x14, x14, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x15, x15, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x16, x16, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x17, x17, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x15, x15, x2
+ adcs x16, x16, x26
+ adcs x17, x17, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x17, #63
+ mul x5, x5, x3
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Reduce if top bit set
+ and x5, x3, x17, asr 63
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Store
+ # Multiply
+ # A[0] * B[0]
+ mul x6, x14, x10
+ umulh x7, x14, x10
+ # A[0] * B[1]
+ mul x3, x14, x11
+ umulh x8, x14, x11
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x10
+ umulh x4, x15, x10
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x12
+ umulh x4, x14, x12
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x10
+ umulh x4, x16, x10
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x13
+ umulh x4, x14, x13
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x12
+ umulh x4, x15, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x11
+ umulh x4, x16, x11
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x10
+ umulh x4, x17, x10
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x13
+ umulh x4, x15, x13
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x12
+ umulh x4, x16, x12
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x11
+ umulh x4, x17, x11
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x13
+ umulh x4, x16, x13
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x12
+ umulh x4, x17, x12
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x13
+ umulh x4, x17, x13
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ # Sub
+ subs x14, x14, x10
+ sbcs x15, x15, x11
+ sbcs x16, x16, x12
+ sbcs x17, x17, x13
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x14, x14, x3
+ adcs x15, x15, x2
+ adcs x16, x16, x2
+ adc x17, x17, x4
+ # Multiply by 121666
+ mov x5, #0xdb42
+ movk x5, #1, lsl 16
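+ # x5 = 0x1db42 = 121666 = (486662 + 2) / 4, where 486662 is the
+ # Curve25519 A coefficient.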
+ mul x6, x14, x5
+ umulh x7, x14, x5
+ mul x3, x15, x5
+ umulh x4, x15, x5
+ adds x7, x7, x3
+ adc x8, xzr, x4
+ mul x3, x16, x5
+ umulh x4, x16, x5
+ adds x8, x8, x3
+ adc x9, xzr, x4
+ mul x3, x17, x5
+ umulh x4, x17, x5
+ adds x9, x9, x3
+ adc x4, xzr, x4
+ mov x5, #19
+ extr x4, x4, x9, #63
+ mul x4, x4, x5
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x4
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Add
+ adds x10, x10, x6
+ adcs x11, x11, x7
+ adcs x12, x12, x8
+ adc x13, x13, x9
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Multiply
+ # A[0] * B[0]
+ mul x6, x14, x10
+ umulh x7, x14, x10
+ # A[0] * B[1]
+ mul x3, x14, x11
+ umulh x8, x14, x11
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x10
+ umulh x4, x15, x10
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x12
+ umulh x4, x14, x12
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x10
+ umulh x4, x16, x10
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x13
+ umulh x4, x14, x13
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x12
+ umulh x4, x15, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x11
+ umulh x4, x16, x11
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x10
+ umulh x4, x17, x10
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x13
+ umulh x4, x15, x13
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x12
+ umulh x4, x16, x12
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x11
+ umulh x4, x17, x11
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x13
+ umulh x4, x16, x13
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x12
+ umulh x4, x17, x12
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x13
+ umulh x4, x17, x13
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x29, #16]
+ stp x8, x9, [x29, #32]
+ # Add
+ ldp x6, x7, [x29, #112]
+ ldp x8, x9, [x29, #128]
+ adds x10, x6, x19
+ adcs x11, x7, x20
+ adcs x12, x8, x21
+ adc x13, x9, x22
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Sub
+ subs x19, x6, x19
+ sbcs x20, x7, x20
+ sbcs x21, x8, x21
+ sbcs x22, x9, x22
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x19, x19, x3
+ adcs x20, x20, x2
+ adcs x21, x21, x2
+ adc x22, x22, x4
+ # Square
+ # A[0] * A[1]
+ mul x7, x10, x11
+ umulh x8, x10, x11
+ # A[0] * A[2]
+ mul x3, x10, x12
+ umulh x9, x10, x12
+ adds x8, x8, x3
+ adc x9, x9, xzr
+ # A[0] * A[3]
+ mul x3, x10, x13
+ umulh x2, x10, x13
+ adds x9, x9, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x11, x12
+ umulh x4, x11, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x11, x13
+ umulh x4, x11, x13
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x12, x13
+ umulh x27, x12, x13
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x6, x10, x10
+ umulh x5, x10, x10
+ # A[1] * A[1]
+ mul x3, x11, x11
+ umulh x4, x11, x11
+ adds x7, x7, x5
+ adcs x8, x8, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x12, x12
+ umulh x4, x12, x12
+ adds x9, x9, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x13, x13
+ umulh x4, x13, x13
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x29, #80]
+ stp x8, x9, [x29, #96]
+ # Square
+ # A[0] * A[1]
+ mul x7, x19, x20
+ umulh x8, x19, x20
+ # A[0] * A[2]
+ mul x3, x19, x21
+ umulh x9, x19, x21
+ adds x8, x8, x3
+ adc x9, x9, xzr
+ # A[0] * A[3]
+ mul x3, x19, x22
+ umulh x2, x19, x22
+ adds x9, x9, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x20, x21
+ umulh x4, x20, x21
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x20, x22
+ umulh x4, x20, x22
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x21, x22
+ umulh x27, x21, x22
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x6, x19, x19
+ umulh x5, x19, x19
+ # A[1] * A[1]
+ mul x3, x20, x20
+ umulh x4, x20, x20
+ adds x7, x7, x5
+ adcs x8, x8, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x21, x21
+ umulh x4, x21, x21
+ adds x9, x9, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x22, x22
+ umulh x4, x22, x22
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ ldr x2, [x29, #184]
+ # Multiply
+ ldp x14, x15, [x2]
+ ldp x16, x17, [x2, #16]
+ # A[0] * B[0]
+ mul x10, x14, x6
+ umulh x11, x14, x6
+ # A[0] * B[1]
+ mul x3, x14, x7
+ umulh x12, x14, x7
+ adds x11, x11, x3
+ adc x12, x12, xzr
+ # A[1] * B[0]
+ mul x3, x15, x6
+ umulh x4, x15, x6
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x8
+ umulh x4, x14, x8
+ adds x12, x12, x3
+ adc x13, x13, x4
+ # A[1] * B[1]
+ mul x3, x15, x7
+ umulh x4, x15, x7
+ adds x12, x12, x3
+ adcs x13, x13, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x6
+ umulh x4, x16, x6
+ adds x12, x12, x3
+ adcs x13, x13, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x9
+ umulh x4, x14, x9
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x8
+ umulh x4, x15, x8
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x7
+ umulh x4, x16, x7
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x6
+ umulh x4, x17, x6
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x9
+ umulh x4, x15, x9
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x8
+ umulh x4, x16, x8
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x7
+ umulh x4, x17, x7
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x9
+ umulh x4, x16, x9
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x8
+ umulh x4, x17, x8
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x9
+ umulh x4, x17, x9
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x13, #63
+ and x13, x13, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x10, x10, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x11, x11, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x12, x12, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x13, x13, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x11, x11, x2
+ adcs x12, x12, x26
+ adcs x13, x13, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x13, #63
+ mul x5, x5, x3
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Reduce if top bit set
+ and x5, x3, x13, asr 63
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Store
+ stp x10, x11, [x29, #48]
+ stp x12, x13, [x29, #64]
+ sub x25, x25, #1
+ cmp x25, #0
+ bge L_curve25519_bits
+ mov x25, #63
+ sub x24, x24, #8
+ cmp x24, #0
+ bge L_curve25519_words
+ # Invert
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ add x1, x29, #0x50
+ bl fe_sq
+ add x1, x29, #16
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #48
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ add x0, x29, #0x50
+ add x1, x29, #0x50
+ add x2, x29, #0x70
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x24, #4
+ add x1, x29, #0x70
+L_curve25519_inv_1:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_1
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ add x1, x29, #0x50
+ bl fe_sq
+ mov x24, #9
+ add x1, x29, #0x70
+L_curve25519_inv_2:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_2
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x90
+ bl fe_sq
+ mov x24, #19
+ add x1, x29, #0x90
+L_curve25519_inv_3:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_3
+ add x0, x29, #0x70
+ add x2, x29, #0x70
+ bl fe_mul
+ mov x24, #10
+ add x1, x29, #0x70
+L_curve25519_inv_4:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_4
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ add x1, x29, #0x50
+ bl fe_sq
+ mov x24, #49
+ add x1, x29, #0x70
+L_curve25519_inv_5:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_5
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x90
+ bl fe_sq
+ mov x24, #0x63
+ add x1, x29, #0x90
+L_curve25519_inv_6:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_6
+ add x0, x29, #0x70
+ add x2, x29, #0x70
+ bl fe_mul
+ mov x24, #50
+ add x1, x29, #0x70
+L_curve25519_inv_7:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_7
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x24, #5
+ add x1, x29, #0x50
+L_curve25519_inv_8:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_8
+ add x0, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ ldr x0, [x29, #176]
+ # Multiply
+ ldp x6, x7, [x0]
+ ldp x8, x9, [x0, #16]
+ ldp x10, x11, [x29, #16]
+ ldp x12, x13, [x29, #32]
+ # A[0] * B[0]
+ mul x14, x6, x10
+ umulh x15, x6, x10
+ # A[0] * B[1]
+ mul x3, x6, x11
+ umulh x16, x6, x11
+ adds x15, x15, x3
+ adc x16, x16, xzr
+ # A[1] * B[0]
+ mul x3, x7, x10
+ umulh x4, x7, x10
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x6, x12
+ umulh x4, x6, x12
+ adds x16, x16, x3
+ adc x17, x17, x4
+ # A[1] * B[1]
+ mul x3, x7, x11
+ umulh x4, x7, x11
+ adds x16, x16, x3
+ adcs x17, x17, x4
+ adc x19, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x8, x10
+ umulh x4, x8, x10
+ adds x16, x16, x3
+ adcs x17, x17, x4
+ adc x19, x19, xzr
+ # A[0] * B[3]
+ mul x3, x6, x13
+ umulh x4, x6, x13
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x7, x12
+ umulh x4, x7, x12
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[2] * B[1]
+ mul x3, x8, x11
+ umulh x4, x8, x11
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[3] * B[0]
+ mul x3, x9, x10
+ umulh x4, x9, x10
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[1] * B[3]
+ mul x3, x7, x13
+ umulh x4, x7, x13
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x8, x12
+ umulh x4, x8, x12
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, x21, xzr
+ # A[3] * B[1]
+ mul x3, x9, x11
+ umulh x4, x9, x11
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, x21, xzr
+ # A[2] * B[3]
+ mul x3, x8, x13
+ umulh x4, x8, x13
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x9, x12
+ umulh x4, x9, x12
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, x22, xzr
+ # A[3] * B[3]
+ mul x3, x9, x13
+ umulh x4, x9, x13
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x22, x22, x21, #63
+ extr x21, x21, x20, #63
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ and x17, x17, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x19
+ umulh x19, x3, x19
+ adds x14, x14, x4
+ mul x4, x3, x20
+ umulh x20, x3, x20
+ adcs x15, x15, x4
+ mul x4, x3, x21
+ umulh x21, x3, x21
+ adcs x16, x16, x4
+ mul x4, x3, x22
+ umulh x5, x3, x22
+ adcs x17, x17, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x15, x15, x19
+ adcs x16, x16, x20
+ adcs x17, x17, x21
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x17, #63
+ mul x5, x5, x3
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Reduce if top bit set
+ and x5, x3, x17, asr 63
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Store
+ stp x14, x15, [x0]
+ stp x16, x17, [x0, #16]
+ mov x0, xzr
+ ldr x17, [x29, #200]
+ ldr x19, [x29, #208]
+ ldp x20, x21, [x29, #216]
+ ldp x22, x23, [x29, #232]
+ ldp x24, x25, [x29, #248]
+ ldp x26, x27, [x29, #264]
+ ldr x28, [x29, #280]
+ ldp x29, x30, [sp], #0x120
+ ret
+ .size curve25519,.-curve25519
+ .text
+ .align 2
+ .globl fe_pow22523
+ .type fe_pow22523, %function
+fe_pow22523:
+ stp x29, x30, [sp, #-144]!
+ add x29, sp, #0
+ str x21, [x29, #136]
+ # pow22523
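+ # Computes z^((p - 5) / 8) = z^(2^252 - 3), the exponentiation used
+ # when taking square roots during Ed25519 point decompression.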
+ str x0, [x29, #112]
+ str x1, [x29, #120]
+ add x0, x29, #16
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x1, x29, #48
+ bl fe_sq
+ ldr x1, [x29, #120]
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #16
+ add x1, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ bl fe_sq
+ add x1, x29, #48
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #4
+ add x1, x29, #48
+L_fe_pow22523_1:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_1
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #9
+ add x1, x29, #48
+L_fe_pow22523_2:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_2
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x21, #19
+ add x1, x29, #0x50
+L_fe_pow22523_3:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_3
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x21, #10
+ add x1, x29, #48
+L_fe_pow22523_4:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_4
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #49
+ add x1, x29, #48
+L_fe_pow22523_5:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_5
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x21, #0x63
+ add x1, x29, #0x50
+L_fe_pow22523_6:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_6
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x21, #50
+ add x1, x29, #48
+L_fe_pow22523_7:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_7
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ mov x21, #2
+ add x1, x29, #16
+L_fe_pow22523_8:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_8
+ ldr x0, [x29, #112]
+ ldr x2, [x29, #120]
+ bl fe_mul
+ ldr x21, [x29, #136]
+ ldp x29, x30, [sp], #0x90
+ ret
+ .size fe_pow22523,.-fe_pow22523
+ .text
+ .align 2
+ .globl fe_ge_to_p2
+ .type fe_ge_to_p2, %function
+fe_ge_to_p2:
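+ # Converts a completed (P1xP1) point to projective (P2) coordinates
+ # using three multiplications of the last four arguments:
+ # arg0 = arg3 * arg6, arg1 = arg4 * arg5, arg2 = arg5 * arg6
+ # (X = X*T, Y = Y*Z, Z = Z*T).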
+ stp x29, x30, [sp, #-112]!
+ add x29, sp, #0
+ str x17, [x29, #72]
+ str x19, [x29, #80]
+ stp x20, x21, [x29, #88]
+ str x22, [x29, #104]
+ str x1, [x29, #16]
+ str x2, [x29, #24]
+ str x3, [x29, #32]
+ str x4, [x29, #40]
+ str x5, [x29, #48]
+ str x6, [x29, #56]
+ ldr x1, [x29, #32]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x20, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x12, x15
+ umulh x21, x12, x15
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x11, x17
+ umulh x21, x11, x17
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x12, x16
+ umulh x21, x12, x16
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x13, x15
+ umulh x21, x13, x15
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x11, x19
+ umulh x21, x11, x19
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x12, x17
+ umulh x21, x12, x17
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x13, x16
+ umulh x21, x13, x16
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x14, x15
+ umulh x21, x14, x15
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x12, x19
+ umulh x21, x12, x19
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x13, x17
+ umulh x21, x13, x17
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x14, x16
+ umulh x21, x14, x16
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x13, x19
+ umulh x21, x13, x19
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x14, x17
+ umulh x21, x14, x17
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x14, x19
+ umulh x21, x14, x19
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #16]
+ ldr x1, [x29, #40]
+ ldr x2, [x29, #48]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x20, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x12, x15
+ umulh x21, x12, x15
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x11, x17
+ umulh x21, x11, x17
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x12, x16
+ umulh x21, x12, x16
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x13, x15
+ umulh x21, x13, x15
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x11, x19
+ umulh x21, x11, x19
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x12, x17
+ umulh x21, x12, x17
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x13, x16
+ umulh x21, x13, x16
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x14, x15
+ umulh x21, x14, x15
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x12, x19
+ umulh x21, x12, x19
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x13, x17
+ umulh x21, x13, x17
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x14, x16
+ umulh x21, x14, x16
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x13, x19
+ umulh x21, x13, x19
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x14, x17
+ umulh x21, x14, x17
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x14, x19
+ umulh x21, x14, x19
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #24]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x2]
+ ldp x13, x14, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ # A[0] * B[1]
+ mul x20, x15, x12
+ umulh x5, x15, x12
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x16, x11
+ umulh x21, x16, x11
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x15, x13
+ umulh x21, x15, x13
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x16, x12
+ umulh x21, x16, x12
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x17, x11
+ umulh x21, x17, x11
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x15, x14
+ umulh x21, x15, x14
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x16, x13
+ umulh x21, x16, x13
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x17, x12
+ umulh x21, x17, x12
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x19, x11
+ umulh x21, x19, x11
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x16, x14
+ umulh x21, x16, x14
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x17, x13
+ umulh x21, x17, x13
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x19, x12
+ umulh x21, x19, x12
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x17, x14
+ umulh x21, x17, x14
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x19, x13
+ umulh x21, x19, x13
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x19, x14
+ umulh x21, x19, x14
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x17, [x29, #72]
+ ldr x19, [x29, #80]
+ ldp x20, x21, [x29, #88]
+ ldr x22, [x29, #104]
+ ldp x29, x30, [sp], #0x70
+ ret
+ .size fe_ge_to_p2,.-fe_ge_to_p2
+ .text
+ .align 2
+ .globl fe_ge_to_p3
+ .type fe_ge_to_p3, %function
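+ # Convert a completed (p1p1) group element to the extended (p3) form via
+ # four reduced field multiplications. Taking the arguments in the usual
+ # (rx, ry, rz, rt, px, py, pz, pt) order, the code below computes
+ #   rx = px*pt, rt = px*py, ry = py*pz, rz = pz*pt  (mod 2^255 - 19).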
+fe_ge_to_p3:
+ stp x29, x30, [sp, #-160]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ str x26, [x29, #152]
+ str x1, [x29, #16]
+ str x2, [x29, #24]
+ str x3, [x29, #32]
+ str x4, [x29, #40]
+ str x5, [x29, #48]
+ str x6, [x29, #56]
+ str x7, [x29, #64]
+ ldr x1, [x29, #40]
+ ldr x2, [x29, #64]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x24, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x15
+ umulh x25, x12, x15
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x17
+ umulh x25, x11, x17
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x16
+ umulh x25, x12, x16
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x15
+ umulh x25, x13, x15
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x19
+ umulh x25, x11, x19
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x17
+ umulh x25, x12, x17
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x16
+ umulh x25, x13, x16
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x15
+ umulh x25, x14, x15
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x19
+ umulh x25, x12, x19
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x17
+ umulh x25, x13, x17
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x16
+ umulh x25, x14, x16
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x19
+ umulh x25, x13, x19
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x17
+ umulh x25, x14, x17
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x19
+ umulh x25, x14, x19
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #48]
+ # Multiply
+ ldp x20, x21, [x2]
+ ldp x22, x23, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x20
+ umulh x4, x11, x20
+ # A[0] * B[1]
+ mul x24, x11, x21
+ umulh x5, x11, x21
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x20
+ umulh x25, x12, x20
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x22
+ umulh x25, x11, x22
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x21
+ umulh x25, x12, x21
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x20
+ umulh x25, x13, x20
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x23
+ umulh x25, x11, x23
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x22
+ umulh x25, x12, x22
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x21
+ umulh x25, x13, x21
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x20
+ umulh x25, x14, x20
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x23
+ umulh x25, x12, x23
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x22
+ umulh x25, x13, x22
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x21
+ umulh x25, x14, x21
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x23
+ umulh x25, x13, x23
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x22
+ umulh x25, x14, x22
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x23
+ umulh x25, x14, x23
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #16]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x2]
+ ldp x13, x14, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x20, x11
+ umulh x4, x20, x11
+ # A[0] * B[1]
+ mul x24, x20, x12
+ umulh x5, x20, x12
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x21, x11
+ umulh x25, x21, x11
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x20, x13
+ umulh x25, x20, x13
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x21, x12
+ umulh x25, x21, x12
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x22, x11
+ umulh x25, x22, x11
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x20, x14
+ umulh x25, x20, x14
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x21, x13
+ umulh x25, x21, x13
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x22, x12
+ umulh x25, x22, x12
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x23, x11
+ umulh x25, x23, x11
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x21, x14
+ umulh x25, x21, x14
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x22, x13
+ umulh x25, x22, x13
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x23, x12
+ umulh x25, x23, x12
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x22, x14
+ umulh x25, x22, x14
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x23, x13
+ umulh x25, x23, x13
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x23, x14
+ umulh x25, x23, x14
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #24]
+ # Multiply
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x24, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x15
+ umulh x25, x12, x15
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x17
+ umulh x25, x11, x17
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x16
+ umulh x25, x12, x16
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x15
+ umulh x25, x13, x15
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x19
+ umulh x25, x11, x19
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x17
+ umulh x25, x12, x17
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x16
+ umulh x25, x13, x16
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x15
+ umulh x25, x14, x15
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x19
+ umulh x25, x12, x19
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x17
+ umulh x25, x13, x17
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x16
+ umulh x25, x14, x16
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x19
+ umulh x25, x13, x19
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x17
+ umulh x25, x14, x17
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x19
+ umulh x25, x14, x19
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldr x26, [x29, #152]
+ ldp x29, x30, [sp], #0xa0
+ ret
+ .size fe_ge_to_p3,.-fe_ge_to_p3
+ .text
+ .align 2
+ .globl fe_ge_dbl
+ .type fe_ge_dbl, %function
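+ # Point doubling producing a completed (p1p1) result: the code squares
+ # px and py, squares-and-doubles pz, and combines the results with
+ # branch-free modular additions and subtractions, following the usual
+ # X^2 / Y^2 / 2*Z^2 Edwards doubling formulas.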
+fe_ge_dbl:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ ldr x1, [x29, #48]
+ # Square
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ # A[0] * A[1]
+ mul x5, x12, x13
+ umulh x6, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x7, x12, x14
+ adds x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x8, x12, x15
+ adds x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x8, x8, x25
+ adc x9, x9, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x10, x14, x15
+ adds x9, x9, x25
+ adc x10, x10, xzr
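+ # Squaring strategy: each off-diagonal product A[i]*A[j] (i < j) is
+ # computed once and doubled below; the diagonal squares A[i]*A[i] are
+ # then added into the doubled running total.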
+ # Double
+ adds x5, x5, x5
+ adcs x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, xzr, xzr
+ # A[0] * A[0]
+ mul x4, x12, x12
+ umulh x27, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x5, x5, x27
+ adcs x6, x6, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x7, x7, x27
+ adcs x8, x8, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x9, x9, x27
+ adcs x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #56]
+ # Square
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * A[1]
+ mul x9, x21, x22
+ umulh x10, x21, x22
+ # A[0] * A[2]
+ mul x25, x21, x23
+ umulh x11, x21, x23
+ adds x10, x10, x25
+ adc x11, x11, xzr
+ # A[0] * A[3]
+ mul x25, x21, x24
+ umulh x16, x21, x24
+ adds x11, x11, x25
+ adc x16, x16, xzr
+ # A[1] * A[2]
+ mul x25, x22, x23
+ umulh x26, x22, x23
+ adds x11, x11, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x22, x24
+ umulh x26, x22, x24
+ adds x16, x16, x25
+ adc x17, x17, x26
+ # A[2] * A[3]
+ mul x25, x23, x24
+ umulh x19, x23, x24
+ adds x17, x17, x25
+ adc x19, x19, xzr
+ # Double
+ adds x9, x9, x9
+ adcs x10, x10, x10
+ adcs x11, x11, x11
+ adcs x16, x16, x16
+ adcs x17, x17, x17
+ adcs x19, x19, x19
+ adc x20, xzr, xzr
+ # A[0] * A[0]
+ mul x8, x21, x21
+ umulh x27, x21, x21
+ # A[1] * A[1]
+ mul x25, x22, x22
+ umulh x26, x22, x22
+ adds x9, x9, x27
+ adcs x10, x10, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x23, x23
+ umulh x26, x23, x23
+ adds x11, x11, x27
+ adcs x16, x16, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x24, x24
+ umulh x26, x24, x24
+ adds x17, x17, x27
+ adcs x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x8, x8, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x9, x9, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x10, x10, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x16
+ adcs x10, x10, x17
+ adcs x11, x11, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ ldr x0, [x29, #24]
+ # Add
+ adds x12, x12, x21
+ adcs x13, x13, x22
+ adcs x14, x14, x23
+ adc x15, x15, x24
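+ # The asr below builds an all-ones mask in x28 when bit 255 of the sum
+ # is set; the masked subtraction then removes p = 2^255 - 19 without
+ # branching.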
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ ldr x0, [x29, #40]
+ # Square
+ # A[0] * A[1]
+ mul x17, x12, x13
+ umulh x19, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x20, x12, x14
+ adds x19, x19, x25
+ adc x20, x20, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x21, x12, x15
+ adds x20, x20, x25
+ adc x21, x21, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x20, x20, x25
+ adcs x21, x21, x26
+ adc x22, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x21, x21, x25
+ adc x22, x22, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x23, x14, x15
+ adds x22, x22, x25
+ adc x23, x23, xzr
+ # Double
+ adds x17, x17, x17
+ adcs x19, x19, x19
+ adcs x20, x20, x20
+ adcs x21, x21, x21
+ adcs x22, x22, x22
+ adcs x23, x23, x23
+ adc x24, xzr, xzr
+ # A[0] * A[0]
+ mul x16, x12, x12
+ umulh x27, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x17, x17, x27
+ adcs x19, x19, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x20, x20, x27
+ adcs x21, x21, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x22, x22, x27
+ adcs x23, x23, x25
+ adc x24, x24, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x24, x24, x23, #63
+ extr x23, x23, x22, #63
+ extr x22, x22, x21, #63
+ extr x21, x21, x20, #63
+ and x20, x20, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x21
+ umulh x21, x25, x21
+ adds x16, x16, x26
+ mul x26, x25, x22
+ umulh x22, x25, x22
+ adcs x17, x17, x26
+ mul x26, x25, x23
+ umulh x23, x25, x23
+ adcs x19, x19, x26
+ mul x26, x25, x24
+ umulh x27, x25, x24
+ adcs x20, x20, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x17, x17, x21
+ adcs x19, x19, x22
+ adcs x20, x20, x23
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x20, #63
+ mul x27, x27, x25
+ and x20, x20, #0x7fffffffffffffff
+ adds x16, x16, x27
+ adcs x17, x17, xzr
+ adcs x19, x19, xzr
+ adc x20, x20, xzr
+ # Reduce if top bit set
+ and x27, x25, x20, asr 63
+ and x20, x20, #0x7fffffffffffffff
+ adds x16, x16, x27
+ adcs x17, x17, xzr
+ adcs x19, x19, xzr
+ adc x20, x20, xzr
+ # Store
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #32]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x21, x8, x4
+ sbcs x22, x9, x5
+ sbcs x23, x10, x6
+ sbcs x24, x11, x7
+ mov x25, #-19
+ csetm x28, cc
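+ # Carry clear after the subs/sbcs chain means the subtraction borrowed,
+ # so x28 is an all-ones mask selecting a branch-free addition of
+ # p = 2^255 - 19 to bring the result back into range.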
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x21, x21, x25
+ adcs x22, x22, x28
+ adcs x23, x23, x28
+ adc x24, x24, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x21, x22, [x1]
+ stp x23, x24, [x1, #16]
+ ldr x0, [x29, #16]
+ # Sub
+ subs x16, x16, x12
+ sbcs x17, x17, x13
+ sbcs x19, x19, x14
+ sbcs x20, x20, x15
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #64]
+ # Square * 2
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ # A[0] * A[1]
+ mul x5, x12, x13
+ umulh x6, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x7, x12, x14
+ adds x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x8, x12, x15
+ adds x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x8, x8, x25
+ adc x9, x9, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x10, x14, x15
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # Double
+ adds x5, x5, x5
+ adcs x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, xzr, xzr
+ # A[0] * A[0]
+ mul x4, x12, x12
+ umulh x28, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x5, x5, x28
+ adcs x6, x6, x25
+ adc x28, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x7, x7, x28
+ adcs x8, x8, x25
+ adc x28, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x9, x9, x28
+ adcs x10, x10, x25
+ adc x11, x11, x26
+ # Double and Reduce
+ mov x25, #0x169
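+ # 0x169 = 361 = 19*19: the few bits of the doubled square at weight
+ # 2^510 and above fold back in scaled by (2^255)^2 = 19^2 modulo p.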
+ # Move top half into t4-t7 and remove top bit from t3
+ lsr x28, x11, #61
+ extr x11, x11, x10, #62
+ extr x10, x10, x9, #62
+ extr x9, x9, x8, #62
+ extr x8, x8, x7, #62
+ extr x7, x7, x6, #63
+ extr x6, x6, x5, #63
+ extr x5, x5, x4, #63
+ lsl x4, x4, #1
+ and x7, x7, #0x7fffffffffffffff
+ # Two left, only one right
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top bits by 19*19
+ mul x28, x28, x25
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x4, x4, x28
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #40]
+ # Sub
+ subs x4, x4, x21
+ sbcs x5, x5, x22
+ sbcs x6, x6, x23
+ sbcs x7, x7, x24
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x4, x4, x25
+ adcs x5, x5, x28
+ adcs x6, x6, x28
+ adc x7, x7, x26
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_dbl,.-fe_ge_dbl
+ .text
+ .align 2
+ .globl fe_ge_madd
+ .type fe_ge_madd, %function
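+ # Mixed point addition: forms py+px and py-px, multiplies them and pt by
+ # the extra field-element arguments passed on the caller's stack (AAPCS64
+ # places the ninth and later arguments at [x29, #176] and up in this
+ # frame), doubles pz, and combines the products with modular additions
+ # and subtractions.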
+fe_ge_madd:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #184]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #176]
+ ldr x3, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x3]
+ ldp x23, x24, [x3, #16]
+ # A[0] * B[0]
+ mul x4, x16, x21
+ umulh x5, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x6, x16, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #64]
+ # Double
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ adds x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, x11, x11
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x8, x4
+ sbcs x17, x9, x5
+ sbcs x19, x10, x6
+ sbcs x20, x11, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_madd,.-fe_ge_madd
+ .text
+ .align 2
+ .globl fe_ge_msub
+ .type fe_ge_msub, %function
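+ # Same structure as fe_ge_madd above, with the two stack-passed
+ # multiplicands exchanged and the final add/sub results written to the
+ # opposite pair of outputs, giving the corresponding point subtraction.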
+fe_ge_msub:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #184]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #176]
+ ldr x3, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x3]
+ ldp x23, x24, [x3, #16]
+ # A[0] * B[0]
+ mul x4, x16, x21
+ umulh x5, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x6, x16, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #64]
+ # Double
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ adds x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, x11, x11
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x8, x4
+ sbcs x17, x9, x5
+ sbcs x19, x10, x6
+ sbcs x20, x11, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x1]
+ stp x14, x15, [x1, #16]
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_msub,.-fe_ge_msub
+ .text
+ .align 2
+ .globl fe_ge_add
+ .type fe_ge_add, %function
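+ # Full point addition: same overall structure as fe_ge_madd, with the
+ # additional field-element operands of the second point read from the
+ # caller's stack at [x29, #176] and up (AAPCS64 stack arguments).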
+fe_ge_add:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #200]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store (deferred; result kept in x4..x7)
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
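+ # asr makes x28 all-ones exactly when bit 255 of the sum is set; the
+ # masked limbs below then form p = 2^255 - 19, so p is subtracted only
+ # when the sum reaches bit 255.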
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #48]
+ ldr x1, [x29, #64]
+ ldr x2, [x29, #176]
+ # Multiply
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ ldp x16, x17, [x2]
+ ldp x19, x20, [x2, #16]
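+ # 4x4-limb schoolbook multiply of [x1] (x12..x15) by [x2] (x16,x17,x19,x20);
+ # product accumulates into x4..x7 (low) and x8..x11 (high).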
+ # A[0] * B[0]
+ mul x4, x12, x16
+ umulh x5, x12, x16
+ # A[0] * B[1]
+ mul x25, x12, x17
+ umulh x6, x12, x17
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x13, x16
+ umulh x26, x13, x16
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x12, x19
+ umulh x26, x12, x19
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x13, x17
+ umulh x26, x13, x17
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x14, x16
+ umulh x26, x14, x16
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x12, x20
+ umulh x26, x12, x20
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x13, x19
+ umulh x26, x13, x19
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x14, x17
+ umulh x26, x14, x17
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x15, x16
+ umulh x26, x15, x16
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x13, x20
+ umulh x26, x13, x20
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x14, x19
+ umulh x26, x14, x19
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x15, x17
+ umulh x26, x15, x17
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x14, x20
+ umulh x26, x14, x20
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x15, x19
+ umulh x26, x15, x19
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x15, x20
+ umulh x26, x15, x20
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into x8,x9,x10,x11 and remove top bit from x7
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store (deferred; result kept in x4..x7)
+ ldr x0, [x29, #48]
+ # Double
+ adds x4, x4, x4
+ adcs x5, x5, x5
+ adcs x6, x6, x6
+ adc x7, x7, x7
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #184]
+ ldr x2, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
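+ # 4x4-limb schoolbook multiply of [x1] (x16,x17,x19,x20) by [x2] (x21..x24);
+ # product accumulates into x8..x11 (low) and x12..x15 (high).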
+ # A[0] * B[0]
+ mul x8, x16, x21
+ umulh x9, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x10, x16, x22
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, x12, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, x15, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # Reduce
+ # Move top half into x12,x13,x14,x15 and remove top bit from x11
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x12
+ umulh x12, x25, x12
+ adds x8, x8, x26
+ mul x26, x25, x13
+ umulh x13, x25, x13
+ adcs x9, x9, x26
+ mul x26, x25, x14
+ umulh x14, x25, x14
+ adcs x10, x10, x26
+ mul x26, x25, x15
+ umulh x27, x25, x15
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x12
+ adcs x10, x10, x13
+ adcs x11, x11, x14
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store (deferred; result kept in x8..x11)
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x4, x8
+ adcs x13, x5, x9
+ adcs x14, x6, x10
+ adc x15, x7, x11
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x4, x8
+ sbcs x17, x5, x9
+ sbcs x19, x6, x10
+ sbcs x20, x7, x11
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
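+ # Restore the saved registers and release the 176-byte (0xb0) frame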
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_add,.-fe_ge_add
+ .text
+ .align 2
+ .globl fe_ge_sub
+ .type fe_ge_sub, %function
+fe_ge_sub:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
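+ # x0..x7 carry the first eight arguments; the remaining (stack-passed)
+ # arguments are read later from the caller's frame at [x29, #176..#200],
+ # just past this function's 176-byte frame.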
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
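+ # Compute both the sum (x4..x7) and the difference (x8..x11) of the two
+ # field elements just loaded; each gets a conditional modulus correction.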
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #200]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
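+ # 4x4-limb schoolbook multiply: A = x4..x7 (the sum above), B = x21..x24;
+ # product accumulates into x12..x15 (low) and x16,x17,x19,x20 (high).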
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
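+ # Reduce the 512-bit product modulo p = 2^255 - 19: 2^255 == 19 (mod p),
+ # so the bits at and above bit 255 are multiplied by 19 and folded back in.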
+ # Reduce
+ # Move top half into x16,x17,x19,x20 and remove top bit from x15
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store (deferred; result kept in x12..x15)
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into x16,x17,x19,x20 and remove top bit from x7
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store (deferred; result kept in x4..x7)
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #48]
+ ldr x1, [x29, #64]
+ ldr x2, [x29, #176]
+ # Multiply
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ ldp x16, x17, [x2]
+ ldp x19, x20, [x2, #16]
+ # A[0] * B[0]
+ mul x4, x12, x16
+ umulh x5, x12, x16
+ # A[0] * B[1]
+ mul x25, x12, x17
+ umulh x6, x12, x17
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x13, x16
+ umulh x26, x13, x16
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x12, x19
+ umulh x26, x12, x19
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x13, x17
+ umulh x26, x13, x17
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x14, x16
+ umulh x26, x14, x16
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x12, x20
+ umulh x26, x12, x20
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x13, x19
+ umulh x26, x13, x19
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x14, x17
+ umulh x26, x14, x17
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x15, x16
+ umulh x26, x15, x16
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x13, x20
+ umulh x26, x13, x20
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x14, x19
+ umulh x26, x14, x19
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x15, x17
+ umulh x26, x15, x17
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x14, x20
+ umulh x26, x14, x20
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x15, x19
+ umulh x26, x15, x19
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x15, x20
+ umulh x26, x15, x20
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into x8,x9,x10,x11 and remove top bit from x7
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store (deferred; result kept in x4..x7)
+ ldr x0, [x29, #48]
+ # Double
+ adds x4, x4, x4
+ adcs x5, x5, x5
+ adcs x6, x6, x6
+ adc x7, x7, x7
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #184]
+ ldr x2, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x8, x16, x21
+ umulh x9, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x10, x16, x22
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, x12, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, x15, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # Reduce
+ # Move top half into x12,x13,x14,x15 and remove top bit from x11
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x12
+ umulh x12, x25, x12
+ adds x8, x8, x26
+ mul x26, x25, x13
+ umulh x13, x25, x13
+ adcs x9, x9, x26
+ mul x26, x25, x14
+ umulh x14, x25, x14
+ adcs x10, x10, x26
+ mul x26, x25, x15
+ umulh x27, x25, x15
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x12
+ adcs x10, x10, x13
+ adcs x11, x11, x14
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store (deferred; result kept in x8..x11)
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #32]
+ # Add
+ adds x12, x4, x8
+ adcs x13, x5, x9
+ adcs x14, x6, x10
+ adc x15, x7, x11
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x4, x8
+ sbcs x17, x5, x9
+ sbcs x19, x6, x10
+ sbcs x20, x7, x11
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
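+ # Restore the saved registers and release the 176-byte (0xb0) frame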
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_sub,.-fe_ge_sub
+#endif /* __aarch64__ */