Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S')
| -rw-r--r-- | client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S | 6715 |
1 file changed, 6715 insertions, 0 deletions
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
new file mode 100644
index 0000000..891c6d8
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -0,0 +1,6715 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
+ */
+#ifdef __aarch64__
+    .text
+    .align 2
+    .globl fe_init
+    .type fe_init, %function
+fe_init:
+    ret
+    .size fe_init,.-fe_init
+    .text
+    .align 2
+    .globl fe_frombytes
+    .type fe_frombytes, %function
+fe_frombytes:
+    ldp x2, x3, [x1]
+    ldp x4, x5, [x1, #16]
+    and x5, x5, #0x7fffffffffffffff
+    stp x2, x3, [x0]
+    stp x4, x5, [x0, #16]
+    ret
+    .size fe_frombytes,.-fe_frombytes
+    .text
+    .align 2
+    .globl fe_tobytes
+    .type fe_tobytes, %function
+fe_tobytes:
+    mov x7, #19
+    ldp x2, x3, [x1]
+    ldp x4, x5, [x1, #16]
+    adds x6, x2, x7
+    adcs x6, x3, xzr
+    adcs x6, x4, xzr
+    adc x6, x5, xzr
+    and x6, x7, x6, asr 63
+    adds x2, x2, x6
+    adcs x3, x3, xzr
+    adcs x4, x4, xzr
+    adc x5, x5, xzr
+    and x5, x5, #0x7fffffffffffffff
+    stp x2, x3, [x0]
+    stp x4, x5, [x0, #16]
+    ret
+    .size fe_tobytes,.-fe_tobytes
+    .text
+    .align 2
+    .globl fe_1
+    .type fe_1, %function
+fe_1:
+    # Set one
+    mov x1, #1
+    stp x1, xzr, [x0]
+    stp xzr, xzr, [x0, #16]
+    ret
+    .size fe_1,.-fe_1
+    .text
+    .align 2
+    .globl fe_0
+    .type fe_0, %function
+fe_0:
+    # Set zero
+    stp xzr, xzr, [x0]
+    stp xzr, xzr, [x0, #16]
+    ret
+    .size fe_0,.-fe_0
+    .text
+    .align 2
+    .globl fe_copy
+    .type fe_copy, %function
+fe_copy:
+    # Copy
+    ldp x2, x3, [x1]
+    ldp x4, x5, [x1, #16]
+    stp x2, x3, [x0]
+    stp x4, x5, [x0, #16]
+    ret
+    .size fe_copy,.-fe_copy
+    .text
+    .align 2
+    .globl fe_sub
+    .type fe_sub, %function
+fe_sub:
+    # Sub
+    ldp x3, x4, [x1]
+    ldp x5, x6, [x1, #16]
+    ldp x7, x8, [x2]
+    ldp x9, x10, [x2, #16]
+    subs x3, x3, x7
+    sbcs x4, x4, x8
+    sbcs x5, x5, x9
+    sbcs x6, x6, x10
+    mov x12, #-19
+    csetm x11, cc
+    # Mask the modulus
+    and x12, x11, x12
+    and x13, x11, #0x7fffffffffffffff
+    # Add modulus (if underflow)
+    adds x3, x3, x12
+    adcs x4, x4, x11
+    adcs x5, x5, x11
+    adc x6, x6, x13
+    stp x3, x4, [x0]
+    stp x5, x6, [x0, #16]
+    ret
+    .size fe_sub,.-fe_sub
+    .text
+    .align 2
+    .globl fe_add
+    .type fe_add, %function
+fe_add:
+    # Add
+    ldp x3, x4, [x1]
+    ldp x5, x6, [x1, #16]
+    ldp x7, x8, [x2]
+    ldp x9, x10, [x2, #16]
+    adds x3, x3, x7
+    adcs x4, x4, x8
+    adcs x5, x5, x9
+    adc x6, x6, x10
+    mov x12, #-19
+    asr x11, x6, #63
+    # Mask the modulus
+    and x12, x11, x12
+    and x13, x11, #0x7fffffffffffffff
+    # Sub modulus (if overflow)
+    subs x3, x3, x12
+    sbcs x4, x4, x11
+    sbcs x5, x5, x11
+    sbc x6, x6, x13
+    stp x3,
x4, [x0] + stp x5, x6, [x0, #16] + ret + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function +fe_neg: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x6, #-19 + mov x7, #-1 + mov x8, #-1 + mov x9, #0x7fffffffffffffff + subs x6, x6, x2 + sbcs x7, x7, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ret + .size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function +fe_isnonzero: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + and x5, x6, x5, asr 63 + adds x1, x1, x5 + adcs x2, x2, xzr + adcs x3, x3, xzr + adc x4, x4, xzr + and x4, x4, #0x7fffffffffffffff + orr x0, x1, x2 + orr x3, x3, x4 + orr x0, x0, x3 + ret + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function +fe_isnegative: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + and x0, x1, #1 + eor x0, x0, x5, lsr 63 + ret + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: + stp x29, x30, [sp, #-128]! + add x29, sp, #0 + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + stp x22, x23, [x29, #72] + stp x24, x25, [x29, #88] + stp x26, x27, [x29, #104] + str x28, [x29, #120] + str x0, [x29, #16] + sxtb x2, w2 + sbfx x3, x2, #7, #1 + eor x0, x2, x3 + sub x0, x0, x3 + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + cmp x0, #1 + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, #48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #2 + ldp x16, x17, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #3 + ldp x16, x17, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #4 + ldp x16, x17, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, 
x15, eq + add x1, x1, #0x180 + cmp x0, #5 + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, #48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #6 + ldp x16, x17, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #7 + ldp x16, x17, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #8 + ldp x16, x17, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + mov x16, #-19 + mov x17, #-1 + mov x19, #-1 + mov x20, #0x7fffffffffffffff + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbc x20, x20, x15 + cmp x2, #0 + mov x3, x4 + csel x4, x8, x4, lt + csel x8, x3, x8, lt + mov x3, x5 + csel x5, x9, x5, lt + csel x9, x3, x9, lt + mov x3, x6 + csel x6, x10, x6, lt + csel x10, x3, x10, lt + mov x3, x7 + csel x7, x11, x7, lt + csel x11, x3, x11, lt + csel x12, x16, x12, lt + csel x13, x17, x13, lt + csel x14, x19, x14, lt + csel x15, x20, x15, lt + ldr x0, [x29, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + stp x12, x13, [x0, #64] + stp x14, x15, [x0, #80] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldp x22, x23, [x29, #72] + ldp x24, x25, [x29, #88] + ldp x26, x27, [x29, #104] + ldr x28, [x29, #120] + ldp x29, x30, [sp], #0x80 + ret + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function +fe_mul: + stp x29, x30, [sp, #-64]! 
+ add x29, sp, #0 + str x17, [x29, #24] + str x19, [x29, #32] + stp x20, x21, [x29, #40] + str x22, [x29, #56] + # Multiply + ldp x14, x15, [x1] + ldp x16, x17, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x6, x14, x19 + umulh x7, x14, x19 + # A[0] * B[1] + mul x3, x14, x20 + umulh x8, x14, x20 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x19 + umulh x4, x15, x19 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + umulh x4, x14, x21 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x20 + umulh x4, x15, x20 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x16, x19 + umulh x4, x16, x19 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x14, x22 + umulh x4, x14, x22 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x15, x21 + umulh x4, x15, x21 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x16, x20 + umulh x4, x16, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x17, x19 + umulh x4, x17, x19 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x15, x22 + umulh x4, x15, x22 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x16, x21 + umulh x4, x16, x21 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x17, x20 + umulh x4, x17, x20 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x16, x22 + umulh x4, x16, x22 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x17, x21 + umulh x4, x17, x21 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x17, x22 + umulh x4, x17, x22 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ldr x17, [x29, #24] + ldr x19, [x29, #32] + ldp x20, x21, [x29, #40] + ldr x22, [x29, #56] + ldp x29, x30, [sp], #0x40 + ret + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function +fe_sq: + # Square + ldp x13, x14, [x1] + ldp x15, x16, [x1, #16] + # A[0] * A[1] + mul x6, x13, x14 + umulh x7, x13, x14 + # A[0] * A[2] + mul x2, x13, x15 + umulh x8, x13, x15 + adds x7, x7, x2 + adc x8, x8, xzr + # A[0] * A[3] + mul x2, x13, x16 + umulh x9, x13, x16 + adds x8, x8, x2 + adc x9, x9, xzr + # A[1] * A[2] + mul x2, x14, x15 + umulh x3, x14, x15 + adds x8, x8, x2 + adcs x9, x9, x3 + adc x10, xzr, xzr + # A[1] * A[3] + mul x2, 
x14, x16 + umulh x3, x14, x16 + adds x9, x9, x2 + adc x10, x10, x3 + # A[2] * A[3] + mul x2, x15, x16 + umulh x11, x15, x16 + adds x10, x10, x2 + adc x11, x11, xzr + # Double + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adc x12, xzr, xzr + # A[0] * A[0] + mul x5, x13, x13 + umulh x4, x13, x13 + # A[1] * A[1] + mul x2, x14, x14 + umulh x3, x14, x14 + adds x6, x6, x4 + adcs x7, x7, x2 + adc x4, x3, xzr + # A[2] * A[2] + mul x2, x15, x15 + umulh x3, x15, x15 + adds x8, x8, x4 + adcs x9, x9, x2 + adc x4, x3, xzr + # A[3] * A[3] + mul x2, x16, x16 + umulh x3, x16, x16 + adds x10, x10, x4 + adcs x11, x11, x2 + adc x12, x12, x3 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + and x8, x8, #0x7fffffffffffffff + # Multiply top half by 19 + mov x2, #19 + mul x3, x2, x9 + umulh x9, x2, x9 + adds x5, x5, x3 + mul x3, x2, x10 + umulh x10, x2, x10 + adcs x6, x6, x3 + mul x3, x2, x11 + umulh x11, x2, x11 + adcs x7, x7, x3 + mul x3, x2, x12 + umulh x4, x2, x12 + adcs x8, x8, x3 + adc x4, x4, xzr + # Add remaining product results in + adds x6, x6, x9 + adcs x7, x7, x10 + adcs x8, x8, x11 + adc x4, x4, xzr + # Overflow + extr x4, x4, x8, #63 + mul x4, x4, x2 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # Reduce if top bit set + and x4, x2, x8, asr 63 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # Store + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_invert + .type fe_invert, %function +fe_invert: + stp x29, x30, [sp, #-176]! 
+ add x29, sp, #0 + str x20, [x29, #168] + # Invert + str x0, [x29, #144] + str x1, [x29, #152] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #152] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x20, #4 + add x1, x29, #0x50 +L_fe_invert1: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert1 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + mov x20, #9 + add x1, x29, #0x50 +L_fe_invert2: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert2 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x20, #19 + add x1, x29, #0x70 +L_fe_invert3: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert3 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x20, #10 + add x1, x29, #0x50 +L_fe_invert4: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert4 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + mov x20, #49 + add x1, x29, #0x50 +L_fe_invert5: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert5 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x20, #0x63 + add x1, x29, #0x70 +L_fe_invert6: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert6 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x20, #50 + add x1, x29, #0x50 +L_fe_invert7: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert7 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x20, #5 + add x1, x29, #48 +L_fe_invert8: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert8 + ldr x0, [x29, #144] + add x2, x29, #16 + bl fe_mul + ldr x20, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function +curve25519: + stp x29, x30, [sp, #-288]! 
+ add x29, sp, #0 + str x17, [x29, #200] + str x19, [x29, #208] + stp x20, x21, [x29, #216] + stp x22, x23, [x29, #232] + stp x24, x25, [x29, #248] + stp x26, x27, [x29, #264] + str x28, [x29, #280] + mov x23, xzr + str x0, [x29, #176] + str x2, [x29, #184] + # Copy + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Set one + mov x2, #1 + stp x2, xzr, [x0] + stp xzr, xzr, [x0, #16] + # Set zero + stp xzr, xzr, [x29, #16] + stp xzr, xzr, [x29, #32] + # Set one + mov x2, #1 + stp x2, xzr, [x29, #48] + stp xzr, xzr, [x29, #64] + mov x25, #62 + mov x24, #24 +L_curve25519_words: +L_curve25519_bits: + ldr x2, [x1, x24] + lsr x2, x2, x25 + and x2, x2, #1 + eor x23, x23, x2 + # Conditional Swap + cmp x23, #1 + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] + csel x14, x10, x6, eq + csel x10, x6, x10, eq + csel x15, x11, x7, eq + csel x11, x7, x11, eq + csel x16, x12, x8, eq + csel x12, x8, x12, eq + csel x17, x13, x9, eq + csel x13, x9, x13, eq + # Conditional Swap + cmp x23, #1 + ldp x19, x20, [x29, #16] + ldp x21, x22, [x29, #32] + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] + csel x5, x19, x6, eq + csel x19, x6, x19, eq + csel x26, x20, x7, eq + csel x20, x7, x20, eq + csel x27, x21, x8, eq + csel x21, x8, x21, eq + csel x28, x22, x9, eq + csel x22, x9, x22, eq + mov x23, x2 + # Add + adds x6, x10, x19 + adcs x7, x11, x20 + adcs x8, x12, x21 + adc x9, x13, x22 + mov x3, #-19 + asr x2, x9, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x6, x6, x3 + sbcs x7, x7, x2 + sbcs x8, x8, x2 + sbc x9, x9, x4 + # Sub + subs x19, x10, x19 + sbcs x20, x11, x20 + sbcs x21, x12, x21 + sbcs x22, x13, x22 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 + stp x19, x20, [x29, #144] + stp x21, x22, [x29, #160] + # Add + adds x10, x14, x5 + adcs x11, x15, x26 + adcs x12, x16, x27 + adc x13, x17, x28 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Sub + subs x14, x14, x5 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x2 + adcs x16, x16, x2 + adc x17, x17, x4 + # Multiply + # A[0] * B[0] + mul x19, x14, x6 + umulh x20, x14, x6 + # A[0] * B[1] + mul x3, x14, x7 + umulh x21, x14, x7 + adds x20, x20, x3 + adc x21, x21, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x21, x21, x3 + adc x22, x22, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x9 + umulh x4, x14, x9 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x22, x22, x3 + adcs x2, 
x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x22, #63 + and x22, x22, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x19, x19, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x20, x20, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x21, x21, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x22, x22, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x20, x20, x2 + adcs x21, x21, x26 + adcs x22, x22, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Reduce if top bit set + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Store + stp x19, x20, [x29, #112] + stp x21, x22, [x29, #128] + # Multiply + ldp x2, x26, [x29, #144] + ldp x27, x28, [x29, #160] + # A[0] * B[0] + mul x19, x10, x2 + umulh x20, x10, x2 + # A[0] * B[1] + mul x3, x10, x26 + umulh x21, x10, x26 + adds x20, x20, x3 + adc x21, x21, xzr + # A[1] * B[0] + mul x3, x11, x2 + umulh x4, x11, x2 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[0] * B[2] + mul x3, x10, x27 + umulh x4, x10, x27 + adds x21, x21, x3 + adc x22, x22, x4 + # A[1] * B[1] + mul x3, x11, x26 + umulh x4, x11, x26 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x14, xzr, xzr + # A[2] * B[0] + mul x3, x12, x2 + umulh x4, x12, x2 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x14, x14, xzr + # A[0] * B[3] + mul x3, x10, x28 + umulh x4, x10, x28 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * B[2] + mul x3, x11, x27 + umulh x4, x11, x27 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[2] * B[1] + mul x3, x12, x26 + umulh x4, x12, x26 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[3] * B[0] + mul x3, x13, x2 + umulh x4, x13, x2 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[1] * B[3] + mul x3, x11, x28 + umulh x4, x11, x28 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, xzr, xzr + # A[2] * B[2] + mul x3, x12, x27 + umulh x4, x12, x27 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[3] * B[1] + mul x3, x13, x26 + umulh x4, x13, x26 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[2] * B[3] + mul x3, x12, x28 + umulh x4, x12, x28 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr + # A[3] * B[2] + mul x3, x13, x27 + umulh x4, x13, x27 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[3] * B[3] + mul x3, x13, x28 
+ umulh x4, x13, x28 + adds x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr x15, x15, x14, #63 + extr x14, x14, x22, #63 + and x22, x22, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x19, x19, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x20, x20, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x21, x21, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x22, x22, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Reduce if top bit set + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Store + # Square + # A[0] * A[1] + mul x11, x2, x26 + umulh x12, x2, x26 + # A[0] * A[2] + mul x3, x2, x27 + umulh x13, x2, x27 + adds x12, x12, x3 + adc x13, x13, xzr + # A[0] * A[3] + mul x3, x2, x28 + umulh x14, x2, x28 + adds x13, x13, x3 + adc x14, x14, xzr + # A[1] * A[2] + mul x3, x26, x27 + umulh x4, x26, x27 + adds x13, x13, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * A[3] + mul x3, x26, x28 + umulh x4, x26, x28 + adds x14, x14, x3 + adc x15, x15, x4 + # A[2] * A[3] + mul x3, x27, x28 + umulh x16, x27, x28 + adds x15, x15, x3 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + mul x10, x2, x2 + umulh x5, x2, x2 + # A[1] * A[1] + mul x3, x26, x26 + umulh x4, x26, x26 + adds x11, x11, x5 + adcs x12, x12, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x27, x27 + umulh x4, x27, x27 + adds x13, x13, x5 + adcs x14, x14, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x28, x28 + umulh x4, x28, x28 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x10, x10, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x11, x11, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x12, x12, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x14 + adcs x12, x12, x15 + adcs x13, x13, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + and x5, x3, x13, asr 63 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + # Square + # A[0] * A[1] + mul x15, x6, x7 + umulh x16, x6, x7 + # A[0] * A[2] + mul x3, x6, x8 + umulh x17, x6, x8 + adds x16, x16, x3 + adc x17, x17, xzr + # A[0] * A[3] + mul x3, x6, x9 + umulh x2, x6, x9 + adds x17, x17, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x7, x8 + umulh x4, x7, x8 + adds x17, x17, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x7, x9 + umulh x4, x7, x9 + adds x2, x2, x3 + 
adc x26, x26, x4 + # A[2] * A[3] + mul x3, x8, x9 + umulh x27, x8, x9 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x14, x6, x6 + umulh x5, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + umulh x4, x7, x7 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x8, x8 + umulh x4, x8, x8 + adds x17, x17, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x9, x9 + umulh x4, x9, x9 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x17, #63 + and x17, x17, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x14, x14, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x15, x15, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x16, x16, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x17, x17, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x15, x15, x2 + adcs x16, x16, x26 + adcs x17, x17, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Reduce if top bit set + and x5, x3, x17, asr 63 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Store + # Multiply + # A[0] * B[0] + mul x6, x14, x10 + umulh x7, x14, x10 + # A[0] * B[1] + mul x3, x14, x11 + umulh x8, x14, x11 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x10 + umulh x4, x15, x10 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + umulh x4, x14, x12 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x11 + umulh x4, x15, x11 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x10 + umulh x4, x16, x10 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x13 + umulh x4, x14, x13 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x12 + umulh x4, x15, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x11 + umulh x4, x16, x11 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x10 + umulh x4, x17, x10 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x13 + umulh x4, x15, x13 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x12 + umulh x4, x16, x12 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x11 + umulh x4, x17, x11 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 
+ mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + # Sub + subs x14, x14, x10 + sbcs x15, x15, x11 + sbcs x16, x16, x12 + sbcs x17, x17, x13 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x2 + adcs x16, x16, x2 + adc x17, x17, x4 + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, x5 + umulh x4, x15, x5 + adds x7, x7, x3 + adc x8, xzr, x4 + mul x3, x16, x5 + umulh x4, x16, x5 + adds x8, x8, x3 + adc x9, xzr, x4 + mul x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, xzr, x4 + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x4 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x10, x10, x6 + adcs x11, x11, x7 + adcs x12, x12, x8 + adc x13, x13, x9 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Multiply + # A[0] * B[0] + mul x6, x14, x10 + umulh x7, x14, x10 + # A[0] * B[1] + mul x3, x14, x11 + umulh x8, x14, x11 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x10 + umulh x4, x15, x10 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + umulh x4, x14, x12 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x11 + umulh x4, x15, x11 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x10 + umulh x4, x16, x10 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x13 + umulh x4, x14, x13 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x12 + umulh x4, x15, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x11 + umulh x4, x16, x11 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x10 + umulh x4, x17, x10 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x13 + umulh x4, x15, x13 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x12 + umulh x4, x16, x12 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x11 + umulh x4, x17, x11 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into 
t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Add + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] + adds x10, x6, x19 + adcs x11, x7, x20 + adcs x12, x8, x21 + adc x13, x9, x22 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Sub + subs x19, x6, x19 + sbcs x20, x7, x20 + sbcs x21, x8, x21 + sbcs x22, x9, x22 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 + # Square + # A[0] * A[1] + mul x7, x10, x11 + umulh x8, x10, x11 + # A[0] * A[2] + mul x3, x10, x12 + umulh x9, x10, x12 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x10, x13 + umulh x2, x10, x13 + adds x9, x9, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x11, x12 + umulh x4, x11, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x11, x13 + umulh x4, x11, x13 + adds x2, x2, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x12, x13 + umulh x27, x12, x13 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x10, x10 + umulh x5, x10, x10 + # A[1] * A[1] + mul x3, x11, x11 + umulh x4, x11, x11 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x12, x12 + umulh x4, x12, x12 + adds x9, x9, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x13, x13 + umulh x4, x13, x13 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs 
x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Square + # A[0] * A[1] + mul x7, x19, x20 + umulh x8, x19, x20 + # A[0] * A[2] + mul x3, x19, x21 + umulh x9, x19, x21 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x19, x22 + umulh x2, x19, x22 + adds x9, x9, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x20, x21 + umulh x4, x20, x21 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x20, x22 + umulh x4, x20, x22 + adds x2, x2, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x21, x22 + umulh x27, x21, x22 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x19, x19 + umulh x5, x19, x19 + # A[1] * A[1] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x9, x9, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x22, x22 + umulh x4, x22, x22 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + ldr x2, [x29, #184] + # Multiply + ldp x14, x15, [x2] + ldp x16, x17, [x2, #16] + # A[0] * B[0] + mul x10, x14, x6 + umulh x11, x14, x6 + # A[0] * B[1] + mul x3, x14, x7 + umulh x12, x14, x7 + adds x11, x11, x3 + adc x12, x12, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x12, x12, x3 + adc x13, x13, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x9 + umulh x4, x14, x9 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x2, x2, x3 + adcs x26, x26, x4 + 
adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x10, x10, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x11, x11, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x12, x12, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x2 + adcs x12, x12, x26 + adcs x13, x13, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + and x5, x3, x13, asr 63 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + stp x10, x11, [x29, #48] + stp x12, x13, [x29, #64] + sub x25, x25, #1 + cmp x25, #0 + bge L_curve25519_bits + mov x25, #63 + sub x24, x24, #8 + cmp x24, #0 + bge L_curve25519_words + # Invert + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + add x1, x29, #0x50 + bl fe_sq + add x1, x29, #16 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + add x0, x29, #0x50 + add x1, x29, #0x50 + add x2, x29, #0x70 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x24, #4 + add x1, x29, #0x70 +L_curve25519_inv_1: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_1 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + add x1, x29, #0x50 + bl fe_sq + mov x24, #9 + add x1, x29, #0x70 +L_curve25519_inv_2: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_2 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x90 + bl fe_sq + mov x24, #19 + add x1, x29, #0x90 +L_curve25519_inv_3: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_3 + add x0, x29, #0x70 + add x2, x29, #0x70 + bl fe_mul + mov x24, #10 + add x1, x29, #0x70 +L_curve25519_inv_4: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_4 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + add x1, x29, #0x50 + bl fe_sq + mov x24, #49 + add x1, x29, #0x70 +L_curve25519_inv_5: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_5 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x90 + bl fe_sq + mov x24, #0x63 + add x1, x29, #0x90 +L_curve25519_inv_6: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_6 + add x0, x29, #0x70 + add x2, x29, #0x70 + bl fe_mul + mov x24, #50 + add x1, x29, #0x70 +L_curve25519_inv_7: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_7 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x24, #5 + add x1, x29, #0x50 +L_curve25519_inv_8: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_8 + add x0, x29, #16 + add x2, x29, #48 + bl fe_mul + ldr x0, [x29, #176] + # Multiply + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, 
[x29, #16] + ldp x12, x13, [x29, #32] + # A[0] * B[0] + mul x14, x6, x10 + umulh x15, x6, x10 + # A[0] * B[1] + mul x3, x6, x11 + umulh x16, x6, x11 + adds x15, x15, x3 + adc x16, x16, xzr + # A[1] * B[0] + mul x3, x7, x10 + umulh x4, x7, x10 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr + # A[0] * B[2] + mul x3, x6, x12 + umulh x4, x6, x12 + adds x16, x16, x3 + adc x17, x17, x4 + # A[1] * B[1] + mul x3, x7, x11 + umulh x4, x7, x11 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x19, xzr, xzr + # A[2] * B[0] + mul x3, x8, x10 + umulh x4, x8, x10 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x19, x19, xzr + # A[0] * B[3] + mul x3, x6, x13 + umulh x4, x6, x13 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, xzr, xzr + # A[1] * B[2] + mul x3, x7, x12 + umulh x4, x7, x12 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[2] * B[1] + mul x3, x8, x11 + umulh x4, x8, x11 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[3] * B[0] + mul x3, x9, x10 + umulh x4, x9, x10 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[3] + mul x3, x7, x13 + umulh x4, x7, x13 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, xzr, xzr + # A[2] * B[2] + mul x3, x8, x12 + umulh x4, x8, x12 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[3] * B[1] + mul x3, x9, x11 + umulh x4, x9, x11 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[3] + mul x3, x8, x13 + umulh x4, x8, x13 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[3] * B[2] + mul x3, x9, x12 + umulh x4, x9, x12 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[3] + mul x3, x9, x13 + umulh x4, x9, x13 + adds x21, x21, x3 + adc x22, x22, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + and x17, x17, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x19 + umulh x19, x3, x19 + adds x14, x14, x4 + mul x4, x3, x20 + umulh x20, x3, x20 + adcs x15, x15, x4 + mul x4, x3, x21 + umulh x21, x3, x21 + adcs x16, x16, x4 + mul x4, x3, x22 + umulh x5, x3, x22 + adcs x17, x17, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x15, x15, x19 + adcs x16, x16, x20 + adcs x17, x17, x21 + adc x5, x5, xzr + # Overflow + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Reduce if top bit set + and x5, x3, x17, asr 63 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Store + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + mov x0, xzr + ldr x17, [x29, #200] + ldr x19, [x29, #208] + ldp x20, x21, [x29, #216] + ldp x22, x23, [x29, #232] + ldp x24, x25, [x29, #248] + ldp x26, x27, [x29, #264] + ldr x28, [x29, #280] + ldp x29, x30, [sp], #0x120 + ret + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function +fe_pow22523: + stp x29, x30, [sp, #-144]! 
+ add x29, sp, #0 + str x21, [x29, #136] + # pow22523 + str x0, [x29, #112] + str x1, [x29, #120] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #120] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + bl fe_sq + add x1, x29, #48 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #4 + add x1, x29, #48 +L_fe_pow22523_1: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_1 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #9 + add x1, x29, #48 +L_fe_pow22523_2: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_2 + add x2, x29, #16 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x21, #19 + add x1, x29, #0x50 +L_fe_pow22523_3: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_3 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #10 + add x1, x29, #48 +L_fe_pow22523_4: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_4 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #49 + add x1, x29, #48 +L_fe_pow22523_5: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_5 + add x2, x29, #16 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x21, #0x63 + add x1, x29, #0x50 +L_fe_pow22523_6: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_6 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #50 + add x1, x29, #48 +L_fe_pow22523_7: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_7 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + mov x21, #2 + add x1, x29, #16 +L_fe_pow22523_8: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_8 + ldr x0, [x29, #112] + ldr x2, [x29, #120] + bl fe_mul + ldr x21, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function +fe_ge_to_p2: + stp x29, x30, [sp, #-112]! 
+ add x29, sp, #0 + str x17, [x29, #72] + str x19, [x29, #80] + stp x20, x21, [x29, #88] + str x22, [x29, #104] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + ldr x1, [x29, #32] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x20, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x14, x15 + umulh x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[1] * B[3] + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x1, [x29, #40] + ldr x2, [x29, #48] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x20, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, 
x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x14, x15 + umulh x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[1] * B[3] + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] + # A[0] * B[0] + mul x3, x15, x11 + umulh x4, x15, x11 + # A[0] * B[1] + mul x20, x15, x12 + umulh x5, x15, x12 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x16, x11 + umulh x21, x16, x11 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, x15, x13 + umulh x21, x15, x13 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x16, x12 + umulh x21, x16, x12 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x17, x11 + umulh x21, x17, x11 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x15, x14 + umulh x21, x15, x14 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x16, x13 + umulh x21, x16, x13 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x17, x12 + umulh x21, x17, x12 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x19, x11 + umulh x21, x19, x11 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, 
x8, xzr + # A[1] * B[3] + mul x20, x16, x14 + umulh x21, x16, x14 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x17, x13 + umulh x21, x17, x13 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x19, x12 + umulh x21, x19, x12 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x17, x14 + umulh x21, x17, x14 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x19, x13 + umulh x21, x19, x13 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x19, x14 + umulh x21, x19, x14 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #72] + ldr x19, [x29, #80] + ldp x20, x21, [x29, #88] + ldr x22, [x29, #104] + ldp x29, x30, [sp], #0x70 + ret + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function +fe_ge_to_p3: + stp x29, x30, [sp, #-160]! 
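The routine just completed (fe_ge_to_p2) is three back-to-back field multiplications, and every "# Multiply" block in the rest of this file has the same shape: a 4x4 schoolbook product of 64-bit limbs built from mul/umulh pairs chained with adds/adcs/adc, followed by a reduction modulo 2^255 - 19. Below is a minimal C sketch of the product half only, assuming nothing beyond standard C; the names mul_4x4 and u128 are illustrative, not taken from wolfSSL.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Illustrative 4x4 limb multiply: each a[i]*b[j] is a 128-bit partial
 * product whose low half lands in limb i+j (the "mul" results) and
 * whose high half carries into limb i+j+1 (the "umulh" results). */
static void mul_4x4(uint64_t r[8], const uint64_t a[4], const uint64_t b[4])
{
    for (int k = 0; k < 8; k++)
        r[k] = 0;
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 4; j++) {
            u128 acc = (u128)a[i] * b[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)acc;         /* mul result + adds/adcs */
            carry    = (uint64_t)(acc >> 64); /* umulh result + carry   */
        }
        r[i + 4] = carry;
    }
}

The generated assembly visits the same partial products column by column (A[0]*B[1], A[1]*B[0], A[0]*B[2], ...) rather than row by row, so the running carries can stay in the NZCV flags instead of an extra register.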
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + str x26, [x29, #152] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + str x7, [x29, #64] + ldr x1, [x29, #40] + ldr x2, [x29, #64] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x24, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x11, x17 + umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #32] + ldr x2, [x29, #48] + # Multiply + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + # A[0] * B[0] + mul x3, x11, x20 + umulh x4, x11, x20 + # A[0] * B[1] + mul x24, x11, x21 + umulh x5, x11, x21 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x20 + umulh x25, x12, x20 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + 
mul x24, x11, x22 + umulh x25, x11, x22 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x21 + umulh x25, x12, x21 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x20 + umulh x25, x13, x20 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x23 + umulh x25, x11, x23 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x22 + umulh x25, x12, x22 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x21 + umulh x25, x13, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x20 + umulh x25, x14, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x23 + umulh x25, x12, x23 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x22 + umulh x25, x13, x22 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x21 + umulh x25, x14, x21 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x23 + umulh x25, x13, x23 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x22 + umulh x25, x14, x22 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x23 + umulh x25, x14, x23 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] + # A[0] * B[0] + mul x3, x20, x11 + umulh x4, x20, x11 + # A[0] * B[1] + mul x24, x20, x12 + umulh x5, x20, x12 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x21, x11 + umulh x25, x21, x11 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x20, x13 + umulh x25, x20, x13 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x21, x12 + umulh x25, x21, x12 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x22, x11 + umulh x25, x22, x11 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x20, x14 + umulh x25, x20, x14 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x21, x13 + umulh x25, x21, x13 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x22, x12 + umulh x25, x22, x12 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x23, x11 + umulh x25, x23, x11 + adds x6, x6, x24 + adcs x7, x7, x25 + 
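Each "# Reduce" block that follows one of these products relies on the shape of the prime. Splitting the 512-bit product as x = l + 2^255*h with 0 <= l < 2^255 (the extr/and steps that "move the top half" and "remove the top bit"), and with p = 2^255 - 19:

$$2^{255} \equiv 19 \pmod{p} \quad\Longrightarrow\quad x = \ell + 2^{255}h \equiv \ell + 19\,h \pmod{p}.$$

"Multiply top half by 19" applies this identity once and shrinks the high part from roughly 257 bits down to a handful of bits; the "Overflow" and "Reduce if top bit set" steps apply it twice more, leaving a four-limb value with bit 255 clear. The result may still be a non-canonical representative of its residue class, which is fine for intermediate arithmetic.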
adc x8, x8, xzr + # A[1] * B[3] + mul x24, x21, x14 + umulh x25, x21, x14 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x22, x13 + umulh x25, x22, x13 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x23, x12 + umulh x25, x23, x12 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x22, x14 + umulh x25, x22, x14 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x23, x13 + umulh x25, x23, x13 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x23, x14 + umulh x25, x23, x14 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + # Multiply + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x24, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x11, x17 + umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, 
x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldr x26, [x29, #152] + ldp x29, x30, [sp], #0xa0 + ret + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function +fe_ge_dbl: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + ldr x1, [x29, #48] + # Square + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + # A[0] * A[1] + mul x5, x12, x13 + umulh x6, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x25 + adc x7, x7, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x25 + adc x8, x8, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x8, x8, x25 + adc x9, x9, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x25 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x12, x12 + umulh x27, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x5, x5, x27 + adcs x6, x6, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x7, x7, x27 + adcs x8, x8, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x9, x9, x27 + adcs x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, 
#0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #32] + ldr x1, [x29, #56] + # Square + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * A[1] + mul x9, x21, x22 + umulh x10, x21, x22 + # A[0] * A[2] + mul x25, x21, x23 + umulh x11, x21, x23 + adds x10, x10, x25 + adc x11, x11, xzr + # A[0] * A[3] + mul x25, x21, x24 + umulh x16, x21, x24 + adds x11, x11, x25 + adc x16, x16, xzr + # A[1] * A[2] + mul x25, x22, x23 + umulh x26, x22, x23 + adds x11, x11, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * A[3] + mul x25, x22, x24 + umulh x26, x22, x24 + adds x16, x16, x25 + adc x17, x17, x26 + # A[2] * A[3] + mul x25, x23, x24 + umulh x19, x23, x24 + adds x17, x17, x25 + adc x19, x19, xzr + # Double + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x19, x19, x19 + adc x20, xzr, xzr + # A[0] * A[0] + mul x8, x21, x21 + umulh x27, x21, x21 + # A[1] * A[1] + mul x25, x22, x22 + umulh x26, x22, x22 + adds x9, x9, x27 + adcs x10, x10, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x23, x23 + umulh x26, x23, x23 + adds x11, x11, x27 + adcs x16, x16, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x24, x24 + umulh x26, x24, x24 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x8, x8, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x9, x9, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x10, x10, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x16 + adcs x10, x10, x17 + adcs x11, x11, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ldr x0, [x29, #24] + # Add + adds x12, x12, x21 + adcs x13, x13, x22 + adcs x14, x14, x23 + adc x15, x15, x24 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + ldr x0, [x29, #40] + # Square + # A[0] * A[1] + mul x17, x12, x13 + umulh x19, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x20, x12, x14 + adds x19, x19, x25 + adc x20, x20, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x21, x12, x15 + adds x20, x20, x25 + adc x21, x21, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x20, x20, x25 + adcs x21, x21, x26 + adc x22, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x21, x21, x25 + adc x22, x22, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x23, x14, x15 + adds x22, x22, x25 + adc x23, x23, xzr + # Double + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x24, xzr, xzr + # A[0] * A[0] + mul x16, 
x12, x12 + umulh x27, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x20, x20, x27 + adcs x21, x21, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x22, x22, x27 + adcs x23, x23, x25 + adc x24, x24, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x24, x24, x23, #63 + extr x23, x23, x22, #63 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + and x20, x20, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x21 + umulh x21, x25, x21 + adds x16, x16, x26 + mul x26, x25, x22 + umulh x22, x25, x22 + adcs x17, x17, x26 + mul x26, x25, x23 + umulh x23, x25, x23 + adcs x19, x19, x26 + mul x26, x25, x24 + umulh x27, x25, x24 + adcs x20, x20, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x17, x17, x21 + adcs x19, x19, x22 + adcs x20, x20, x23 + adc x27, x27, xzr + # Overflow + extr x27, x27, x20, #63 + mul x27, x27, x25 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Reduce if top bit set + and x27, x25, x20, asr 63 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Store + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #32] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x21, x8, x4 + sbcs x22, x9, x5 + sbcs x23, x10, x6 + sbcs x24, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x21, x21, x25 + adcs x22, x22, x28 + adcs x23, x23, x28 + adc x24, x24, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x21, x22, [x1] + stp x23, x24, [x1, #16] + ldr x0, [x29, #16] + # Sub + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbcs x20, x20, x15 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #64] + # Square * 2 + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + # A[0] * A[1] + mul x5, x12, x13 + umulh x6, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x25 + adc x7, x7, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x25 + adc x8, x8, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x8, x8, x25 + adc x9, x9, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x25 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x12, x12 + umulh x28, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x5, x5, x28 + adcs x6, x6, x25 + adc x28, x26, xzr + # 
A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x7, x7, x28 + adcs x8, x8, x25 + adc x28, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x9, x9, x28 + adcs x10, x10, x25 + adc x11, x11, x26 + # Double and Reduce + mov x25, #0x169 + # Move top half into t4-t7 and remove top bit from t3 + lsr x28, x11, #61 + extr x11, x11, x10, #62 + extr x10, x10, x9, #62 + extr x9, x9, x8, #62 + extr x8, x8, x7, #62 + extr x7, x7, x6, #63 + extr x6, x6, x5, #63 + extr x5, x5, x4, #63 + lsl x4, x4, #1 + and x7, x7, #0x7fffffffffffffff + # Two left, only one right + and x11, x11, #0x7fffffffffffffff + # Multiply top bits by 19*19 + mul x28, x28, x25 + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x4, x4, x28 + adcs x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #40] + # Sub + subs x4, x4, x21 + sbcs x5, x5, x22 + sbcs x6, x6, x23 + sbcs x7, x7, x24 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x25 + adcs x5, x5, x28 + adcs x6, x6, x28 + adc x7, x7, x26 + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function +fe_ge_madd: + stp x29, x30, [sp, #-176]! 
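The "# Square" blocks inside fe_ge_dbl use the usual squaring shortcut: only the a[i]*a[j] cross products with i < j are computed, that partial sum is doubled (the "# Double" step), and the diagonal a[i]*a[i] terms are added last, so 10 rather than 16 mul/umulh pairs are needed. The C model below records only that order of operations and is not the generated code; sqr_4 and u128 are illustrative names.

#include <stdint.h>

typedef unsigned __int128 u128;

static void sqr_4(uint64_t r[8], const uint64_t a[4])
{
    for (int k = 0; k < 8; k++)
        r[k] = 0;
    /* cross products a[i]*a[j], i < j */
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < 4; j++) {
            u128 acc = (u128)a[i] * a[j] + r[i + j] + carry;
            r[i + j] = (uint64_t)acc;
            carry    = (uint64_t)(acc >> 64);
        }
        r[i + 4] = carry;
    }
    /* double the cross-product sum (the "# Double" step) */
    uint64_t carry = 0;
    for (int k = 0; k < 8; k++) {
        u128 acc = ((u128)r[k] << 1) + carry;
        r[k]  = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
    /* add the diagonal terms a[i]*a[i] */
    carry = 0;
    for (int i = 0; i < 4; i++) {
        u128 acc = (u128)a[i] * a[i] + r[2 * i] + carry;
        r[2 * i] = (uint64_t)acc;
        acc = (u128)r[2 * i + 1] + (uint64_t)(acc >> 64);
        r[2 * i + 1] = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
}

The "Square * 2" variant at the end of fe_ge_dbl also explains the constant 0x169: the stray top bits of the doubled square carry weight at least 2^510 = (2^255)^2, which reduces to 19^2 = 361 = 0x169 modulo p, hence the "Multiply top bits by 19*19" step.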
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #184] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, 
x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #192] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 
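The "Mask the modulus" sequences after every field add, sub and double are a branch-free conditional correction: the top limb's sign (asr #63) or the borrow flag (csetm ..., cc) expands into an all-zero or all-ones mask, and p = 2^255 - 19 is subtracted or added under that mask, so the same instruction stream runs in both cases. A C sketch of the add direction under those assumptions follows; fe_add_like is an illustrative name, not a wolfSSL function.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Branch-free "add, then subtract p if bit 255 came on". */
static void fe_add_like(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    u128 t = 0;
    for (int i = 0; i < 4; i++) {              /* adds / adcs / adc */
        t += (u128)a[i] + b[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }
    uint64_t mask = (uint64_t)0 - (r[3] >> 63); /* all-ones iff bit 255 set */
    const uint64_t sub[4] = {                   /* p, limb by limb, masked  */
        mask & (uint64_t)-19,                   /* mov #-19 ; and           */
        mask, mask,
        mask & 0x7fffffffffffffffULL
    };
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {              /* subs / sbcs / sbc */
        u128 d = (u128)r[i] - sub[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 127);          /* 1 if the subtraction wrapped */
    }
}

The subtract direction is the mirror image: csetm turns the final borrow into the mask and the masked p is added back, as in the "Add modulus (if underflow)" blocks.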
+ sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] + # A[0] * B[0] + mul x4, x16, x21 + umulh x5, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #64] + # Double + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and 
x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + ldr x1, [x29, #40] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, %function +fe_ge_msub: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #192] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, 
x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #184] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and 
remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] + # A[0] * B[0] + mul x4, x16, x21 + umulh x5, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + 
adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #64] + # Double + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + ldr x1, [x29, #40] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function +fe_ge_add: + stp x29, x30, [sp, #-176]! 
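Stripped of the inlined limb arithmetic, fe_ge_madd and fe_ge_msub above follow the extended-coordinate mixed add/sub skeleton that their comments spell out (Add, Sub, Multiply, Multiply, Add/Sub, Multiply, Double, Add/Sub). The sketch below records only that skeleton; the fe type, the helper names and the binding of arguments to precomputed values are assumptions for illustration, not declarations taken from this file.

#include <stdint.h>

typedef uint64_t fe[4];

/* Field helpers assumed by the sketch (prototypes only). */
void fe_add_(fe r, const fe a, const fe b);
void fe_sub_(fe r, const fe a, const fe b);
void fe_mul_(fe r, const fe a, const fe b);

/* Skeleton of a mixed addition matching the comment order above. */
void ge_madd_sketch(fe rx, fe ry, fe rz, fe rt,
                    const fe px, const fe py, const fe pz, const fe pt,
                    const fe qypx, const fe qymx, const fe qxy2d)
{
    fe s, d, a, b, c, z2;
    fe_add_(s, py, px);       /* "# Add"                          */
    fe_sub_(d, py, px);       /* "# Sub"                          */
    fe_mul_(a, s, qypx);      /* "# Multiply"                     */
    fe_mul_(b, d, qymx);      /* "# Multiply"                     */
    fe_add_(ry, a, b);        /* "# Add" half of the add/sub pair */
    fe_sub_(rx, a, b);        /* "# Sub" half                     */
    fe_mul_(c, pt, qxy2d);    /* "# Multiply"                     */
    fe_add_(z2, pz, pz);      /* "# Double"                       */
    fe_add_(rz, z2, c);       /* final "# Add"                    */
    fe_sub_(rt, z2, c);       /* final "# Sub"                    */
}

fe_ge_msub differs only in which precomputed multiplicands feed the two products and in how the final add/sub pair is stored.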
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #192] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, 
x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #200] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 
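fe_ge_add and fe_ge_sub take more pointer arguments than fit in x0-x7, which is why, after spilling the eight register arguments to [x29, #16]..[x29, #72], the code also reads [x29, #176]..[x29, #200]: the frame is 0xb0 = 176 bytes and x29 equals the entry sp, so offsets past the frame land in the caller's outgoing-argument area, where AAPCS64 passes arguments nine and up. A hedged prototype to make the slot arithmetic concrete; the parameter names are guesses for illustration only.

#include <stdint.h>

/* Twelve pointer arguments: the first eight arrive in x0..x7, the last
 * four on the caller's stack.  With x29 == sp and a 0xb0-byte frame,
 * those stack slots are [x29,#176], [x29,#184], [x29,#192], [x29,#200]. */
void fe_ge_add(uint64_t *rx, uint64_t *ry, uint64_t *rz, uint64_t *rt,
               uint64_t *px, uint64_t *py, uint64_t *pz, uint64_t *pt,
               uint64_t *arg9, uint64_t *arg10, uint64_t *arg11, uint64_t *arg12);

The same convention accounts for the [x29, #176]..[x29, #192] loads in fe_ge_madd and fe_ge_msub, which take eleven pointers (eight in registers, three on the stack).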
+ sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x25, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #48] + # Double + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, 
x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x8, x16, x21 + umulh x9, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 + adc x10, x10, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, x12, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, x15, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, x15, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #40] + # Add + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, 
#0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function +fe_ge_sub: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #200] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, 
x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #192] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, 
#0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x25, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, 
x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #48] + # Double + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x8, x16, x21 + umulh x9, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 + adc x10, x10, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, x12, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, x15, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, x15, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + 
adc x11, x11, xzr + # Store + ldr x0, [x29, #40] + ldr x1, [x29, #32] + # Add + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_sub,.-fe_ge_sub +#endif /* __aarch64__ */
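Editor's note (illustrative only, not part of the generated .S file above): every "Sub" / "Add modulus (if underflow)" and "Add" / "Sub modulus (if overflow)" block in fe_ge_add and fe_ge_sub is a branch-free conditional correction by p = 2^255 - 19. On borrow, csetm produces an all-ones mask, and (mask & -19, mask, mask, mask & 0x7fffffffffffffff) is exactly p in 64-bit limbs, so the following adds/adcs chain adds either p or 0. A minimal C sketch of that pattern follows; the function name fe_sub_sketch is hypothetical and unsigned __int128 is assumed to be available (GCC/Clang extension).

#include <stdint.h>

/* Sketch: a - b over four little-endian 64-bit limbs, then add p = 2^255 - 19
 * back in if the subtraction borrowed, with no branches. */
static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {
        unsigned __int128 d = (unsigned __int128)a[i] - b[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;        /* 1 if this limb wrapped */
    }
    uint64_t mask = 0 - borrow;                  /* all-ones on underflow, else 0 */
    uint64_t p[4] = { mask & 0xffffffffffffffedULL, mask, mask,
                      mask & 0x7fffffffffffffffULL };
    unsigned __int128 c = 0;
    for (int i = 0; i < 4; i++) {                /* conditional "+ p" */
        c += (unsigned __int128)r[i] + p[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
}

The "Add" blocks are the mirror image: the arithmetic shift of the top limb (asr ..., #63) supplies the mask, and p is conditionally subtracted when the sum spills into bit 255.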
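Editor's note (also illustrative only): each "Multiply" block above is a 4x4 schoolbook multiply of 64-bit limbs into an eight-limb product, and the "Reduce" steps fold the high half back in using 2^255 ≡ 19 (mod p): split at bit 255 (the extr #63 sequence), add 19 times the top half, fold the spill-over once more, then clear a possible stray top bit. A C sketch of that shape follows; fe_mul_sketch is a hypothetical name, the partial products are accumulated in a different order than the assembly, and unsigned __int128 (GCC/Clang) is again assumed.

#include <stdint.h>

static void fe_mul_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t l[8] = { 0 };

    /* Schoolbook multiply: the same A[i] * B[j] products as the mul/umulh pairs. */
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 4; j++) {
            unsigned __int128 v = (unsigned __int128)a[i] * b[j] + l[i + j] + carry;
            l[i + j] = (uint64_t)v;
            carry = (uint64_t)(v >> 64);
        }
        l[i + 4] = carry;
    }

    /* Split at bit 255: low = product mod 2^255, t4..t7 = product >> 255. */
    uint64_t t4 = (l[4] << 1) | (l[3] >> 63);
    uint64_t t5 = (l[5] << 1) | (l[4] >> 63);
    uint64_t t6 = (l[6] << 1) | (l[5] >> 63);
    uint64_t t7 = (l[7] << 1) | (l[6] >> 63);
    l[3] &= 0x7fffffffffffffffULL;

    /* "Multiply top half by 19" and add it back in. */
    unsigned __int128 c;
    c  = (unsigned __int128)19 * t4 + l[0]; r[0] = (uint64_t)c; c >>= 64;
    c += (unsigned __int128)19 * t5 + l[1]; r[1] = (uint64_t)c; c >>= 64;
    c += (unsigned __int128)19 * t6 + l[2]; r[2] = (uint64_t)c; c >>= 64;
    c += (unsigned __int128)19 * t7 + l[3]; r[3] = (uint64_t)c; c >>= 64;

    /* "Overflow": anything past bit 255 is again worth 19 per unit. */
    uint64_t o = ((uint64_t)c << 1) | (r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    c  = (unsigned __int128)19 * o + r[0]; r[0] = (uint64_t)c; c >>= 64;
    c += r[1]; r[1] = (uint64_t)c; c >>= 64;
    c += r[2]; r[2] = (uint64_t)c; c >>= 64;
    r[3] += (uint64_t)c;

    /* "Reduce if top bit set": one last conditional fold leaves r < 2^255. */
    uint64_t fold = 19 & (0 - (r[3] >> 63));
    r[3] &= 0x7fffffffffffffffULL;
    c  = (unsigned __int128)r[0] + fold; r[0] = (uint64_t)c; c >>= 64;
    c += r[1]; r[1] = (uint64_t)c; c >>= 64;
    c += r[2]; r[2] = (uint64_t)c; c >>= 64;
    r[3] += (uint64_t)c;
}

The register names differ between the multiply blocks in fe_ge_add and fe_ge_sub, but each one follows this shape; the result is only weakly reduced (below 2^255), and full canonical reduction happens in fe_tobytes.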