Diffstat (limited to 'client/wolfssl/wolfcrypt/src/fe_x25519_asm.S')
-rw-r--r--  client/wolfssl/wolfcrypt/src/fe_x25519_asm.S  16542
1 files changed, 0 insertions, 16542 deletions
diff --git a/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S b/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S
deleted file mode 100644
index 6d0f638..0000000
--- a/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S
+++ /dev/null
@@ -1,16542 +0,0 @@
-/* fe_x25519_asm
- *
- * Copyright (C) 2006-2020 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifndef __APPLE__
-.text
-.globl fe_init
-.type fe_init,@function
-.align 4
-fe_init:
-#else
-.section __TEXT,__text
-.globl _fe_init
-.p2align 2
-_fe_init:
-#endif /* __APPLE__ */
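- # Runtime dispatch: when the CPU flags report the required extensions,
- # repoint the fe_*_p/curve25519_p function pointers (x64 implementations
- # by default) at the *_avx2 implementations.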
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
- movq cpuFlagsSet@GOTPCREL(%rip), %rax
- movl (%rax), %eax
-#else
- movl _cpuFlagsSet(%rip), %eax
-#endif /* __APPLE__ */
- testl %eax, %eax
- je L_fe_init_get_flags
- repz retq
-L_fe_init_get_flags:
-#ifndef __APPLE__
- callq cpuid_get_flags@plt
-#else
- callq _cpuid_get_flags
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq intelFlags@GOTPCREL(%rip), %rdx
- movl %eax, (%rdx)
-#else
- movl %eax, _intelFlags(%rip)
-#endif /* __APPLE__ */
- andl $0x50, %eax
- cmpl $0x50, %eax
- jne L_fe_init_flags_done
-#ifndef __APPLE__
- movq fe_mul_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_mul_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_sq_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_mul121666_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul121666_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq2_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_sq2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq2_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_invert_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_invert_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_invert_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq curve25519_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _curve25519_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq curve25519_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_pow22523_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_pow22523_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_to_p2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_to_p3_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_to_p3_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_dbl_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_dbl_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_madd_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_madd_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_msub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_msub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_add_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_add_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_add_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_sub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_sub_p(%rip)
-#endif /* __APPLE__ */
-L_fe_init_flags_done:
-#ifndef __APPLE__
- movq cpuFlagsSet@GOTPCREL(%rip), %rdx
- movl $0x1, (%rdx)
-#else
- movl $0x1, _cpuFlagsSet(%rip)
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
- repz retq
-#ifndef __APPLE__
-.size fe_init,.-fe_init
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_frombytes
-.type fe_frombytes,@function
-.align 4
-fe_frombytes:
-#else
-.section __TEXT,__text
-.globl _fe_frombytes
-.p2align 2
-_fe_frombytes:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r9
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- andq %r9, %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_frombytes,.-fe_frombytes
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_tobytes
-.type fe_tobytes,@function
-.align 4
-fe_tobytes:
-#else
-.section __TEXT,__text
-.globl _fe_tobytes
-.p2align 2
-_fe_tobytes:
-#endif /* __APPLE__ */
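- # Reduce to the canonical value in [0, p-1], p = 2^255 - 19:
- # adding 19 makes bit 255 show whether a >= p; fold that bit back
- # in as +19 and clear bit 255.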
- movq $0x7fffffffffffffff, %r10
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- addq $19, %rdx
- adcq $0x00, %rax
- adcq $0x00, %rcx
- adcq $0x00, %r8
- shrq $63, %r8
- imulq $19, %r8, %r9
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- addq %r9, %rdx
- adcq $0x00, %rax
- adcq $0x00, %rcx
- adcq $0x00, %r8
- andq %r10, %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_tobytes,.-fe_tobytes
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_1
-.type fe_1,@function
-.align 4
-fe_1:
-#else
-.section __TEXT,__text
-.globl _fe_1
-.p2align 2
-_fe_1:
-#endif /* __APPLE__ */
- # Set one
- movq $0x01, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_1,.-fe_1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_0
-.type fe_0,@function
-.align 4
-fe_0:
-#else
-.section __TEXT,__text
-.globl _fe_0
-.p2align 2
-_fe_0:
-#endif /* __APPLE__ */
- # Set zero
- movq $0x00, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_0,.-fe_0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_copy
-.type fe_copy,@function
-.align 4
-fe_copy:
-#else
-.section __TEXT,__text
-.globl _fe_copy
-.p2align 2
-_fe_copy:
-#endif /* __APPLE__ */
- # Copy
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_copy,.-fe_copy
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sub
-.type fe_sub,@function
-.align 4
-fe_sub:
-#else
-.section __TEXT,__text
-.globl _fe_sub
-.p2align 2
-_fe_sub:
-#endif /* __APPLE__ */
- pushq %r12
- # Sub
- movq (%rsi), %rax
- movq 8(%rsi), %rcx
- movq 16(%rsi), %r8
- movq 24(%rsi), %r9
- subq (%rdx), %rax
- movq $0x00, %r10
- sbbq 8(%rdx), %rcx
- movq $-19, %r11
- sbbq 16(%rdx), %r8
- movq $0x7fffffffffffffff, %r12
- sbbq 24(%rdx), %r9
- sbbq $0x00, %r10
- # Mask the modulus
- andq %r10, %r11
- andq %r10, %r12
- # Add modulus (if underflow)
- addq %r11, %rax
- adcq %r10, %rcx
- adcq %r10, %r8
- adcq %r12, %r9
- movq %rax, (%rdi)
- movq %rcx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_sub,.-fe_sub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_add
-.type fe_add,@function
-.align 4
-fe_add:
-#else
-.section __TEXT,__text
-.globl _fe_add
-.p2align 2
-_fe_add:
-#endif /* __APPLE__ */
- pushq %r12
- # Add
- movq (%rsi), %rax
- movq 8(%rsi), %rcx
- addq (%rdx), %rax
- movq 16(%rsi), %r8
- adcq 8(%rdx), %rcx
- movq 24(%rsi), %r10
- adcq 16(%rdx), %r8
- movq $-19, %r11
- adcq 24(%rdx), %r10
- movq $0x7fffffffffffffff, %r12
- movq %r10, %r9
- sarq $63, %r10
- # Mask the modulus
- andq %r10, %r11
- andq %r10, %r12
- # Sub modulus (if overflow)
- subq %r11, %rax
- sbbq %r10, %rcx
- sbbq %r10, %r8
- sbbq %r12, %r9
- movq %rax, (%rdi)
- movq %rcx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_add,.-fe_add
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_neg
-.type fe_neg,@function
-.align 4
-fe_neg:
-#else
-.section __TEXT,__text
-.globl _fe_neg
-.p2align 2
-_fe_neg:
-#endif /* __APPLE__ */
- movq $-19, %rdx
- movq $-1, %rax
- movq $-1, %rcx
- movq $0x7fffffffffffffff, %r8
- subq (%rsi), %rdx
- sbbq 8(%rsi), %rax
- sbbq 16(%rsi), %rcx
- sbbq 24(%rsi), %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_neg,.-fe_neg
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_cmov
-.type fe_cmov,@function
-.align 4
-fe_cmov:
-#else
-.section __TEXT,__text
-.globl _fe_cmov
-.p2align 2
-_fe_cmov:
-#endif /* __APPLE__ */
- cmpl $0x01, %edx
- movq (%rdi), %rcx
- movq 8(%rdi), %r8
- movq 16(%rdi), %r9
- movq 24(%rdi), %r10
- cmoveq (%rsi), %rcx
- cmoveq 8(%rsi), %r8
- cmoveq 16(%rsi), %r9
- cmoveq 24(%rsi), %r10
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_cmov,.-fe_cmov
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_isnonzero
-.type fe_isnonzero,@function
-.align 4
-fe_isnonzero:
-#else
-.section __TEXT,__text
-.globl _fe_isnonzero
-.p2align 2
-_fe_isnonzero:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r10
- movq (%rdi), %rax
- movq 8(%rdi), %rdx
- movq 16(%rdi), %rcx
- movq 24(%rdi), %r8
- addq $19, %rax
- adcq $0x00, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- shrq $63, %r8
- imulq $19, %r8, %r9
- movq (%rdi), %rax
- movq 8(%rdi), %rdx
- movq 16(%rdi), %rcx
- movq 24(%rdi), %r8
- addq %r9, %rax
- adcq $0x00, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- andq %r10, %r8
- orq %rdx, %rax
- orq %rcx, %rax
- orq %r8, %rax
- repz retq
-#ifndef __APPLE__
-.size fe_isnonzero,.-fe_isnonzero
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_isnegative
-.type fe_isnegative,@function
-.align 4
-fe_isnegative:
-#else
-.section __TEXT,__text
-.globl _fe_isnegative
-.p2align 2
-_fe_isnegative:
-#endif /* __APPLE__ */
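- # Returns bit 0 of the canonical (fully reduced) representative of a.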
- movq $0x7fffffffffffffff, %r11
- movq (%rdi), %rdx
- movq 8(%rdi), %rcx
- movq 16(%rdi), %r8
- movq 24(%rdi), %r9
- movq %rdx, %rax
- addq $19, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- shrq $63, %r9
- imulq $19, %r9, %r10
- addq %r10, %rax
- andq $0x01, %rax
- repz retq
-#ifndef __APPLE__
-.size fe_isnegative,.-fe_isnegative
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_cmov_table
-.type fe_cmov_table,@function
-.align 4
-fe_cmov_table:
-#else
-.section __TEXT,__text
-.globl _fe_cmov_table
-.p2align 2
-_fe_cmov_table:
-#endif /* __APPLE__ */
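- # Constant-time selection of precomputed table entry |b|, b in [-8,8],
- # using cmov only; for negative b the first two field elements are
- # swapped and the third is negated.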
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rdx, %rcx
- movsbq %cl, %rax
- cdq
- xorb %dl, %al
- subb %dl, %al
- movb %al, %r15b
- movq $0x01, %rax
- xorq %rdx, %rdx
- xorq %r8, %r8
- xorq %r9, %r9
- movq $0x01, %r10
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- cmpb $0x01, %r15b
- movq (%rsi), %r14
- cmoveq %r14, %rax
- movq 8(%rsi), %r14
- cmoveq %r14, %rdx
- movq 16(%rsi), %r14
- cmoveq %r14, %r8
- movq 24(%rsi), %r14
- cmoveq %r14, %r9
- movq 32(%rsi), %r14
- cmoveq %r14, %r10
- movq 40(%rsi), %r14
- cmoveq %r14, %r11
- movq 48(%rsi), %r14
- cmoveq %r14, %r12
- movq 56(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $2, %r15b
- movq 96(%rsi), %r14
- cmoveq %r14, %rax
- movq 104(%rsi), %r14
- cmoveq %r14, %rdx
- movq 112(%rsi), %r14
- cmoveq %r14, %r8
- movq 120(%rsi), %r14
- cmoveq %r14, %r9
- movq 128(%rsi), %r14
- cmoveq %r14, %r10
- movq 136(%rsi), %r14
- cmoveq %r14, %r11
- movq 144(%rsi), %r14
- cmoveq %r14, %r12
- movq 152(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $3, %r15b
- movq 192(%rsi), %r14
- cmoveq %r14, %rax
- movq 200(%rsi), %r14
- cmoveq %r14, %rdx
- movq 208(%rsi), %r14
- cmoveq %r14, %r8
- movq 216(%rsi), %r14
- cmoveq %r14, %r9
- movq 224(%rsi), %r14
- cmoveq %r14, %r10
- movq 232(%rsi), %r14
- cmoveq %r14, %r11
- movq 240(%rsi), %r14
- cmoveq %r14, %r12
- movq 248(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $4, %r15b
- movq 288(%rsi), %r14
- cmoveq %r14, %rax
- movq 296(%rsi), %r14
- cmoveq %r14, %rdx
- movq 304(%rsi), %r14
- cmoveq %r14, %r8
- movq 312(%rsi), %r14
- cmoveq %r14, %r9
- movq 320(%rsi), %r14
- cmoveq %r14, %r10
- movq 328(%rsi), %r14
- cmoveq %r14, %r11
- movq 336(%rsi), %r14
- cmoveq %r14, %r12
- movq 344(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $5, %r15b
- movq 384(%rsi), %r14
- cmoveq %r14, %rax
- movq 392(%rsi), %r14
- cmoveq %r14, %rdx
- movq 400(%rsi), %r14
- cmoveq %r14, %r8
- movq 408(%rsi), %r14
- cmoveq %r14, %r9
- movq 416(%rsi), %r14
- cmoveq %r14, %r10
- movq 424(%rsi), %r14
- cmoveq %r14, %r11
- movq 432(%rsi), %r14
- cmoveq %r14, %r12
- movq 440(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $6, %r15b
- movq 480(%rsi), %r14
- cmoveq %r14, %rax
- movq 488(%rsi), %r14
- cmoveq %r14, %rdx
- movq 496(%rsi), %r14
- cmoveq %r14, %r8
- movq 504(%rsi), %r14
- cmoveq %r14, %r9
- movq 512(%rsi), %r14
- cmoveq %r14, %r10
- movq 520(%rsi), %r14
- cmoveq %r14, %r11
- movq 528(%rsi), %r14
- cmoveq %r14, %r12
- movq 536(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $7, %r15b
- movq 576(%rsi), %r14
- cmoveq %r14, %rax
- movq 584(%rsi), %r14
- cmoveq %r14, %rdx
- movq 592(%rsi), %r14
- cmoveq %r14, %r8
- movq 600(%rsi), %r14
- cmoveq %r14, %r9
- movq 608(%rsi), %r14
- cmoveq %r14, %r10
- movq 616(%rsi), %r14
- cmoveq %r14, %r11
- movq 624(%rsi), %r14
- cmoveq %r14, %r12
- movq 632(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $8, %r15b
- movq 672(%rsi), %r14
- cmoveq %r14, %rax
- movq 680(%rsi), %r14
- cmoveq %r14, %rdx
- movq 688(%rsi), %r14
- cmoveq %r14, %r8
- movq 696(%rsi), %r14
- cmoveq %r14, %r9
- movq 704(%rsi), %r14
- cmoveq %r14, %r10
- movq 712(%rsi), %r14
- cmoveq %r14, %r11
- movq 720(%rsi), %r14
- cmoveq %r14, %r12
- movq 728(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $0x00, %cl
- movq %rax, %r14
- cmovlq %r10, %rax
- cmovlq %r14, %r10
- movq %rdx, %r14
- cmovlq %r11, %rdx
- cmovlq %r14, %r11
- movq %r8, %r14
- cmovlq %r12, %r8
- cmovlq %r14, %r12
- movq %r9, %r14
- cmovlq %r13, %r9
- cmovlq %r14, %r13
- movq %rax, (%rdi)
- movq %rdx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- movq %r10, 32(%rdi)
- movq %r11, 40(%rdi)
- movq %r12, 48(%rdi)
- movq %r13, 56(%rdi)
- xorq %rax, %rax
- xorq %rdx, %rdx
- xorq %r8, %r8
- xorq %r9, %r9
- cmpb $0x01, %r15b
- movq 64(%rsi), %r14
- cmoveq %r14, %rax
- movq 72(%rsi), %r14
- cmoveq %r14, %rdx
- movq 80(%rsi), %r14
- cmoveq %r14, %r8
- movq 88(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $2, %r15b
- movq 160(%rsi), %r14
- cmoveq %r14, %rax
- movq 168(%rsi), %r14
- cmoveq %r14, %rdx
- movq 176(%rsi), %r14
- cmoveq %r14, %r8
- movq 184(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $3, %r15b
- movq 256(%rsi), %r14
- cmoveq %r14, %rax
- movq 264(%rsi), %r14
- cmoveq %r14, %rdx
- movq 272(%rsi), %r14
- cmoveq %r14, %r8
- movq 280(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $4, %r15b
- movq 352(%rsi), %r14
- cmoveq %r14, %rax
- movq 360(%rsi), %r14
- cmoveq %r14, %rdx
- movq 368(%rsi), %r14
- cmoveq %r14, %r8
- movq 376(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $5, %r15b
- movq 448(%rsi), %r14
- cmoveq %r14, %rax
- movq 456(%rsi), %r14
- cmoveq %r14, %rdx
- movq 464(%rsi), %r14
- cmoveq %r14, %r8
- movq 472(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $6, %r15b
- movq 544(%rsi), %r14
- cmoveq %r14, %rax
- movq 552(%rsi), %r14
- cmoveq %r14, %rdx
- movq 560(%rsi), %r14
- cmoveq %r14, %r8
- movq 568(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $7, %r15b
- movq 640(%rsi), %r14
- cmoveq %r14, %rax
- movq 648(%rsi), %r14
- cmoveq %r14, %rdx
- movq 656(%rsi), %r14
- cmoveq %r14, %r8
- movq 664(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $8, %r15b
- movq 736(%rsi), %r14
- cmoveq %r14, %rax
- movq 744(%rsi), %r14
- cmoveq %r14, %rdx
- movq 752(%rsi), %r14
- cmoveq %r14, %r8
- movq 760(%rsi), %r14
- cmoveq %r14, %r9
- movq $-19, %r10
- movq $-1, %r11
- movq $-1, %r12
- movq $0x7fffffffffffffff, %r13
- subq %rax, %r10
- sbbq %rdx, %r11
- sbbq %r8, %r12
- sbbq %r9, %r13
- cmpb $0x00, %cl
- cmovlq %r10, %rax
- cmovlq %r11, %rdx
- cmovlq %r12, %r8
- cmovlq %r13, %r9
- movq %rax, 64(%rdi)
- movq %rdx, 72(%rdi)
- movq %r8, 80(%rdi)
- movq %r9, 88(%rdi)
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_cmov_table,.-fe_cmov_table
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_mul
-.type fe_mul,@function
-.align 4
-fe_mul:
-#else
-.section __TEXT,__text
-.globl _fe_mul
-.p2align 2
-_fe_mul:
-#endif /* __APPLE__ */
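- # Indirect tail-call through the pointer installed by fe_init
- # (fe_mul_x64 by default, fe_mul_avx2 when selected).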
-#ifndef __APPLE__
- jmpq *fe_mul_p(%rip)
-#else
- jmpq *_fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_mul,.-fe_mul
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq
-.type fe_sq,@function
-.align 4
-fe_sq:
-#else
-.section __TEXT,__text
-.globl _fe_sq
-.p2align 2
-_fe_sq:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_sq_p(%rip)
-#else
- jmpq *_fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_sq,.-fe_sq
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_mul121666
-.type fe_mul121666,@function
-.align 4
-fe_mul121666:
-#else
-.section __TEXT,__text
-.globl _fe_mul121666
-.p2align 2
-_fe_mul121666:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_mul121666_p(%rip)
-#else
- jmpq *_fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_mul121666,.-fe_mul121666
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq2
-.type fe_sq2,@function
-.align 4
-fe_sq2:
-#else
-.section __TEXT,__text
-.globl _fe_sq2
-.p2align 2
-_fe_sq2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_sq2_p(%rip)
-#else
- jmpq *_fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_sq2,.-fe_sq2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_invert
-.type fe_invert,@function
-.align 4
-fe_invert:
-#else
-.section __TEXT,__text
-.globl _fe_invert
-.p2align 2
-_fe_invert:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_invert_p(%rip)
-#else
- jmpq *_fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_invert,.-fe_invert
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl curve25519
-.type curve25519,@function
-.align 4
-curve25519:
-#else
-.section __TEXT,__text
-.globl _curve25519
-.p2align 2
-_curve25519:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *curve25519_p(%rip)
-#else
- jmpq *_curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size curve25519,.-curve25519
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_pow22523
-.type fe_pow22523,@function
-.align 4
-fe_pow22523:
-#else
-.section __TEXT,__text
-.globl _fe_pow22523
-.p2align 2
-_fe_pow22523:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_pow22523_p(%rip)
-#else
- jmpq *_fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_pow22523,.-fe_pow22523
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p2
-.type fe_ge_to_p2,@function
-.align 4
-fe_ge_to_p2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p2
-.p2align 2
-_fe_ge_to_p2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_to_p2_p(%rip)
-#else
- jmpq *_fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_to_p2,.-fe_ge_to_p2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p3
-.type fe_ge_to_p3,@function
-.align 4
-fe_ge_to_p3:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p3
-.p2align 2
-_fe_ge_to_p3:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_to_p3_p(%rip)
-#else
- jmpq *_fe_ge_to_p3_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_to_p3,.-fe_ge_to_p3
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_dbl
-.type fe_ge_dbl,@function
-.align 4
-fe_ge_dbl:
-#else
-.section __TEXT,__text
-.globl _fe_ge_dbl
-.p2align 2
-_fe_ge_dbl:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_dbl_p(%rip)
-#else
- jmpq *_fe_ge_dbl_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_dbl,.-fe_ge_dbl
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_madd
-.type fe_ge_madd,@function
-.align 4
-fe_ge_madd:
-#else
-.section __TEXT,__text
-.globl _fe_ge_madd
-.p2align 2
-_fe_ge_madd:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_madd_p(%rip)
-#else
- jmpq *_fe_ge_madd_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_madd,.-fe_ge_madd
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_msub
-.type fe_ge_msub,@function
-.align 4
-fe_ge_msub:
-#else
-.section __TEXT,__text
-.globl _fe_ge_msub
-.p2align 2
-_fe_ge_msub:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_msub_p(%rip)
-#else
- jmpq *_fe_ge_msub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_msub,.-fe_ge_msub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_add
-.type fe_ge_add,@function
-.align 4
-fe_ge_add:
-#else
-.section __TEXT,__text
-.globl _fe_ge_add
-.p2align 2
-_fe_ge_add:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_add_p(%rip)
-#else
- jmpq *_fe_ge_add_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_add,.-fe_ge_add
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_sub
-.type fe_ge_sub,@function
-.align 4
-fe_ge_sub:
-#else
-.section __TEXT,__text
-.globl _fe_ge_sub
-.p2align 2
-_fe_ge_sub:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_sub_p(%rip)
-#else
- jmpq *_fe_ge_sub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_sub,.-fe_ge_sub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type cpuFlagsSet, @object
-.size cpuFlagsSet,4
-cpuFlagsSet:
- .long 0
-#else
-.section __DATA,__data
-.p2align 2
-_cpuFlagsSet:
- .long 0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type intelFlags, @object
-.size intelFlags,4
-intelFlags:
- .long 0
-#else
-.section __DATA,__data
-.p2align 2
-_intelFlags:
- .long 0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_mul_p, @object
-.size fe_mul_p,8
-fe_mul_p:
- .quad fe_mul_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_mul_p:
- .quad _fe_mul_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_sq_p, @object
-.size fe_sq_p,8
-fe_sq_p:
- .quad fe_sq_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_sq_p:
- .quad _fe_sq_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_mul121666_p, @object
-.size fe_mul121666_p,8
-fe_mul121666_p:
- .quad fe_mul121666_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_mul121666_p:
- .quad _fe_mul121666_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_sq2_p, @object
-.size fe_sq2_p,8
-fe_sq2_p:
- .quad fe_sq2_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_sq2_p:
- .quad _fe_sq2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_invert_p, @object
-.size fe_invert_p,8
-fe_invert_p:
- .quad fe_invert_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_invert_p:
- .quad _fe_invert_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type curve25519_p, @object
-.size curve25519_p,8
-curve25519_p:
- .quad curve25519_x64
-#else
-.section __DATA,__data
-.p2align 2
-_curve25519_p:
- .quad _curve25519_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_pow22523_p, @object
-.size fe_pow22523_p,8
-fe_pow22523_p:
- .quad fe_pow22523_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_pow22523_p:
- .quad _fe_pow22523_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_to_p2_p, @object
-.size fe_ge_to_p2_p,8
-fe_ge_to_p2_p:
- .quad fe_ge_to_p2_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_to_p2_p:
- .quad _fe_ge_to_p2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_to_p3_p, @object
-.size fe_ge_to_p3_p,8
-fe_ge_to_p3_p:
- .quad fe_ge_to_p3_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_to_p3_p:
- .quad _fe_ge_to_p3_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_dbl_p, @object
-.size fe_ge_dbl_p,8
-fe_ge_dbl_p:
- .quad fe_ge_dbl_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_dbl_p:
- .quad _fe_ge_dbl_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_madd_p, @object
-.size fe_ge_madd_p,8
-fe_ge_madd_p:
- .quad fe_ge_madd_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_madd_p:
- .quad _fe_ge_madd_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_msub_p, @object
-.size fe_ge_msub_p,8
-fe_ge_msub_p:
- .quad fe_ge_msub_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_msub_p:
- .quad _fe_ge_msub_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_add_p, @object
-.size fe_ge_add_p,8
-fe_ge_add_p:
- .quad fe_ge_add_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_add_p:
- .quad _fe_ge_add_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type fe_ge_sub_p, @object
-.size fe_ge_sub_p,8
-fe_ge_sub_p:
- .quad fe_ge_sub_x64
-#else
-.section __DATA,__data
-.p2align 2
-_fe_ge_sub_p:
- .quad _fe_ge_sub_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_mul_x64
-.type fe_mul_x64,@function
-.align 4
-fe_mul_x64:
-#else
-.section __TEXT,__text
-.globl _fe_mul_x64
-.p2align 2
-_fe_mul_x64:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbx
- movq %rdx, %rcx
- # Multiply
- # A[0] * B[0]
- movq (%rcx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rcx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rcx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rcx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rcx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rcx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rcx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rcx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rcx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rcx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rcx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rcx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rcx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rcx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rcx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rcx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
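- # p = 2^255 - 19, so 2^255 mod p = 19: fold the bits above bit 254 of
- # the 512-bit product back into the low half by multiplying them by 19.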
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- popq %rbx
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_mul_x64,.-fe_mul_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq_x64
-.type fe_sq_x64,@function
-.align 4
-fe_sq_x64:
-#else
-.section __TEXT,__text
-.globl _fe_sq_x64
-.p2align 2
-_fe_sq_x64:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- # Square
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r13, %r13
- addq %rax, %r12
- adcq %rdx, %r13
- # Double
- xorq %r14, %r14
- addq %r8, %r8
- adcq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq $0x00, %r14
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %r15
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %r15, %r8
- adcq %rax, %r9
- adcq $0x00, %rdx
- movq %rdx, %r15
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %r15, %r10
- adcq %rax, %r11
- adcq $0x00, %rdx
- movq %rdx, %r15
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r13
- adcq %rdx, %r14
- addq %r15, %r12
- adcq $0x00, %r13
- adcq $0x00, %r14
- # Reduce
- movq $0x7fffffffffffffff, %r15
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- shldq $0x01, %r10, %r11
- andq %r15, %r10
- # Multiply top half by 19
- movq $19, %rax
- mulq %r11
- xorq %r11, %r11
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r11
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- # Add remaining product results in
- addq %r11, %r8
- adcq %r12, %r9
- adcq %r13, %r10
- adcq %rax, %r10
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r10, %rdx
- imulq $19, %rdx, %rax
- andq %r15, %r10
- addq %rax, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- # Reduce if top bit set
- movq %r10, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %r15, %r10
- addq %rax, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- # Store
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_sq_x64,.-fe_sq_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq_n_x64
-.type fe_sq_n_x64,@function
-.align 4
-fe_sq_n_x64:
-#else
-.section __TEXT,__text
-.globl _fe_sq_n_x64
-.p2align 2
-_fe_sq_n_x64:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbx
- movq %rdx, %rcx
-L_fe_sq_n_x64:
- # Square
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %r8
- movq %rdx, %rbx
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %rbx, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rbx
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %rbx, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rbx
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rbx, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- decb %cl
- jnz L_fe_sq_n_x64
- popq %rbx
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_sq_n_x64,.-fe_sq_n_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_mul121666_x64
-.type fe_mul121666_x64,@function
-.align 4
-fe_mul121666_x64:
-#else
-.section __TEXT,__text
-.globl _fe_mul121666_x64
-.p2align 2
-_fe_mul121666_x64:
-#endif /* __APPLE__ */
- pushq %r12
- # Multiply by 121666
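- # 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of Curve25519.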
- movq $0x1db42, %rax
- mulq (%rsi)
- xorq %r10, %r10
- movq %rax, %r8
- movq %rdx, %r9
- movq $0x1db42, %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- movq $0x1db42, %rax
- mulq 16(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- movq $0x1db42, %rax
- mulq 24(%rsi)
- movq $0x7fffffffffffffff, %rcx
- addq %rax, %r11
- adcq %rdx, %r12
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- movq $19, %rax
- mulq %r12
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_mul121666_x64,.-fe_mul121666_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq2_x64
-.type fe_sq2_x64,@function
-.align 4
-fe_sq2_x64:
-#else
-.section __TEXT,__text
-.globl _fe_sq2_x64
-.p2align 2
-_fe_sq2_x64:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbx
- # Square * 2
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r13, %r13
- addq %rax, %r12
- adcq %rdx, %r13
- # Double
- xorq %r14, %r14
- addq %r8, %r8
- adcq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq $0x00, %r14
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %r15
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %r15, %r8
- adcq %rax, %r9
- adcq $0x00, %rdx
- movq %rdx, %r15
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %r15, %r10
- adcq %rax, %r11
- adcq $0x00, %rdx
- movq %rdx, %r15
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r13
- adcq %rdx, %r14
- addq %r15, %r12
- adcq $0x00, %r13
- adcq $0x00, %r14
- # Reduce
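- # The doubling in 2*a^2 is folded into the reduction: the low half is
- # shifted left by 1, the high half by 2, and the topmost bits
- # (weight 2^510 mod p = 19*19) are handled separately below.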
- movq $0x7fffffffffffffff, %rbx
- xorq %rax, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $3, %r14, %rax
- shldq $2, %r13, %r14
- shldq $2, %r12, %r13
- shldq $2, %r11, %r12
- shldq $2, %r10, %r11
- shldq $0x01, %r9, %r10
- shldq $0x01, %r8, %r9
- shldq $0x01, %rcx, %r8
- shlq $0x01, %rcx
- andq %rbx, %r10
- # Two out left, one in right
- andq %rbx, %r14
- # Multiply top bits by 19*19
- imulq $0x169, %rax, %r15
- # Multiply top half by 19
- movq $19, %rax
- mulq %r11
- xorq %r11, %r11
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r11
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- # Add remaining product results in
- addq %r15, %rcx
- adcq %r11, %r8
- adcq %r12, %r9
- adcq %r13, %r10
- adcq %rax, %r10
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r10, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r10
- addq %rax, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- # Reduce if top bit set
- movq %r10, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r10
- addq %rax, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- # Store
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- popq %rbx
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_sq2_x64,.-fe_sq2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_invert_x64
-.type fe_invert_x64,@function
-.align 4
-fe_invert_x64:
-#else
-.section __TEXT,__text
-.globl _fe_invert_x64
-.p2align 2
-_fe_invert_x64:
-#endif /* __APPLE__ */
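- # Fermat inversion: computes a^(p-2) mod p via the standard
- # square-and-multiply addition chain built from fe_sq_x64,
- # fe_sq_n_x64 and fe_mul_x64.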
- subq $0x90, %rsp
- # Invert
- movq %rdi, 128(%rsp)
- movq %rsi, 136(%rsp)
- movq %rsp, %rdi
- movq 136(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq 136(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $19, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $0x63, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- movq 128(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq 136(%rsp), %rsi
- movq 128(%rsp), %rdi
- addq $0x90, %rsp
- repz retq
-#ifndef __APPLE__
-.size fe_invert_x64,.-fe_invert_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl curve25519_x64
-.type curve25519_x64,@function
-.align 4
-curve25519_x64:
-#else
-.section __TEXT,__text
-.globl _curve25519_x64
-.p2align 2
-_curve25519_x64:
-#endif /* __APPLE__ */
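- # Montgomery ladder: scalar bits 254..0 are processed with constant-time
- # conditional swaps of the two working points driven by each bit.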
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbx
- pushq %rbp
- movq %rdx, %r8
- subq $0xb8, %rsp
- xorq %rbx, %rbx
- movq %rdi, 176(%rsp)
- # Set one
- movq $0x01, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- # Set zero
- movq $0x00, (%rsp)
- movq $0x00, 8(%rsp)
- movq $0x00, 16(%rsp)
- movq $0x00, 24(%rsp)
- # Set one
- movq $0x01, 32(%rsp)
- movq $0x00, 40(%rsp)
- movq $0x00, 48(%rsp)
- movq $0x00, 56(%rsp)
- # Copy
- movq (%r8), %rcx
- movq 8(%r8), %r9
- movq 16(%r8), %r10
- movq 24(%r8), %r11
- movq %rcx, 64(%rsp)
- movq %r9, 72(%rsp)
- movq %r10, 80(%rsp)
- movq %r11, 88(%rsp)
- movb $62, 168(%rsp)
- movq $3, 160(%rsp)
-L_curve25519_x64_words:
-L_curve25519_x64_bits:
- movq 160(%rsp), %r9
- movb 168(%rsp), %cl
- movq (%rsi,%r9,8), %rbp
- shrq %cl, %rbp
- andq $0x01, %rbp
- xorq %rbp, %rbx
- negq %rbx
- # Conditional Swap
- movq (%rdi), %rcx
- movq 8(%rdi), %r9
- movq 16(%rdi), %r10
- movq 24(%rdi), %r11
- xorq 64(%rsp), %rcx
- xorq 72(%rsp), %r9
- xorq 80(%rsp), %r10
- xorq 88(%rsp), %r11
- andq %rbx, %rcx
- andq %rbx, %r9
- andq %rbx, %r10
- andq %rbx, %r11
- xorq %rcx, (%rdi)
- xorq %r9, 8(%rdi)
- xorq %r10, 16(%rdi)
- xorq %r11, 24(%rdi)
- xorq %rcx, 64(%rsp)
- xorq %r9, 72(%rsp)
- xorq %r10, 80(%rsp)
- xorq %r11, 88(%rsp)
- # Conditional Swap
- movq (%rsp), %rcx
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- xorq 32(%rsp), %rcx
- xorq 40(%rsp), %r9
- xorq 48(%rsp), %r10
- xorq 56(%rsp), %r11
- andq %rbx, %rcx
- andq %rbx, %r9
- andq %rbx, %r10
- andq %rbx, %r11
- xorq %rcx, (%rsp)
- xorq %r9, 8(%rsp)
- xorq %r10, 16(%rsp)
- xorq %r11, 24(%rsp)
- xorq %rcx, 32(%rsp)
- xorq %r9, 40(%rsp)
- xorq %r10, 48(%rsp)
- xorq %r11, 56(%rsp)
- movq %rbp, %rbx
- # Add
- movq (%rdi), %rcx
- movq 8(%rdi), %r9
- movq 16(%rdi), %r10
- movq 24(%rdi), %rbp
- movq %rcx, %r12
- addq (%rsp), %rcx
- movq %r9, %r13
- adcq 8(%rsp), %r9
- movq %r10, %r14
- adcq 16(%rsp), %r10
- movq %rbp, %r15
- adcq 24(%rsp), %rbp
- movq $-19, %rax
- movq %rbp, %r11
- movq $0x7fffffffffffffff, %rdx
- sarq $63, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Sub modulus (if overflow)
- subq %rax, %rcx
- sbbq %rbp, %r9
- sbbq %rbp, %r10
- sbbq %rdx, %r11
- # Sub
- subq (%rsp), %r12
- movq $0x00, %rbp
- sbbq 8(%rsp), %r13
- movq $-19, %rax
- sbbq 16(%rsp), %r14
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rsp), %r15
- sbbq $0x00, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Add modulus (if underflow)
- addq %rax, %r12
- adcq %rbp, %r13
- adcq %rbp, %r14
- adcq %rdx, %r15
- movq %rcx, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, 128(%rsp)
- movq %r13, 136(%rsp)
- movq %r14, 144(%rsp)
- movq %r15, 152(%rsp)
- # Add
- movq 64(%rsp), %rcx
- movq 72(%rsp), %r9
- movq 80(%rsp), %r10
- movq 88(%rsp), %rbp
- movq %rcx, %r12
- addq 32(%rsp), %rcx
- movq %r9, %r13
- adcq 40(%rsp), %r9
- movq %r10, %r14
- adcq 48(%rsp), %r10
- movq %rbp, %r15
- adcq 56(%rsp), %rbp
- movq $-19, %rax
- movq %rbp, %r11
- movq $0x7fffffffffffffff, %rdx
- sarq $63, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Sub modulus (if overflow)
- subq %rax, %rcx
- sbbq %rbp, %r9
- sbbq %rbp, %r10
- sbbq %rdx, %r11
- # Sub
- subq 32(%rsp), %r12
- movq $0x00, %rbp
- sbbq 40(%rsp), %r13
- movq $-19, %rax
- sbbq 48(%rsp), %r14
- movq $0x7fffffffffffffff, %rdx
- sbbq 56(%rsp), %r15
- sbbq $0x00, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Add modulus (if underflow)
- addq %rax, %r12
- adcq %rbp, %r13
- adcq %rbp, %r14
- adcq %rdx, %r15
- movq %rcx, (%rsp)
- movq %r9, 8(%rsp)
- movq %r10, 16(%rsp)
- movq %r11, 24(%rsp)
- movq %r12, 96(%rsp)
- movq %r13, 104(%rsp)
- movq %r14, 112(%rsp)
- movq %r15, 120(%rsp)
- # Multiply
- # A[0] * B[0]
- movq (%rdi), %rax
- mulq 96(%rsp)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rdi), %rax
- mulq 96(%rsp)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rdi), %rax
- mulq 104(%rsp)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rdi), %rax
- mulq 96(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rdi), %rax
- mulq 104(%rsp)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rdi), %rax
- mulq 112(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rdi), %rax
- mulq 96(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rdi), %rax
- mulq 104(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rdi), %rax
- mulq 112(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rdi), %rax
- mulq 120(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rdi), %rax
- mulq 104(%rsp)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rdi), %rax
- mulq 112(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rdi), %rax
- mulq 120(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rdi), %rax
- mulq 112(%rsp)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rdi), %rax
- mulq 120(%rsp)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rdi), %rax
- mulq 120(%rsp)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, 32(%rsp)
- movq %r9, 40(%rsp)
- movq %r10, 48(%rsp)
- movq %r11, 56(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 128(%rsp), %rax
- mulq (%rsp)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 136(%rsp), %rax
- mulq (%rsp)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq 128(%rsp), %rax
- mulq 8(%rsp)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 144(%rsp), %rax
- mulq (%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 136(%rsp), %rax
- mulq 8(%rsp)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq 128(%rsp), %rax
- mulq 16(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 152(%rsp), %rax
- mulq (%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 144(%rsp), %rax
- mulq 8(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 136(%rsp), %rax
- mulq 16(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq 128(%rsp), %rax
- mulq 24(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 152(%rsp), %rax
- mulq 8(%rsp)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 144(%rsp), %rax
- mulq 16(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 136(%rsp), %rax
- mulq 24(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 152(%rsp), %rax
- mulq 16(%rsp)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 144(%rsp), %rax
- mulq 24(%rsp)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 152(%rsp), %rax
- mulq 24(%rsp)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, (%rsp)
- movq %r9, 8(%rsp)
- movq %r10, 16(%rsp)
- movq %r11, 24(%rsp)
- # Square
- # A[0] * A[1]
- movq 128(%rsp), %rax
- mulq 136(%rsp)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq 128(%rsp), %rax
- mulq 144(%rsp)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq 128(%rsp), %rax
- mulq 152(%rsp)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 136(%rsp), %rax
- mulq 144(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 136(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 144(%rsp), %rax
- mulq 152(%rsp)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq 128(%rsp), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %rbp
- # A[1] * A[1]
- movq 136(%rsp), %rax
- mulq %rax
- addq %rbp, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[2] * A[2]
- movq 144(%rsp), %rax
- mulq %rax
- addq %rbp, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[3] * A[3]
- movq 152(%rsp), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rbp, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, 96(%rsp)
- movq %r9, 104(%rsp)
- movq %r10, 112(%rsp)
- movq %r11, 120(%rsp)
- # Square
- # A[0] * A[1]
- movq (%rdi), %rax
- mulq 8(%rdi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rdi), %rax
- mulq 16(%rdi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rdi), %rax
- mulq 24(%rdi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rdi), %rax
- mulq 16(%rdi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rdi), %rax
- mulq 24(%rdi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rdi), %rax
- mulq 24(%rdi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rdi), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %rbp
- # A[1] * A[1]
- movq 8(%rdi), %rax
- mulq %rax
- addq %rbp, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[2] * A[2]
- movq 16(%rdi), %rax
- mulq %rax
- addq %rbp, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[3] * A[3]
- movq 24(%rdi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rbp, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, 128(%rsp)
- movq %r9, 136(%rsp)
- movq %r10, 144(%rsp)
- movq %r11, 152(%rsp)
- # Add
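- # Compute both the sum and the difference of the two field elements.
- # Each result gets one conditional correction by p = 2^255-19: the
- # sign/borrow mask selects -19 and 2^63-1 so only an overflowing sum
- # (or underflowing difference) is adjusted.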
- movq 32(%rsp), %rcx
- movq 40(%rsp), %r9
- movq 48(%rsp), %r10
- movq 56(%rsp), %rbp
- movq %rcx, %r12
- addq (%rsp), %rcx
- movq %r9, %r13
- adcq 8(%rsp), %r9
- movq %r10, %r14
- adcq 16(%rsp), %r10
- movq %rbp, %r15
- adcq 24(%rsp), %rbp
- movq $-19, %rax
- movq %rbp, %r11
- movq $0x7fffffffffffffff, %rdx
- sarq $63, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Sub modulus (if overflow)
- subq %rax, %rcx
- sbbq %rbp, %r9
- sbbq %rbp, %r10
- sbbq %rdx, %r11
- # Sub
- subq (%rsp), %r12
- movq $0x00, %rbp
- sbbq 8(%rsp), %r13
- movq $-19, %rax
- sbbq 16(%rsp), %r14
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rsp), %r15
- sbbq $0x00, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Add modulus (if underflow)
- addq %rax, %r12
- adcq %rbp, %r13
- adcq %rbp, %r14
- adcq %rdx, %r15
- movq %rcx, 64(%rsp)
- movq %r9, 72(%rsp)
- movq %r10, 80(%rsp)
- movq %r11, 88(%rsp)
- movq %r12, (%rsp)
- movq %r13, 8(%rsp)
- movq %r14, 16(%rsp)
- movq %r15, 24(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 96(%rsp), %rax
- mulq 128(%rsp)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 104(%rsp), %rax
- mulq 128(%rsp)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq 96(%rsp), %rax
- mulq 136(%rsp)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 112(%rsp), %rax
- mulq 128(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 104(%rsp), %rax
- mulq 136(%rsp)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq 96(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 120(%rsp), %rax
- mulq 128(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 112(%rsp), %rax
- mulq 136(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 104(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq 96(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 120(%rsp), %rax
- mulq 136(%rsp)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 112(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 104(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 120(%rsp), %rax
- mulq 144(%rsp)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 112(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 120(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- # Sub
- movq 128(%rsp), %rcx
- movq 136(%rsp), %r9
- movq 144(%rsp), %r10
- movq 152(%rsp), %r11
- subq 96(%rsp), %rcx
- movq $0x00, %rbp
- sbbq 104(%rsp), %r9
- movq $-19, %rax
- sbbq 112(%rsp), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 120(%rsp), %r11
- sbbq $0x00, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Add modulus (if underflow)
- addq %rax, %rcx
- adcq %rbp, %r9
- adcq %rbp, %r10
- adcq %rdx, %r11
- movq %rcx, 128(%rsp)
- movq %r9, 136(%rsp)
- movq %r10, 144(%rsp)
- movq %r11, 152(%rsp)
- # Square
- # A[0] * A[1]
- movq (%rsp), %rax
- mulq 8(%rsp)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsp), %rax
- mulq 16(%rsp)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsp), %rax
- mulq 24(%rsp)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsp), %rax
- mulq 16(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsp), %rax
- mulq 24(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsp), %rax
- mulq 24(%rsp)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsp), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %rbp
- # A[1] * A[1]
- movq 8(%rsp), %rax
- mulq %rax
- addq %rbp, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[2] * A[2]
- movq 16(%rsp), %rax
- mulq %rax
- addq %rbp, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[3] * A[3]
- movq 24(%rsp), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rbp, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, (%rsp)
- movq %r9, 8(%rsp)
- movq %r10, 16(%rsp)
- movq %r11, 24(%rsp)
- # Multiply by 121666
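- # 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of the X25519
- # Montgomery ladder; the product is reduced modulo 2^255-19 below.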
- movq $0x1db42, %rax
- mulq 128(%rsp)
- xorq %r10, %r10
- movq %rax, %rcx
- movq %rdx, %r9
- movq $0x1db42, %rax
- mulq 136(%rsp)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- movq $0x1db42, %rax
- mulq 144(%rsp)
- xorq %r13, %r13
- addq %rax, %r10
- adcq %rdx, %r11
- movq $0x1db42, %rax
- mulq 152(%rsp)
- movq $0x7fffffffffffffff, %r12
- addq %rax, %r11
- adcq %rdx, %r13
- shldq $0x01, %r11, %r13
- andq %r12, %r11
- movq $19, %rax
- mulq %r13
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- movq %rcx, 32(%rsp)
- movq %r9, 40(%rsp)
- movq %r10, 48(%rsp)
- movq %r11, 56(%rsp)
- # Square
- # A[0] * A[1]
- movq 64(%rsp), %rax
- mulq 72(%rsp)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq 64(%rsp), %rax
- mulq 80(%rsp)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq 64(%rsp), %rax
- mulq 88(%rsp)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 72(%rsp), %rax
- mulq 80(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 72(%rsp), %rax
- mulq 88(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 80(%rsp), %rax
- mulq 88(%rsp)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq 64(%rsp), %rax
- mulq %rax
- movq %rax, %rcx
- movq %rdx, %rbp
- # A[1] * A[1]
- movq 72(%rsp), %rax
- mulq %rax
- addq %rbp, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[2] * A[2]
- movq 80(%rsp), %rax
- mulq %rax
- addq %rbp, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rbp
- # A[3] * A[3]
- movq 88(%rsp), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rbp, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, 64(%rsp)
- movq %r9, 72(%rsp)
- movq %r10, 80(%rsp)
- movq %r11, 88(%rsp)
- # Add
- movq 96(%rsp), %rcx
- movq 104(%rsp), %r9
- addq 32(%rsp), %rcx
- movq 112(%rsp), %r10
- adcq 40(%rsp), %r9
- movq 120(%rsp), %rbp
- adcq 48(%rsp), %r10
- movq $-19, %rax
- adcq 56(%rsp), %rbp
- movq $0x7fffffffffffffff, %rdx
- movq %rbp, %r11
- sarq $63, %rbp
- # Mask the modulus
- andq %rbp, %rax
- andq %rbp, %rdx
- # Sub modulus (if overflow)
- subq %rax, %rcx
- sbbq %rbp, %r9
- sbbq %rbp, %r10
- sbbq %rdx, %r11
- movq %rcx, 96(%rsp)
- movq %r9, 104(%rsp)
- movq %r10, 112(%rsp)
- movq %r11, 120(%rsp)
- # Multiply
- # A[0] * B[0]
- movq (%rsp), %rax
- mulq (%r8)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rsp), %rax
- mulq (%r8)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rsp), %rax
- mulq 8(%r8)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rsp), %rax
- mulq (%r8)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rsp), %rax
- mulq 8(%r8)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rsp), %rax
- mulq 16(%r8)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rsp), %rax
- mulq (%r8)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rsp), %rax
- mulq 8(%r8)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rsp), %rax
- mulq 16(%r8)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rsp), %rax
- mulq 24(%r8)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rsp), %rax
- mulq 8(%r8)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rsp), %rax
- mulq 16(%r8)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rsp), %rax
- mulq 24(%r8)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rsp), %rax
- mulq 16(%r8)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rsp), %rax
- mulq 24(%r8)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rsp), %rax
- mulq 24(%r8)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, 32(%rsp)
- movq %r9, 40(%rsp)
- movq %r10, 48(%rsp)
- movq %r11, 56(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 96(%rsp), %rax
- mulq 128(%rsp)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 104(%rsp), %rax
- mulq 128(%rsp)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq 96(%rsp), %rax
- mulq 136(%rsp)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 112(%rsp), %rax
- mulq 128(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 104(%rsp), %rax
- mulq 136(%rsp)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq 96(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 120(%rsp), %rax
- mulq 128(%rsp)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 112(%rsp), %rax
- mulq 136(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 104(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq 96(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 120(%rsp), %rax
- mulq 136(%rsp)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 112(%rsp), %rax
- mulq 144(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 104(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 120(%rsp), %rax
- mulq 144(%rsp)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 112(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 120(%rsp), %rax
- mulq 152(%rsp)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, (%rsp)
- movq %r9, 8(%rsp)
- movq %r10, 16(%rsp)
- movq %r11, 24(%rsp)
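- # Ladder loop control: the byte at 168(%rsp) counts down the bit index
- # within the current scalar word and 160(%rsp) counts the remaining
- # words; each drops below zero before the loop falls through.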
- decb 168(%rsp)
- jge L_curve25519_x64_bits
- movq $63, 168(%rsp)
- decb 160(%rsp)
- jge L_curve25519_x64_words
- # Invert
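- # Compute the inverse of the ladder's z coordinate as z^(p-2) = z^(2^255-21)
- # (Fermat's little theorem) using a fixed square-and-multiply chain over
- # fe_sq_x64, fe_sq_n_x64 and fe_mul_x64; the affine result x/z follows.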
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- movq %rsp, %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 128(%rsp), %rsi
- movq $19, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 128(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 128(%rsp), %rsi
- movq $0x63, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 128(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq 176(%rsp), %rdi
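- # 176(%rsp) presumably holds the output pointer saved in the prologue; the
- # multiply below forms x * (1/z) and stores the affine result there.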
- # Multiply
- # A[0] * B[0]
- movq (%rsp), %rax
- mulq (%rdi)
- movq %rax, %rcx
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rsp), %rax
- mulq (%rdi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rsp), %rax
- mulq 8(%rdi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rsp), %rax
- mulq (%rdi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rsp), %rax
- mulq 8(%rdi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rsp), %rax
- mulq 16(%rdi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rsp), %rax
- mulq (%rdi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rsp), %rax
- mulq 8(%rdi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rsp), %rax
- mulq 16(%rdi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rsp), %rax
- mulq 24(%rdi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rsp), %rax
- mulq 8(%rdi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rsp), %rax
- mulq 16(%rdi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rsp), %rax
- mulq 24(%rdi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rsp), %rax
- mulq 16(%rdi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rsp), %rax
- mulq 24(%rdi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rsp), %rax
- mulq 24(%rdi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbp
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rbp, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %rcx
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbp, %r11
- addq %rax, %rcx
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %rcx, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- xorq %rax, %rax
- addq $0xb8, %rsp
- popq %rbp
- popq %rbx
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size curve25519_x64,.-curve25519_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_pow22523_x64
-.type fe_pow22523_x64,@function
-.align 4
-fe_pow22523_x64:
-#else
-.section __TEXT,__text
-.globl _fe_pow22523_x64
-.p2align 2
-_fe_pow22523_x64:
-#endif /* __APPLE__ */
- subq $0x70, %rsp
- # pow22523
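- # Raise the input to (p-5)/8 = 2^252-3 with a fixed chain of squarings and
- # multiplies; this power is used when taking square roots modulo p, as in
- # Ed25519 point decompression.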
- movq %rdi, 96(%rsp)
- movq %rsi, 104(%rsp)
- movq %rsp, %rdi
- movq 104(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq 104(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $19, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $0x63, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_x64@plt
-#else
- callq _fe_sq_n_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_x64@plt
-#else
- callq _fe_sq_x64
-#endif /* __APPLE__ */
- movq 96(%rsp), %rdi
- movq %rsp, %rsi
- movq 104(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_x64@plt
-#else
- callq _fe_mul_x64
-#endif /* __APPLE__ */
- movq 104(%rsp), %rsi
- movq 96(%rsp), %rdi
- addq $0x70, %rsp
- repz retq
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p2_x64
-.type fe_ge_to_p2_x64,@function
-.align 4
-fe_ge_to_p2_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p2_x64
-.p2align 2
-_fe_ge_to_p2_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40, %rsp
- movq %rsi, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rcx, 16(%rsp)
- movq %r8, 24(%rsp)
- movq %r9, 32(%rsp)
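- # Convert a p1p1 group element to projective p2 coordinates with three
- # field multiplications: X = X1*T1, Y = Y1*Z1, Z = Z1*T1.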
- movq 16(%rsp), %rsi
- movq 88(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 32(%rsp), %rsi
- movq 88(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $40, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p3_x64
-.type fe_ge_to_p3_x64,@function
-.align 4
-fe_ge_to_p3_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p3_x64
-.p2align 2
-_fe_ge_to_p3_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40, %rsp
- movq %rsi, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rcx, 16(%rsp)
- movq %r8, 24(%rsp)
- movq %r9, 32(%rsp)
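- # Convert a p1p1 group element to extended p3 coordinates with four
- # field multiplications: X = X1*T1, Y = Y1*Z1, Z = Z1*T1, T = X1*Y1.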
- movq 24(%rsp), %rsi
- movq 96(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 32(%rsp), %rsi
- movq 88(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 88(%rsp), %rsi
- movq 96(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $40, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_dbl_x64
-.type fe_ge_dbl_x64,@function
-.align 4
-fe_ge_dbl_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_dbl_x64
-.p2align 2
-_fe_ge_dbl_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
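- # Point doubling (p2 -> p1p1): the squarings below give X3 = X1^2,
- # Z3 = Y1^2 and T3 = 2*Z1^2; the remaining terms start from X1+Y1.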
- movq (%rsp), %rdi
- movq 32(%rsp), %rsi
- # Square
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %r8
- movq %rdx, %rcx
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %rcx, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %rcx, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rcx, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq 40(%rsp), %rsi
- # Square
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %r8
- movq %rdx, %rcx
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %rcx, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %rcx, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rcx, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 128(%rsp), %rsi
- # Square * 2
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %r8
- movq %rdx, %rcx
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %rcx, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %rcx, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rcx, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- xorq %rax, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $3, %r15, %rax
- shldq $2, %r14, %r15
- shldq $2, %r13, %r14
- shldq $2, %r12, %r13
- shldq $2, %r11, %r12
- shldq $0x01, %r10, %r11
- shldq $0x01, %r9, %r10
- shldq $0x01, %r8, %r9
- shlq $0x01, %r8
- andq %rbx, %r11
- # Two out left, one in right
- andq %rbx, %r15
- # Multiply top bits by 19*19
- imulq $0x169, %rax, %rcx
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %rcx, %r8
- adcq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 32(%rsp), %rsi
- movq 40(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rdi
- movq 8(%rsp), %rsi
- # Square
- # A[0] * A[1]
- movq (%rsi), %rax
- mulq 8(%rsi)
- movq %rax, %r9
- movq %rdx, %r10
- # A[0] * A[2]
- movq (%rsi), %rax
- mulq 16(%rsi)
- xorq %r11, %r11
- addq %rax, %r10
- adcq %rdx, %r11
- # A[0] * A[3]
- movq (%rsi), %rax
- mulq 24(%rsi)
- xorq %r12, %r12
- addq %rax, %r11
- adcq %rdx, %r12
- # A[1] * A[2]
- movq 8(%rsi), %rax
- mulq 16(%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * A[3]
- movq 8(%rsi), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- # A[2] * A[3]
- movq 16(%rsi), %rax
- mulq 24(%rsi)
- xorq %r14, %r14
- addq %rax, %r13
- adcq %rdx, %r14
- # Double
- xorq %r15, %r15
- addq %r9, %r9
- adcq %r10, %r10
- adcq %r11, %r11
- adcq %r12, %r12
- adcq %r13, %r13
- adcq %r14, %r14
- adcq $0x00, %r15
- # A[0] * A[0]
- movq (%rsi), %rax
- mulq %rax
- movq %rax, %r8
- movq %rdx, %rcx
- # A[1] * A[1]
- movq 8(%rsi), %rax
- mulq %rax
- addq %rcx, %r9
- adcq %rax, %r10
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[2] * A[2]
- movq 16(%rsi), %rax
- mulq %rax
- addq %rcx, %r11
- adcq %rax, %r12
- adcq $0x00, %rdx
- movq %rdx, %rcx
- # A[3] * A[3]
- movq 24(%rsi), %rax
- mulq %rax
- addq %rax, %r14
- adcq %rdx, %r15
- addq %rcx, %r13
- adcq $0x00, %r14
- adcq $0x00, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq (%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq (%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_dbl_x64,.-fe_ge_dbl_x64
-#endif /* __APPLE__ */
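
The fe_ge_dbl_x64 routine above repeatedly squares (and in one place squares-and-doubles) field elements, reducing each 512-bit intermediate modulo p = 2^255 - 19, as the "# Reduce", "# Multiply top half by 19", "# Overflow" and "# Reduce if top bit set" comments indicate. The following Python sketch is illustrative only: the helper name fe_reduce is not from wolfSSL, and it shows the folding idea rather than the exact limb handling of the assembly.

    p = 2**255 - 19

    def fe_reduce(t):
        # Fold a product t < 2^512 back below 2^256 without changing t mod p:
        # 2^255 is congruent to 19 modulo p, so the bits above bit 254 can be
        # shifted down, multiplied by 19 and added back into the low 255 bits.
        low, high = t & (2**255 - 1), t >> 255
        t = low + 19 * high
        # A second fold absorbs the small carry produced by the first one.
        low, high = t & (2**255 - 1), t >> 255
        return low + 19 * high

    a = 2**254 + 12345                     # any field element below p
    r = fe_reduce(a * a)
    assert r % p == (a * a) % p and r < 2**256
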
-#ifndef __APPLE__
-.text
-.globl fe_ge_madd_x64
-.type fe_ge_madd_x64,@function
-.align 4
-fe_ge_madd_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_madd_x64
-.p2align 2
-_fe_ge_madd_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq (%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq (%rsp), %rsi
- movq 152(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 8(%rsp), %rsi
- movq 160(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 144(%rsp), %rsi
- movq 136(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rdi
- movq 128(%rsp), %rsi
- movq 128(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_madd_x64,.-fe_ge_madd_x64
-#endif /* __APPLE__ */
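
Read as a whole, the sequence of field operations in fe_ge_madd_x64 (add, subtract, three multiplies, a doubling of Z, then the final subtract/add/add/subtract) matches the usual mixed addition of an extended Edwards point (X:Y:Z:T) with a precomputed point stored as (y+x, y-x, 2dxy). The Python sketch below assumes that correspondence; the name ge_madd and the argument layout are illustrative, not the wolfSSL interface.

    p = 2**255 - 19

    def ge_madd(X1, Y1, Z1, T1, ypx, ymx, xy2d):
        # Same order of operations as the assembly above.
        a  = (Y1 + X1) % p          # Add      -> rx = Y1 + X1
        b  = (Y1 - X1) % p          # Sub      -> ry = Y1 - X1
        a  = (a * ypx) % p          # Multiply -> rz = rx * (y+x)
        b  = (b * ymx) % p          # Multiply -> ry = ry * (y-x)
        c  = (xy2d * T1) % p        # Multiply -> rt = 2dxy * T1
        t0 = (Z1 + Z1) % p          # Add      -> t0 = 2*Z1
        return ((a - b) % p,        # X3
                (a + b) % p,        # Y3
                (t0 + c) % p,       # Z3
                (t0 - c) % p)       # T3
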
-#ifndef __APPLE__
-.text
-.globl fe_ge_msub_x64
-.type fe_ge_msub_x64,@function
-.align 4
-fe_ge_msub_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_msub_x64
-.p2align 2
-_fe_ge_msub_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq (%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq (%rsp), %rsi
- movq 160(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 8(%rsp), %rsi
- movq 152(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 144(%rsp), %rsi
- movq 136(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rdi
- movq 128(%rsp), %rsi
- movq 128(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_msub_x64,.-fe_ge_msub_x64
-#endif /* __APPLE__ */
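
The "# Add", "# Sub" and "# Mask the modulus" blocks in fe_ge_msub_x64 (and its siblings) keep every field addition and subtraction branch-free: the carry or borrow out of the top limb is stretched into an all-ones or all-zero mask, and p = 2^255 - 19 is ANDed with that mask before the correction, so the same instructions execute whether or not a wrap occurred. A rough Python equivalent follows; the names fe_add and fe_sub are illustrative, not the wolfSSL ones, and the masks stand in for the sar/sbb tricks in the assembly.

    p = 2**255 - 19
    M256 = 2**256 - 1

    def fe_add(a, b):
        s = a + b
        mask = -(s >> 255) & M256        # all ones iff bit 255 of the sum is set
        return (s - (p & mask)) & M256   # conditionally subtract p, no branch

    def fe_sub(a, b):
        d = (a - b) & M256
        borrow = 1 if a < b else 0
        mask = -borrow & M256            # all ones iff the subtraction wrapped
        return (d + (p & mask)) & M256   # conditionally add p back, no branch

    x, y = p - 5, p - 7
    assert fe_add(x, y) % p == (x + y) % p
    assert fe_sub(y, x) % p == (y - x) % p
    assert fe_sub(x, y) % p == (x - y) % p
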
-#ifndef __APPLE__
-.text
-.globl fe_ge_add_x64
-.type fe_ge_add_x64,@function
-.align 4
-fe_ge_add_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_add_x64
-.p2align 2
-_fe_ge_add_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq (%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq (%rsp), %rsi
- movq 160(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 8(%rsp), %rsi
- movq 168(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 152(%rsp), %rsi
- movq 136(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 128(%rsp), %rsi
- movq 144(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rdi
- movq (%rsp), %rsi
- movq (%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_add_x64,.-fe_ge_add_x64
-#endif /* __APPLE__ */
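
Each "# Multiply" block in fe_ge_add_x64 is a schoolbook 4x4 limb product: the "# A[i] * B[j]" comments enumerate the sixteen 64x64-bit partial products, accumulated into output column i+j with carries rippling into the higher registers. A small Python model of that accumulation is sketched below; the helper names are illustrative and not part of wolfSSL.

    MASK64 = 2**64 - 1

    def limbs(x, n=4):
        # Split x into n little-endian 64-bit limbs.
        return [(x >> (64 * i)) & MASK64 for i in range(n)]

    def mul_4x4(a, b):
        # Accumulate the sixteen partial products A[i]*B[j] into column i+j,
        # then propagate carries to obtain the 8-limb (512-bit) product.
        cols = [0] * 8
        for i in range(4):
            for j in range(4):
                cols[i + j] += a[i] * b[j]
        out, carry = [], 0
        for c in cols:
            c += carry
            out.append(c & MASK64)
            carry = c >> 64
        return out

    x, y = 2**255 - 20, 2**200 + 987654321
    prod = mul_4x4(limbs(x), limbs(y))
    assert sum(l << (64 * i) for i, l in enumerate(prod)) == x * y
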
-#ifndef __APPLE__
-.text
-.globl fe_ge_sub_x64
-.type fe_ge_sub_x64,@function
-.align 4
-fe_ge_sub_x64:
-#else
-.section __TEXT,__text
-.globl _fe_ge_sub_x64
-.p2align 2
-_fe_ge_sub_x64:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq (%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 40(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq (%rsp), %rsi
- movq 168(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 8(%rsp), %rsi
- movq 160(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- movq 152(%rsp), %rsi
- movq 136(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 128(%rsp), %rsi
- movq 144(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rax
- mulq (%rsi)
- movq %rax, %r8
- movq %rdx, %r9
- # A[0] * B[1]
- movq 8(%rbx), %rax
- mulq (%rsi)
- xorq %r10, %r10
- addq %rax, %r9
- adcq %rdx, %r10
- # A[1] * B[0]
- movq (%rbx), %rax
- mulq 8(%rsi)
- xorq %r11, %r11
- addq %rax, %r9
- adcq %rdx, %r10
- adcq $0x00, %r11
- # A[0] * B[2]
- movq 16(%rbx), %rax
- mulq (%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- # A[1] * B[1]
- movq 8(%rbx), %rax
- mulq 8(%rsi)
- xorq %r12, %r12
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[2] * B[0]
- movq (%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r10
- adcq %rdx, %r11
- adcq $0x00, %r12
- # A[0] * B[3]
- movq 24(%rbx), %rax
- mulq (%rsi)
- xorq %r13, %r13
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[2]
- movq 16(%rbx), %rax
- mulq 8(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[2] * B[1]
- movq 8(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[3] * B[0]
- movq (%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r11
- adcq %rdx, %r12
- adcq $0x00, %r13
- # A[1] * B[3]
- movq 24(%rbx), %rax
- mulq 8(%rsi)
- xorq %r14, %r14
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[2]
- movq 16(%rbx), %rax
- mulq 16(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[3] * B[1]
- movq 8(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r12
- adcq %rdx, %r13
- adcq $0x00, %r14
- # A[2] * B[3]
- movq 24(%rbx), %rax
- mulq 16(%rsi)
- xorq %r15, %r15
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[2]
- movq 16(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r13
- adcq %rdx, %r14
- adcq $0x00, %r15
- # A[3] * B[3]
- movq 24(%rbx), %rax
- mulq 24(%rsi)
- addq %rax, %r14
- adcq %rdx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rax
- mulq %r12
- xorq %r12, %r12
- addq %rax, %r8
- movq $19, %rax
- adcq %rdx, %r12
- mulq %r13
- xorq %r13, %r13
- addq %rax, %r9
- movq $19, %rax
- adcq %rdx, %r13
- mulq %r14
- xorq %r14, %r14
- addq %rax, %r10
- movq $19, %rax
- adcq %rdx, %r14
- mulq %r15
- # Add remaining product results in
- addq %r12, %r9
- adcq %r13, %r10
- adcq %r14, %r11
- adcq %rax, %r11
- adcq $0x00, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rdi
- movq (%rsp), %rsi
- movq (%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
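- # Note: the Add and Sub blocks above stay branch-free. A 0/-1 mask is
- # built from the top bit of the sum (Add) or from the borrow (Sub),
- # and the words of p = 2^255 - 19 (0xffffffffffffffed, -1, -1,
- # 0x7fffffffffffffff) are AND-ed with that mask before being
- # subtracted or added back, so the correction is constant time.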
- movq 8(%rsp), %rdi
- movq 16(%rsp), %rsi
- movq 8(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rbx), %r8
- movq $0x00, %rcx
- sbbq 8(%rbx), %r9
- movq $-19, %rax
- sbbq 16(%rbx), %r10
- movq $0x7fffffffffffffff, %rdx
- sbbq 24(%rbx), %r11
- sbbq $0x00, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Add modulus (if underflow)
- addq %rax, %r8
- adcq %rcx, %r9
- adcq %rcx, %r10
- adcq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rdi
- leaq 48(%rsp), %rsi
- movq 24(%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rcx
- adcq 16(%rbx), %r10
- movq $-19, %rax
- adcq 24(%rbx), %rcx
- movq $0x7fffffffffffffff, %rdx
- movq %rcx, %r11
- sarq $63, %rcx
- # Mask the modulus
- andq %rcx, %rax
- andq %rcx, %rdx
- # Sub modulus (if overflow)
- subq %rax, %r8
- sbbq %rcx, %r9
- sbbq %rcx, %r10
- sbbq %rdx, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_sub_x64,.-fe_ge_sub_x64
-#endif /* __APPLE__ */
-#ifdef HAVE_INTEL_AVX2
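- # Note: despite the _avx2 suffix, the routines below rely on the BMI2
- # (mulxq) and ADX (adcxq/adoxq) extensions. mulxq leaves the flags
- # untouched, and adcxq/adoxq carry through CF and OF as two
- # independent chains, so the partial products can be accumulated with
- # fewer dependency stalls than in the mulq-based code above.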
-#ifndef __APPLE__
-.text
-.globl fe_mul_avx2
-.type fe_mul_avx2,@function
-.align 4
-fe_mul_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_mul_avx2
-.p2align 2
-_fe_mul_avx2:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbx
- movq %rdx, %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rax, %rcx
- xorq %r15, %r15
- adcxq %rax, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rcx, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rax, %rcx
- adoxq %rax, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rax, %r14
- adoxq %rcx, %r10
- adcxq %rax, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rax, %rcx
- adcxq %r14, %r12
- adoxq %rax, %r11
- adcxq %r15, %r13
- adoxq %rcx, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rax, %rcx
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rax, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rax
- adcxq %rcx, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rax, %r11
- mulxq 24(%rsi), %rax, %rcx
- adcxq %rax, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rax
- adcxq %rcx, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rax, %r13
- mulxq 24(%rsi), %rax, %rcx
- adoxq %r15, %r14
- adcxq %rax, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rax
- adcxq %rcx, %r15
- xorq %rcx, %rcx
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rax, %r12
- mulxq 24(%rsi), %rdx, %rax
- adoxq %rdx, %r11
- adoxq %rax, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rax
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rax, %r14
- mulxq 24(%rsi), %rax, %rdx
- adcxq %rcx, %r15
- adoxq %rax, %r13
- adoxq %rdx, %r14
- adoxq %rcx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rax, %r12
- adcxq %rax, %r8
- adoxq %r12, %r9
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- popq %rbx
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_mul_avx2,.-fe_mul_avx2
-#endif /* __APPLE__ */
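- # Note: fe_sq_avx2 below computes each off-diagonal product a[i]*a[j]
- # (i < j) once, doubles the accumulated sum together with the carry
- # flag, and then adds the diagonal squares a[i]^2, so it needs ten
- # mulxq instructions instead of the sixteen used by fe_mul_avx2.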
-#ifndef __APPLE__
-.text
-.globl fe_sq_avx2
-.type fe_sq_avx2,@function
-.align 4
-fe_sq_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_sq_avx2
-.p2align 2
-_fe_sq_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- # Square
- # A[0] * A[1]
- movq (%rsi), %rdx
- mulxq 8(%rsi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rsi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rsi), %rdx
- mulxq 8(%rsi), %rcx, %rbx
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rsi), %r13, %r14
- adoxq %rbx, %r12
- # A[2] * A[0]
- mulxq (%rsi), %rcx, %rbx
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rsi), %rdx
- mulxq 24(%rsi), %rax, %r8
- adcxq %rbx, %r11
- adcxq %rax, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rsi), %rdx
- mulxq %rdx, %r8, %rax
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rsi), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r10, %r10
- adoxq %rax, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rsi), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r12, %r12
- adoxq %rbx, %r11
- adcxq %r13, %r13
- adoxq %rax, %r12
- # A[3] * A[3]
- movq 24(%rsi), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rax, %r14
- adoxq %rbx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rax, %r12
- adcxq %rax, %r8
- adoxq %r12, %r9
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_sq_avx2,.-fe_sq_avx2
-#endif /* __APPLE__ */
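- # Note: fe_sq_n_avx2 below squares the %rsi buffer into %rdi n times
- # (n arrives in %rdx and is kept in %rbp as the loop counter); callers
- # pass the same buffer for input and output, so it acts as n repeated
- # squarings in place, a helper for the long squaring runs in the
- # inversion addition chains.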
-#ifndef __APPLE__
-.text
-.globl fe_sq_n_avx2
-.type fe_sq_n_avx2,@function
-.align 4
-fe_sq_n_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_sq_n_avx2
-.p2align 2
-_fe_sq_n_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rdx, %rbp
-L_fe_sq_n_avx2:
- # Square
- # A[0] * A[1]
- movq (%rsi), %rdx
- mulxq 8(%rsi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rsi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rsi), %rdx
- mulxq 8(%rsi), %rcx, %rbx
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rsi), %r13, %r14
- adoxq %rbx, %r12
- # A[2] * A[0]
- mulxq (%rsi), %rcx, %rbx
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rsi), %rdx
- mulxq 24(%rsi), %rax, %r8
- adcxq %rbx, %r11
- adcxq %rax, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rsi), %rdx
- mulxq %rdx, %r8, %rax
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rsi), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r10, %r10
- adoxq %rax, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rsi), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r12, %r12
- adoxq %rbx, %r11
- adcxq %r13, %r13
- adoxq %rax, %r12
- # A[3] * A[3]
- movq 24(%rsi), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rax, %r14
- adoxq %rbx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rax, %r12
- adcxq %rax, %r8
- adoxq %r12, %r9
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- decb %bpl
- jnz L_fe_sq_n_avx2
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_sq_n_avx2,.-fe_sq_n_avx2
-#endif /* __APPLE__ */
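- # Note: fe_mul121666_avx2 below multiplies a field element by the
- # curve25519 constant (A + 2)/4 = 121666 = 0x1db42 (A = 486662) and
- # reduces mod 2^255 - 19; since the scalar is only 17 bits, a single
- # folding pass is enough.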
-#ifndef __APPLE__
-.text
-.globl fe_mul121666_avx2
-.type fe_mul121666_avx2,@function
-.align 4
-fe_mul121666_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_mul121666_avx2
-.p2align 2
-_fe_mul121666_avx2:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- movq $0x1db42, %rdx
- mulxq (%rsi), %rax, %r13
- mulxq 8(%rsi), %rcx, %r12
- mulxq 16(%rsi), %r8, %r11
- mulxq 24(%rsi), %r9, %r10
- addq %r13, %rcx
- adcq %r12, %r8
- adcq %r11, %r9
- adcq $0x00, %r10
- movq $0x7fffffffffffffff, %r13
- shldq $0x01, %r9, %r10
- andq %r13, %r9
- imulq $19, %r10, %r10
- addq %r10, %rax
- adcq $0x00, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- movq %rax, (%rdi)
- movq %rcx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_mul121666_avx2,.-fe_mul121666_avx2
-#endif /* __APPLE__ */
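- # Note: fe_sq2_avx2 below returns 2*a^2 mod p. The doubling is folded
- # into the reduction: the low half is shifted left by one and the high
- # half by two before the multiply-by-19 fold, and the few bits that
- # spill past 2^510 are folded back with 19*19 = 0x169, since
- # 2^510 = 19^2 (mod p).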
-#ifndef __APPLE__
-.text
-.globl fe_sq2_avx2
-.type fe_sq2_avx2,@function
-.align 4
-fe_sq2_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_sq2_avx2
-.p2align 2
-_fe_sq2_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- # Square * 2
- # A[0] * A[1]
- movq (%rsi), %rdx
- mulxq 8(%rsi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rsi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rsi), %rdx
- mulxq 8(%rsi), %rcx, %rbx
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rsi), %r13, %r14
- adoxq %rbx, %r12
- # A[2] * A[0]
- mulxq (%rsi), %rcx, %rbx
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rsi), %rdx
- mulxq 24(%rsi), %rax, %r8
- adcxq %rbx, %r11
- adcxq %rax, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rsi), %rdx
- mulxq %rdx, %r8, %rax
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rsi), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r10, %r10
- adoxq %rax, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rsi), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r12, %r12
- adoxq %rbx, %r11
- adcxq %r13, %r13
- adoxq %rax, %r12
- # A[3] * A[3]
- movq 24(%rsi), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rax, %r14
- adoxq %rbx, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- xorq %rax, %rax
- # Move top half into t4-t7 and remove top bit from t3 and double
- shldq $3, %r15, %rax
- shldq $2, %r14, %r15
- shldq $2, %r13, %r14
- shldq $2, %r12, %r13
- shldq $2, %r11, %r12
- shldq $0x01, %r10, %r11
- shldq $0x01, %r9, %r10
- shldq $0x01, %r8, %r9
- shlq $0x01, %r8
- andq %rbx, %r11
- # Two out left, one in right
- andq %rbx, %r15
- # Multiply top bits by 19*19
- imulq $0x169, %rax, %rcx
- xorq %rbx, %rbx
- # Multiply top half by 19
- movq $19, %rdx
- adoxq %rcx, %r8
- mulxq %r12, %rax, %r12
- adcxq %rax, %r8
- adoxq %r12, %r9
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rbx, %r11
- addq %rax, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_sq2_avx2,.-fe_sq2_avx2
-#endif /* __APPLE__ */
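- # Note: fe_invert_avx2 below computes a^(p-2) mod p (Fermat's little
- # theorem), so the sequence of fe_sq_avx2 / fe_sq_n_avx2 / fe_mul_avx2
- # calls is a fixed addition chain for the exponent 2^255 - 21; the
- # chain and the stack temporaries it touches do not depend on the
- # input value.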
-#ifndef __APPLE__
-.text
-.globl fe_invert_avx2
-.type fe_invert_avx2,@function
-.align 4
-fe_invert_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_invert_avx2
-.p2align 2
-_fe_invert_avx2:
-#endif /* __APPLE__ */
- subq $0x90, %rsp
- # Invert
- movq %rdi, 128(%rsp)
- movq %rsi, 136(%rsp)
- movq %rsp, %rdi
- movq 136(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq 136(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $19, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $0x63, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- movq 128(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq 136(%rsp), %rsi
- movq 128(%rsp), %rdi
- addq $0x90, %rsp
- repz retq
-#ifndef __APPLE__
-.size fe_invert_avx2,.-fe_invert_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl curve25519_avx2
-.type curve25519_avx2,@function
-.align 4
-curve25519_avx2:
-#else
-.section __TEXT,__text
-.globl _curve25519_avx2
-.p2align 2
-_curve25519_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rdx, %r8
- subq $0xc0, %rsp
- movq $0x00, 184(%rsp)
- movq %rdi, 176(%rsp)
- # Set one
- movq $0x01, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- # Set zero
- movq $0x00, (%rsp)
- movq $0x00, 8(%rsp)
- movq $0x00, 16(%rsp)
- movq $0x00, 24(%rsp)
- # Set one
- movq $0x01, 32(%rsp)
- movq $0x00, 40(%rsp)
- movq $0x00, 48(%rsp)
- movq $0x00, 56(%rsp)
- # Copy
- movq (%r8), %r9
- movq 8(%r8), %r10
- movq 16(%r8), %r11
- movq 24(%r8), %r12
- movq %r9, 64(%rsp)
- movq %r10, 72(%rsp)
- movq %r11, 80(%rsp)
- movq %r12, 88(%rsp)
- movb $62, 168(%rsp)
- movq $3, 160(%rsp)
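- # Note: Montgomery ladder state: x2 = 1 (in the output buffer),
- # z2 = 0 at (%rsp), z3 = 1 at 32(%rsp) and x3 = u at 64(%rsp). The
- # scalar is scanned from bit 254 down to bit 0 (word index starts at
- # 3, bit index at 62 and is reset to 63 for the remaining words), one
- # ladder step per bit.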
-L_curve25519_avx2_words:
-L_curve25519_avx2_bits:
- movq 184(%rsp), %rbx
- movq 160(%rsp), %r9
- movb 168(%rsp), %cl
- movq (%rsi,%r9,8), %rax
- shrq %cl, %rax
- andq $0x01, %rax
- xorq %rax, %rbx
- negq %rbx
- # Conditional Swap
- movq (%rdi), %r9
- movq 8(%rdi), %r10
- movq 16(%rdi), %r11
- movq 24(%rdi), %r12
- xorq 64(%rsp), %r9
- xorq 72(%rsp), %r10
- xorq 80(%rsp), %r11
- xorq 88(%rsp), %r12
- andq %rbx, %r9
- andq %rbx, %r10
- andq %rbx, %r11
- andq %rbx, %r12
- xorq %r9, (%rdi)
- xorq %r10, 8(%rdi)
- xorq %r11, 16(%rdi)
- xorq %r12, 24(%rdi)
- xorq %r9, 64(%rsp)
- xorq %r10, 72(%rsp)
- xorq %r11, 80(%rsp)
- xorq %r12, 88(%rsp)
- # Conditional Swap
- movq (%rsp), %r9
- movq 8(%rsp), %r10
- movq 16(%rsp), %r11
- movq 24(%rsp), %r12
- xorq 32(%rsp), %r9
- xorq 40(%rsp), %r10
- xorq 48(%rsp), %r11
- xorq 56(%rsp), %r12
- andq %rbx, %r9
- andq %rbx, %r10
- andq %rbx, %r11
- andq %rbx, %r12
- xorq %r9, (%rsp)
- xorq %r10, 8(%rsp)
- xorq %r11, 16(%rsp)
- xorq %r12, 24(%rsp)
- xorq %r9, 32(%rsp)
- xorq %r10, 40(%rsp)
- xorq %r11, 48(%rsp)
- xorq %r12, 56(%rsp)
- movq %rax, 184(%rsp)
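- # Note: constant-time conditional swap: %rbx = -(bit ^ previous bit)
- # is either 0 or all ones, and the xor/and/xor sequence swaps (x2, z2)
- # with (x3, z3) exactly when the key bit changed, with no branch and
- # no secret-dependent memory access. The current bit is saved at
- # 184(%rsp) for the next iteration.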
- # Add
- movq (%rdi), %r9
- movq 8(%rdi), %r10
- movq 16(%rdi), %r11
- movq 24(%rdi), %rax
- movq %r9, %r13
- addq (%rsp), %r9
- movq %r10, %r14
- adcq 8(%rsp), %r10
- movq %r11, %r15
- adcq 16(%rsp), %r11
- movq %rax, %rbp
- adcq 24(%rsp), %rax
- movq $-19, %rcx
- movq %rax, %r12
- movq $0x7fffffffffffffff, %rbx
- sarq $63, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Sub modulus (if overflow)
- subq %rcx, %r9
- sbbq %rax, %r10
- sbbq %rax, %r11
- sbbq %rbx, %r12
- # Sub
- subq (%rsp), %r13
- movq $0x00, %rax
- sbbq 8(%rsp), %r14
- movq $-19, %rcx
- sbbq 16(%rsp), %r15
- movq $0x7fffffffffffffff, %rbx
- sbbq 24(%rsp), %rbp
- sbbq $0x00, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Add modulus (if underflow)
- addq %rcx, %r13
- adcq %rax, %r14
- adcq %rax, %r15
- adcq %rbx, %rbp
- movq %r9, (%rdi)
- movq %r10, 8(%rdi)
- movq %r11, 16(%rdi)
- movq %r12, 24(%rdi)
- movq %r13, 128(%rsp)
- movq %r14, 136(%rsp)
- movq %r15, 144(%rsp)
- movq %rbp, 152(%rsp)
- # Add
- movq 64(%rsp), %r9
- movq 72(%rsp), %r10
- movq 80(%rsp), %r11
- movq 88(%rsp), %rax
- movq %r9, %r13
- addq 32(%rsp), %r9
- movq %r10, %r14
- adcq 40(%rsp), %r10
- movq %r11, %r15
- adcq 48(%rsp), %r11
- movq %rax, %rbp
- adcq 56(%rsp), %rax
- movq $-19, %rcx
- movq %rax, %r12
- movq $0x7fffffffffffffff, %rbx
- sarq $63, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Sub modulus (if overflow)
- subq %rcx, %r9
- sbbq %rax, %r10
- sbbq %rax, %r11
- sbbq %rbx, %r12
- # Sub
- subq 32(%rsp), %r13
- movq $0x00, %rax
- sbbq 40(%rsp), %r14
- movq $-19, %rcx
- sbbq 48(%rsp), %r15
- movq $0x7fffffffffffffff, %rbx
- sbbq 56(%rsp), %rbp
- sbbq $0x00, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Add modulus (if underflow)
- addq %rcx, %r13
- adcq %rax, %r14
- adcq %rax, %r15
- adcq %rbx, %rbp
- movq %r9, (%rsp)
- movq %r10, 8(%rsp)
- movq %r11, 16(%rsp)
- movq %r12, 24(%rsp)
- movq %r13, 96(%rsp)
- movq %r14, 104(%rsp)
- movq %r15, 112(%rsp)
- movq %rbp, 120(%rsp)
- # Multiply
- # A[0] * B[0]
- movq (%rdi), %rdx
- mulxq 96(%rsp), %r9, %r10
- # A[2] * B[0]
- mulxq 112(%rsp), %r11, %r12
- # A[1] * B[0]
- mulxq 104(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 24(%rdi), %rdx
- mulxq 104(%rsp), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 8(%rdi), %rdx
- mulxq 96(%rsp), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 112(%rsp), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 16(%rdi), %rdx
- mulxq 104(%rsp), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq 96(%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 8(%rdi), %rdx
- mulxq 104(%rsp), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 8(%rdi), %rdx
- adoxq %rcx, %r12
- mulxq 120(%rsp), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 16(%rdi), %rdx
- mulxq 112(%rsp), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 24(%rdi), %rdx
- adoxq %rcx, %r14
- mulxq 120(%rsp), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq 96(%rsp), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq (%rdi), %rdx
- adcxq %rcx, %r13
- mulxq 120(%rsp), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 24(%rdi), %rdx
- mulxq 112(%rsp), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 16(%rdi), %rdx
- adcxq %rcx, %r15
- mulxq 120(%rsp), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
- movq %r12, 56(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 128(%rsp), %rdx
- mulxq (%rsp), %r9, %r10
- # A[2] * B[0]
- mulxq 16(%rsp), %r11, %r12
- # A[1] * B[0]
- mulxq 8(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 152(%rsp), %rdx
- mulxq 8(%rsp), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 136(%rsp), %rdx
- mulxq (%rsp), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 16(%rsp), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 144(%rsp), %rdx
- mulxq 8(%rsp), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq (%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 136(%rsp), %rdx
- mulxq 8(%rsp), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 136(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 24(%rsp), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 144(%rsp), %rdx
- mulxq 16(%rsp), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 152(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 24(%rsp), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq (%rsp), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq 128(%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 24(%rsp), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 152(%rsp), %rdx
- mulxq 16(%rsp), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 144(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 24(%rsp), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rsp)
- movq %r10, 8(%rsp)
- movq %r11, 16(%rsp)
- movq %r12, 24(%rsp)
- # Square
- # A[0] * A[1]
- movq 128(%rsp), %rdx
- mulxq 136(%rsp), %r10, %r11
- # A[0] * A[3]
- mulxq 152(%rsp), %r12, %r13
- # A[2] * A[1]
- movq 144(%rsp), %rdx
- mulxq 136(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adoxq %rcx, %r12
- # A[2] * A[3]
- mulxq 152(%rsp), %r14, %r15
- adoxq %rbx, %r13
- # A[2] * A[0]
- mulxq 128(%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- adcxq %rcx, %r11
- adoxq %rbp, %r15
- # A[1] * A[3]
- movq 136(%rsp), %rdx
- mulxq 152(%rsp), %rax, %r9
- adcxq %rbx, %r12
- adcxq %rax, %r13
- adcxq %r9, %r14
- adcxq %rbp, %r15
- # Double with Carry Flag
- xorq %rbp, %rbp
- # A[0] * A[0]
- movq 128(%rsp), %rdx
- mulxq %rdx, %r9, %rax
- adcxq %r10, %r10
- # A[1] * A[1]
- movq 136(%rsp), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r11, %r11
- adoxq %rax, %r10
- adcxq %r12, %r12
- adoxq %rcx, %r11
- # A[2] * A[2]
- movq 144(%rsp), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r13, %r13
- adoxq %rbx, %r12
- adcxq %r14, %r14
- adoxq %rax, %r13
- # A[3] * A[3]
- movq 152(%rsp), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r15, %r15
- adoxq %rcx, %r14
- adcxq %rbp, %rbp
- adoxq %rax, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rcx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %rax, %r15
- adcxq %rax, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, 96(%rsp)
- movq %r10, 104(%rsp)
- movq %r11, 112(%rsp)
- movq %r12, 120(%rsp)
- # Square
- # A[0] * A[1]
- movq (%rdi), %rdx
- mulxq 8(%rdi), %r10, %r11
- # A[0] * A[3]
- mulxq 24(%rdi), %r12, %r13
- # A[2] * A[1]
- movq 16(%rdi), %rdx
- mulxq 8(%rdi), %rcx, %rbx
- xorq %rbp, %rbp
- adoxq %rcx, %r12
- # A[2] * A[3]
- mulxq 24(%rdi), %r14, %r15
- adoxq %rbx, %r13
- # A[2] * A[0]
- mulxq (%rdi), %rcx, %rbx
- adoxq %rbp, %r14
- adcxq %rcx, %r11
- adoxq %rbp, %r15
- # A[1] * A[3]
- movq 8(%rdi), %rdx
- mulxq 24(%rdi), %rax, %r9
- adcxq %rbx, %r12
- adcxq %rax, %r13
- adcxq %r9, %r14
- adcxq %rbp, %r15
- # Double with Carry Flag
- xorq %rbp, %rbp
- # A[0] * A[0]
- movq (%rdi), %rdx
- mulxq %rdx, %r9, %rax
- adcxq %r10, %r10
- # A[1] * A[1]
- movq 8(%rdi), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r11, %r11
- adoxq %rax, %r10
- adcxq %r12, %r12
- adoxq %rcx, %r11
- # A[2] * A[2]
- movq 16(%rdi), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r13, %r13
- adoxq %rbx, %r12
- adcxq %r14, %r14
- adoxq %rax, %r13
- # A[3] * A[3]
- movq 24(%rdi), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r15, %r15
- adoxq %rcx, %r14
- adcxq %rbp, %rbp
- adoxq %rax, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rcx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %rax, %r15
- adcxq %rax, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, 128(%rsp)
- movq %r10, 136(%rsp)
- movq %r11, 144(%rsp)
- movq %r12, 152(%rsp)
- # Add
- movq 32(%rsp), %r9
- movq 40(%rsp), %r10
- movq 48(%rsp), %r11
- movq 56(%rsp), %rax
- movq %r9, %r13
- addq (%rsp), %r9
- movq %r10, %r14
- adcq 8(%rsp), %r10
- movq %r11, %r15
- adcq 16(%rsp), %r11
- movq %rax, %rbp
- adcq 24(%rsp), %rax
- movq $-19, %rcx
- movq %rax, %r12
- movq $0x7fffffffffffffff, %rbx
- sarq $63, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Sub modulus (if overflow)
- subq %rcx, %r9
- sbbq %rax, %r10
- sbbq %rax, %r11
- sbbq %rbx, %r12
- # Sub
- subq (%rsp), %r13
- movq $0x00, %rax
- sbbq 8(%rsp), %r14
- movq $-19, %rcx
- sbbq 16(%rsp), %r15
- movq $0x7fffffffffffffff, %rbx
- sbbq 24(%rsp), %rbp
- sbbq $0x00, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Add modulus (if underflow)
- addq %rcx, %r13
- adcq %rax, %r14
- adcq %rax, %r15
- adcq %rbx, %rbp
- movq %r9, 64(%rsp)
- movq %r10, 72(%rsp)
- movq %r11, 80(%rsp)
- movq %r12, 88(%rsp)
- movq %r13, (%rsp)
- movq %r14, 8(%rsp)
- movq %r15, 16(%rsp)
- movq %rbp, 24(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 96(%rsp), %rdx
- mulxq 128(%rsp), %r9, %r10
- # A[2] * B[0]
- mulxq 144(%rsp), %r11, %r12
- # A[1] * B[0]
- mulxq 136(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 120(%rsp), %rdx
- mulxq 136(%rsp), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 104(%rsp), %rdx
- mulxq 128(%rsp), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 144(%rsp), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 112(%rsp), %rdx
- mulxq 136(%rsp), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq 128(%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 104(%rsp), %rdx
- mulxq 136(%rsp), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 104(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 152(%rsp), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 112(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 120(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 152(%rsp), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq 128(%rsp), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq 96(%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 152(%rsp), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 120(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 112(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 152(%rsp), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rdi)
- movq %r10, 8(%rdi)
- movq %r11, 16(%rdi)
- movq %r12, 24(%rdi)
- # Sub
- movq 128(%rsp), %r9
- movq 136(%rsp), %r10
- movq 144(%rsp), %r11
- movq 152(%rsp), %r12
- subq 96(%rsp), %r9
- movq $0x00, %rax
- sbbq 104(%rsp), %r10
- movq $-19, %rcx
- sbbq 112(%rsp), %r11
- movq $0x7fffffffffffffff, %rbx
- sbbq 120(%rsp), %r12
- sbbq $0x00, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Add modulus (if underflow)
- addq %rcx, %r9
- adcq %rax, %r10
- adcq %rax, %r11
- adcq %rbx, %r12
- movq %r9, 128(%rsp)
- movq %r10, 136(%rsp)
- movq %r11, 144(%rsp)
- movq %r12, 152(%rsp)
- # Square
- # A[0] * A[1]
- movq (%rsp), %rdx
- mulxq 8(%rsp), %r10, %r11
- # A[0] * A[3]
- mulxq 24(%rsp), %r12, %r13
- # A[2] * A[1]
- movq 16(%rsp), %rdx
- mulxq 8(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adoxq %rcx, %r12
- # A[2] * A[3]
- mulxq 24(%rsp), %r14, %r15
- adoxq %rbx, %r13
- # A[2] * A[0]
- mulxq (%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- adcxq %rcx, %r11
- adoxq %rbp, %r15
- # A[1] * A[3]
- movq 8(%rsp), %rdx
- mulxq 24(%rsp), %rax, %r9
- adcxq %rbx, %r12
- adcxq %rax, %r13
- adcxq %r9, %r14
- adcxq %rbp, %r15
- # Double with Carry Flag
- xorq %rbp, %rbp
- # A[0] * A[0]
- movq (%rsp), %rdx
- mulxq %rdx, %r9, %rax
- adcxq %r10, %r10
- # A[1] * A[1]
- movq 8(%rsp), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r11, %r11
- adoxq %rax, %r10
- adcxq %r12, %r12
- adoxq %rcx, %r11
- # A[2] * A[2]
- movq 16(%rsp), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r13, %r13
- adoxq %rbx, %r12
- adcxq %r14, %r14
- adoxq %rax, %r13
- # A[3] * A[3]
- movq 24(%rsp), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r15, %r15
- adoxq %rcx, %r14
- adcxq %rbp, %rbp
- adoxq %rax, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rcx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %rax, %r15
- adcxq %rax, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rsp)
- movq %r10, 8(%rsp)
- movq %r11, 16(%rsp)
- movq %r12, 24(%rsp)
- movq $0x1db42, %rdx
- mulxq 128(%rsp), %r9, %rbp
- mulxq 136(%rsp), %r10, %r15
- mulxq 144(%rsp), %r11, %r14
- mulxq 152(%rsp), %r12, %r13
- addq %rbp, %r10
- adcq %r15, %r11
- adcq %r14, %r12
- adcq $0x00, %r13
- movq $0x7fffffffffffffff, %rbp
- shldq $0x01, %r12, %r13
- andq %rbp, %r12
- imulq $19, %r13, %r13
- addq %r13, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
- movq %r12, 56(%rsp)
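- # Note: in the usual ladder notation the block above multiplies
- # E = AA - BB (held at 128(%rsp)) by (A + 2)/4 = 121666 into 32(%rsp);
- # the Add further down forms BB + 121666*E, and the later multiply by
- # E yields the new z2.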
- # Square
- # A[0] * A[1]
- movq 64(%rsp), %rdx
- mulxq 72(%rsp), %r10, %r11
- # A[0] * A[3]
- mulxq 88(%rsp), %r12, %r13
- # A[2] * A[1]
- movq 80(%rsp), %rdx
- mulxq 72(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adoxq %rcx, %r12
- # A[2] * A[3]
- mulxq 88(%rsp), %r14, %r15
- adoxq %rbx, %r13
- # A[2] * A[0]
- mulxq 64(%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- adcxq %rcx, %r11
- adoxq %rbp, %r15
- # A[1] * A[3]
- movq 72(%rsp), %rdx
- mulxq 88(%rsp), %rax, %r9
- adcxq %rbx, %r12
- adcxq %rax, %r13
- adcxq %r9, %r14
- adcxq %rbp, %r15
- # Double with Carry Flag
- xorq %rbp, %rbp
- # A[0] * A[0]
- movq 64(%rsp), %rdx
- mulxq %rdx, %r9, %rax
- adcxq %r10, %r10
- # A[1] * A[1]
- movq 72(%rsp), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r11, %r11
- adoxq %rax, %r10
- adcxq %r12, %r12
- adoxq %rcx, %r11
- # A[2] * A[2]
- movq 80(%rsp), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r13, %r13
- adoxq %rbx, %r12
- adcxq %r14, %r14
- adoxq %rax, %r13
- # A[3] * A[3]
- movq 88(%rsp), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r15, %r15
- adoxq %rcx, %r14
- adcxq %rbp, %rbp
- adoxq %rax, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rcx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r13, %rax, %r13
- adcxq %rax, %r9
- adoxq %r13, %r10
- mulxq %r14, %rax, %r14
- adcxq %rax, %r10
- adoxq %r14, %r11
- mulxq %r15, %rax, %r15
- adcxq %rax, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rax
- andq %rcx, %r12
- addq %rax, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, 64(%rsp)
- movq %r10, 72(%rsp)
- movq %r11, 80(%rsp)
- movq %r12, 88(%rsp)
- # Add
- movq 96(%rsp), %r9
- movq 104(%rsp), %r10
- addq 32(%rsp), %r9
- movq 112(%rsp), %r11
- adcq 40(%rsp), %r10
- movq 120(%rsp), %rax
- adcq 48(%rsp), %r11
- movq $-19, %rcx
- adcq 56(%rsp), %rax
- movq $0x7fffffffffffffff, %rbx
- movq %rax, %r12
- sarq $63, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Sub modulus (if overflow)
- subq %rcx, %r9
- sbbq %rax, %r10
- sbbq %rax, %r11
- sbbq %rbx, %r12
- movq %r9, 96(%rsp)
- movq %r10, 104(%rsp)
- movq %r11, 112(%rsp)
- movq %r12, 120(%rsp)
- # Multiply
- # A[0] * B[0]
- movq (%rsp), %rdx
- mulxq (%r8), %r9, %r10
- # A[2] * B[0]
- mulxq 16(%r8), %r11, %r12
- # A[1] * B[0]
- mulxq 8(%r8), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 24(%rsp), %rdx
- mulxq 8(%r8), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 8(%rsp), %rdx
- mulxq (%r8), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 16(%r8), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 16(%rsp), %rdx
- mulxq 8(%r8), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq (%r8), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 8(%rsp), %rdx
- mulxq 8(%r8), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 8(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 24(%r8), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 16(%rsp), %rdx
- mulxq 16(%r8), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 24(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 24(%r8), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq (%r8), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq (%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 24(%r8), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 24(%rsp), %rdx
- mulxq 16(%r8), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 16(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 24(%r8), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
- movq %r12, 56(%rsp)
- # Multiply
- # A[0] * B[0]
- movq 96(%rsp), %rdx
- mulxq 128(%rsp), %r9, %r10
- # A[2] * B[0]
- mulxq 144(%rsp), %r11, %r12
- # A[1] * B[0]
- mulxq 136(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 120(%rsp), %rdx
- mulxq 136(%rsp), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 104(%rsp), %rdx
- mulxq 128(%rsp), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 144(%rsp), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 112(%rsp), %rdx
- mulxq 136(%rsp), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq 128(%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 104(%rsp), %rdx
- mulxq 136(%rsp), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 104(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 152(%rsp), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 112(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 120(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 152(%rsp), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq 128(%rsp), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq 96(%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 152(%rsp), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 120(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 112(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 152(%rsp), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rsp)
- movq %r10, 8(%rsp)
- movq %r11, 16(%rsp)
- movq %r12, 24(%rsp)
- decb 168(%rsp)
- jge L_curve25519_avx2_bits
- movq $63, 168(%rsp)
- decb 160(%rsp)
- jge L_curve25519_avx2_words
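- # Note: ladder finished. z2 is now inverted with the same a^(p-2)
- # addition chain used by fe_invert_avx2, inlined here over stack
- # temporaries, and the result is then multiplied by x2 so the affine
- # u-coordinate x2 * z2^-1 can be written through the saved output
- # pointer.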
- # Invert
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- movq %rsp, %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 128(%rsp), %rsi
- movq $19, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 128(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $9, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 128(%rsp), %rdi
- leaq 128(%rsp), %rsi
- movq $0x63, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 128(%rsp), %rsi
- leaq 96(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 96(%rsp), %rdi
- leaq 96(%rsp), %rsi
- movq $49, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 96(%rsp), %rsi
- leaq 64(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movq $4, %rdx
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq 176(%rsp), %rdi
- # Multiply
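-        # Finish the ladder: multiply by the inverted Z to produce the
-        # affine x-coordinate, written to the output point.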
- # A[0] * B[0]
- movq (%rsp), %rdx
- mulxq (%rdi), %r9, %r10
- # A[2] * B[0]
- mulxq 16(%rdi), %r11, %r12
- # A[1] * B[0]
- mulxq 8(%rdi), %rcx, %rbx
- xorq %rbp, %rbp
- adcxq %rcx, %r10
- # A[1] * B[3]
- movq 24(%rsp), %rdx
- mulxq 8(%rdi), %r13, %r14
- adcxq %rbx, %r11
- # A[0] * B[1]
- movq 8(%rsp), %rdx
- mulxq (%rdi), %rcx, %rbx
- adoxq %rcx, %r10
- # A[2] * B[1]
- mulxq 16(%rdi), %rcx, %r15
- adoxq %rbx, %r11
- adcxq %rcx, %r12
- # A[1] * B[2]
- movq 16(%rsp), %rdx
- mulxq 8(%rdi), %rcx, %rbx
- adcxq %r15, %r13
- adoxq %rcx, %r12
- adcxq %rbp, %r14
- adoxq %rbx, %r13
- # A[0] * B[2]
- mulxq (%rdi), %rcx, %rbx
- adoxq %rbp, %r14
- xorq %r15, %r15
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 8(%rsp), %rdx
- mulxq 8(%rdi), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 8(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 24(%rdi), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 16(%rsp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 24(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 24(%rdi), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq (%rdi), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq (%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 24(%rdi), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 24(%rsp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 16(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 24(%rdi), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rdi)
- movq %r10, 8(%rdi)
- movq %r11, 16(%rdi)
- movq %r12, 24(%rdi)
- xorq %rax, %rax
- addq $0xc0, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size curve25519_avx2,.-curve25519_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_pow22523_avx2
-.type fe_pow22523_avx2,@function
-.align 4
-fe_pow22523_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_pow22523_avx2
-.p2align 2
-_fe_pow22523_avx2:
-#endif /* __APPLE__ */
- subq $0x70, %rsp
- # pow22523
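-        # Compute z^((p-5)/8) = z^(2^252 - 3), used for the square-root
-        # step of point decompression; same chain style as fe_invert.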
- movq %rdi, 96(%rsp)
- movq %rsi, 104(%rsp)
- movq %rsp, %rdi
- movq 104(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq 104(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movb $4, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movb $9, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movb $19, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movb $9, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movb $49, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 64(%rsp), %rdi
- leaq 64(%rsp), %rsi
- movb $0x63, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 64(%rsp), %rsi
- leaq 32(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- leaq 32(%rsp), %rdi
- leaq 32(%rsp), %rsi
- movb $49, %dl
-#ifndef __APPLE__
- callq fe_sq_n_avx2@plt
-#else
- callq _fe_sq_n_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- leaq 32(%rsp), %rsi
- movq %rsp, %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- movq %rsp, %rdi
- movq %rsp, %rsi
-#ifndef __APPLE__
- callq fe_sq_avx2@plt
-#else
- callq _fe_sq_avx2
-#endif /* __APPLE__ */
- movq 96(%rsp), %rdi
- movq %rsp, %rsi
- movq 104(%rsp), %rdx
-#ifndef __APPLE__
- callq fe_mul_avx2@plt
-#else
- callq _fe_mul_avx2
-#endif /* __APPLE__ */
- movq 104(%rsp), %rsi
- movq 96(%rsp), %rdi
- addq $0x70, %rsp
- repz retq
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p2_avx2
-.type fe_ge_to_p2_avx2,@function
-.align 4
-fe_ge_to_p2_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p2_avx2
-.p2align 2
-_fe_ge_to_p2_avx2:
-#endif /* __APPLE__ */
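-        # Convert from completed (P1xP1) to projective (P2) coordinates:
-        # three inlined field multiplications (X*T, Y*Z, Z*T of the input
-        # point), in the style of ref10's ge_p1p1_to_p2.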
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40, %rsp
- movq %rsi, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rcx, 16(%rsp)
- movq %r8, 24(%rsp)
- movq %r9, 32(%rsp)
- movq 16(%rsp), %rsi
- movq 88(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 88(%rsp), %rsi
- # Multiply
- # A[0] * B[0]
- movq (%rsi), %rdx
- mulxq (%rbx), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rbx), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rbx), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rsi), %rdx
- mulxq 8(%rbx), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rsi), %rdx
- mulxq (%rbx), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rbx), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rsi), %rdx
- mulxq 8(%rbx), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rbx), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rsi), %rdx
- mulxq 8(%rbx), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rsi), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rbx), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rsi), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rsi), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rbx), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rbx), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rsi), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rbx), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rsi), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rsi), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rbx), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $40, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p3_avx2
-.type fe_ge_to_p3_avx2,@function
-.align 4
-fe_ge_to_p3_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p3_avx2
-.p2align 2
-_fe_ge_to_p3_avx2:
-#endif /* __APPLE__ */
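-        # Convert from completed (P1xP1) to extended (P3) coordinates:
-        # four inlined field multiplications (X*T, Y*Z, Z*T, X*Y of the
-        # input point), in the style of ref10's ge_p1p1_to_p3.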
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40, %rsp
- movq %rsi, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rcx, 16(%rsp)
- movq %r8, 24(%rsp)
- movq %r9, 32(%rsp)
- movq 24(%rsp), %rsi
- movq 96(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq (%rsp), %rdi
- movq 32(%rsp), %rsi
- movq 88(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq 96(%rsp), %rsi
- # Multiply
- # A[0] * B[0]
- movq (%rsi), %rdx
- mulxq (%rbx), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rbx), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rbx), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rsi), %rdx
- mulxq 8(%rbx), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rsi), %rdx
- mulxq (%rbx), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rbx), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rsi), %rdx
- mulxq 8(%rbx), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rbx), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rsi), %rdx
- mulxq 8(%rbx), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rsi), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rbx), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rsi), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rsi), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rbx), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rbx), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rsi), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rbx), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rsi), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rsi), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rbx), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 32(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- addq $40, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_dbl_avx2
-.type fe_ge_dbl_avx2,@function
-.align 4
-fe_ge_dbl_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_dbl_avx2
-.p2align 2
-_fe_ge_dbl_avx2:
-#endif /* __APPLE__ */
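-        # Point doubling producing a completed (P1xP1) result: squarings
-        # of the input x and y, a doubled squaring of z (2*z^2), and the
-        # field additions/subtractions of the Edwards doubling formulas,
-        # all inlined below.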
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $48, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq 32(%rsp), %rsi
- # Square
- # A[0] * A[1]
- movq (%rsi), %rdx
- mulxq 8(%rsi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rsi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rsi), %rdx
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rsi), %r13, %r14
- adoxq %rax, %r12
- # A[2] * A[0]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rsi), %rdx
- mulxq 24(%rsi), %rbp, %r8
- adcxq %rax, %r11
- adcxq %rbp, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rsi), %rdx
- mulxq %rdx, %r8, %rbp
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rsi), %rdx
- mulxq %rdx, %rcx, %rax
- adcxq %r10, %r10
- adoxq %rbp, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rsi), %rdx
- mulxq %rdx, %rbp, %rcx
- adcxq %r12, %r12
- adoxq %rax, %r11
- adcxq %r13, %r13
- adoxq %rbp, %r12
- # A[3] * A[3]
- movq 24(%rsi), %rdx
- mulxq %rdx, %rbp, %rax
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rbp, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rbp, %r12
- adcxq %rbp, %r8
- adoxq %r12, %r9
- mulxq %r13, %rbp, %r13
- adcxq %rbp, %r9
- adoxq %r13, %r10
- mulxq %r14, %rbp, %r14
- adcxq %rbp, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 16(%rsp), %rdi
- movq 40(%rsp), %rbx
- # Square
- # A[0] * A[1]
- movq (%rbx), %rdx
- mulxq 8(%rbx), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rbx), %r11, %r12
- # A[2] * A[1]
- movq 16(%rbx), %rdx
- mulxq 8(%rbx), %rcx, %rax
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rbx), %r13, %r14
- adoxq %rax, %r12
- # A[2] * A[0]
- mulxq (%rbx), %rcx, %rax
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rbx), %rdx
- mulxq 24(%rbx), %rbp, %r8
- adcxq %rax, %r11
- adcxq %rbp, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rbx), %rdx
- mulxq %rdx, %r8, %rbp
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rbx), %rdx
- mulxq %rdx, %rcx, %rax
- adcxq %r10, %r10
- adoxq %rbp, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rbx), %rdx
- mulxq %rdx, %rbp, %rcx
- adcxq %r12, %r12
- adoxq %rax, %r11
- adcxq %r13, %r13
- adoxq %rbp, %r12
- # A[3] * A[3]
- movq 24(%rbx), %rdx
- mulxq %rdx, %rbp, %rax
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rbp, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rbp, %r12
- adcxq %rbp, %r8
- adoxq %r12, %r9
- mulxq %r13, %rbp, %r13
- adcxq %rbp, %r9
- adoxq %r13, %r10
- mulxq %r14, %rbp, %r14
- adcxq %rbp, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- # Add
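-        # Field addition with lazy reduction: conditionally subtract
-        # p = 2^255 - 19 when bit 255 of the raw sum is set (the
-        # arithmetic-shift mask below selects p's limbs).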
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq (%rbx), %r8
- movq 16(%rsi), %r10
- adcq 8(%rbx), %r9
- movq 24(%rsi), %rdx
- adcq 16(%rbx), %r10
- movq $-19, %rcx
- adcq 24(%rbx), %rdx
- movq $0x7fffffffffffffff, %rax
- movq %rdx, %r11
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 24(%rsp), %rsi
- # Square
- # A[0] * A[1]
- movq (%rdi), %rdx
- mulxq 8(%rdi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rdi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rdi), %rdx
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rdi), %r13, %r14
- adoxq %rax, %r12
- # A[2] * A[0]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rdi), %rdx
- mulxq 24(%rdi), %rbp, %r8
- adcxq %rax, %r11
- adcxq %rbp, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rdi), %rdx
- mulxq %rdx, %r8, %rbp
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rdi), %rdx
- mulxq %rdx, %rcx, %rax
- adcxq %r10, %r10
- adoxq %rbp, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rdi), %rdx
- mulxq %rdx, %rbp, %rcx
- adcxq %r12, %r12
- adoxq %rax, %r11
- adcxq %r13, %r13
- adoxq %rbp, %r12
- # A[3] * A[3]
- movq 24(%rdi), %rdx
- mulxq %rdx, %rbp, %rax
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rbp, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rcx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rcx, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rcx, %rcx
- mulxq %r12, %rbp, %r12
- adcxq %rbp, %r8
- adoxq %r12, %r9
- mulxq %r13, %rbp, %r13
- adcxq %rbp, %r9
- adoxq %r13, %r10
- mulxq %r14, %rbp, %r14
- adcxq %rbp, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rcx, %rdx
- adcxq %rcx, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rcx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rbp
- andq %rcx, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 16(%rsp), %rsi
- movq (%rsp), %rbx
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %rdx
- movq %r8, %r12
- addq (%rbx), %r8
- movq %r9, %r13
- adcq 8(%rbx), %r9
- movq %r10, %r14
- adcq 16(%rbx), %r10
- movq %rdx, %r15
- adcq 24(%rbx), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbx), %r12
- movq $0x00, %rdx
- sbbq 8(%rbx), %r13
- movq $-19, %rcx
- sbbq 16(%rbx), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbx), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 24(%rsp), %rsi
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rdi), %r8
- movq $0x00, %rdx
- sbbq 8(%rdi), %r9
- movq $-19, %rcx
- sbbq 16(%rdi), %r10
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r11
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r8
- adcq %rdx, %r9
- adcq %rdx, %r10
- adcq %rax, %r11
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 104(%rsp), %rdi
- # Square * 2
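-        # Square and double: the doubling is folded into the reduction
-        # shifts below (upper limbs are shifted up by 2 instead of 1, and
-        # the few top bits that wrap around twice are multiplied by
-        # 19*19 = 0x169).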
- # A[0] * A[1]
- movq (%rdi), %rdx
- mulxq 8(%rdi), %r9, %r10
- # A[0] * A[3]
- mulxq 24(%rdi), %r11, %r12
- # A[2] * A[1]
- movq 16(%rdi), %rdx
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adoxq %rcx, %r11
- # A[2] * A[3]
- mulxq 24(%rdi), %r13, %r14
- adoxq %rax, %r12
- # A[2] * A[0]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- adcxq %rcx, %r10
- adoxq %r15, %r14
- # A[1] * A[3]
- movq 8(%rdi), %rdx
- mulxq 24(%rdi), %rbp, %r8
- adcxq %rax, %r11
- adcxq %rbp, %r12
- adcxq %r8, %r13
- adcxq %r15, %r14
- # Double with Carry Flag
- xorq %r15, %r15
- # A[0] * A[0]
- movq (%rdi), %rdx
- mulxq %rdx, %r8, %rbp
- adcxq %r9, %r9
- # A[1] * A[1]
- movq 8(%rdi), %rdx
- mulxq %rdx, %rcx, %rax
- adcxq %r10, %r10
- adoxq %rbp, %r9
- adcxq %r11, %r11
- adoxq %rcx, %r10
- # A[2] * A[2]
- movq 16(%rdi), %rdx
- mulxq %rdx, %rbp, %rcx
- adcxq %r12, %r12
- adoxq %rax, %r11
- adcxq %r13, %r13
- adoxq %rbp, %r12
- # A[3] * A[3]
- movq 24(%rdi), %rdx
- mulxq %rdx, %rbp, %rax
- adcxq %r14, %r14
- adoxq %rcx, %r13
- adcxq %r15, %r15
- adoxq %rbp, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- xorq %rbp, %rbp
- # Move top half into t4-t7 and remove top bit from t3 and double
- shldq $3, %r15, %rbp
- shldq $2, %r14, %r15
- shldq $2, %r13, %r14
- shldq $2, %r12, %r13
- shldq $2, %r11, %r12
- shldq $0x01, %r10, %r11
- shldq $0x01, %r9, %r10
- shldq $0x01, %r8, %r9
- shlq $0x01, %r8
- andq %rax, %r11
- # Two out left, one in right
- andq %rax, %r15
- # Multiply top bits by 19*19
- imulq $0x169, %rbp, %rcx
- xorq %rax, %rax
- # Multiply top half by 19
- movq $19, %rdx
- adoxq %rcx, %r8
- mulxq %r12, %rbp, %r12
- adcxq %rbp, %r8
- adoxq %r12, %r9
- mulxq %r13, %rbp, %r13
- adcxq %rbp, %r9
- adoxq %r13, %r10
- mulxq %r14, %rbp, %r14
- adcxq %rbp, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rbp
- andq %rax, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rbp
- andq %rax, %r11
- addq %rbp, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 16(%rsp), %rdi
- # Sub
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %r11
- subq (%rdi), %r8
- movq $0x00, %rdx
- sbbq 8(%rdi), %r9
- movq $-19, %rcx
- sbbq 16(%rdi), %r10
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r11
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r8
- adcq %rdx, %r9
- adcq %rdx, %r10
- adcq %rax, %r11
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- addq $48, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- repz retq
-#ifndef __APPLE__
-.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_madd_avx2
-.type fe_ge_madd_avx2,@function
-.align 4
-fe_ge_madd_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_madd_avx2
-.p2align 2
-_fe_ge_madd_avx2:
-#endif /* __APPLE__ */
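-        # Mixed addition producing a completed (P1xP1) result: combine a
-        # point in extended coordinates with a precomputed point (stored
-        # in the usual y+x, y-x, 2*d*x*y form), as in ref10's ge_madd.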
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $48, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq 8(%rsp), %rsi
- movq 40(%rsp), %rbx
- movq 32(%rsp), %rbp
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rbp), %r8
- movq %r9, %r13
- adcq 8(%rbp), %r9
- movq %r10, %r14
- adcq 16(%rbp), %r10
- movq %rdx, %r15
- adcq 24(%rbp), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbp), %r12
- movq $0x00, %rdx
- sbbq 8(%rbp), %r13
- movq $-19, %rcx
- sbbq 16(%rbp), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbp), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 16(%rsp), %rbx
- movq 128(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rdi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rdi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rdi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rdi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rdi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rdi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rdi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rdi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rdi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rdi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rdi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rdi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 136(%rsp), %rdi
- # Multiply
- # A[0] * B[0]
- movq (%rdi), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rdi), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rdi), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rdi), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rdi), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rdi), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rdi), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rdi), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rdi), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rdi), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rdi), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 24(%rsp), %rdi
- movq 120(%rsp), %rsi
- movq 112(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rdi
- movq (%rsp), %rsi
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rdi), %r8
- movq %r9, %r13
- adcq 8(%rdi), %r9
- movq %r10, %r14
- adcq 16(%rdi), %r10
- movq %rdx, %r15
- adcq 24(%rdi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rdi), %r12
- movq $0x00, %rdx
- sbbq 8(%rdi), %r13
- movq $-19, %rcx
- sbbq 16(%rdi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 104(%rsp), %rdi
- # Double
- movq (%rdi), %r8
- movq 8(%rdi), %r9
- addq %r8, %r8
- movq 16(%rdi), %r10
- adcq %r9, %r9
- movq 24(%rdi), %rdx
- adcq %r10, %r10
- movq $-19, %rcx
- adcq %rdx, %rdx
- movq $0x7fffffffffffffff, %rax
- movq %rdx, %r11
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 24(%rsp), %rdi
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rdi), %r8
- movq %r9, %r13
- adcq 8(%rdi), %r9
- movq %r10, %r14
- adcq 16(%rdi), %r10
- movq %rdx, %r15
- adcq 24(%rdi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rdi), %r12
- movq $0x00, %rdx
- sbbq 8(%rdi), %r13
- movq $-19, %rcx
- sbbq 16(%rdi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq %r12, (%rdi)
- movq %r13, 8(%rdi)
- movq %r14, 16(%rdi)
- movq %r15, 24(%rdi)
- addq $48, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- repz retq
-#ifndef __APPLE__
-.size fe_ge_madd_avx2,.-fe_ge_madd_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_msub_avx2
-.type fe_ge_msub_avx2,@function
-.align 4
-fe_ge_msub_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_msub_avx2
-.p2align 2
-_fe_ge_msub_avx2:
-#endif /* __APPLE__ */
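-        # Mixed subtraction: the same field operations as fe_ge_madd_avx2
-        # with the precomputed point's y+x and y-x operands exchanged, so
-        # the precomputed point is effectively negated before the add.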
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $48, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq 8(%rsp), %rsi
- movq 40(%rsp), %rbx
- movq 32(%rsp), %rbp
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rbp), %r8
- movq %r9, %r13
- adcq 8(%rbp), %r9
- movq %r10, %r14
- adcq 16(%rbp), %r10
- movq %rdx, %r15
- adcq 24(%rbp), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbp), %r12
- movq $0x00, %rdx
- sbbq 8(%rbp), %r13
- movq $-19, %rcx
- sbbq 16(%rbp), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbp), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 16(%rsp), %rbx
- movq 136(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rdi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rdi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rdi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rdi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rdi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rdi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rdi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rdi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rdi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rdi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rdi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rdi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 128(%rsp), %rdi
- # Multiply
- # A[0] * B[0]
- movq (%rdi), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rdi), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rdi), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rdi), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rdi), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rdi), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rdi), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rdi), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rdi), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rdi), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rdi), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 24(%rsp), %rdi
- movq 120(%rsp), %rsi
- movq 112(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq 8(%rsp), %rsi
- movq (%rsp), %rbp
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rsi), %r8
- movq %r9, %r13
- adcq 8(%rsi), %r9
- movq %r10, %r14
- adcq 16(%rsi), %r10
- movq %rdx, %r15
- adcq 24(%rsi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rsi), %r12
- movq $0x00, %rdx
- sbbq 8(%rsi), %r13
- movq $-19, %rcx
- sbbq 16(%rsi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rsi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq %r12, (%rbp)
- movq %r13, 8(%rbp)
- movq %r14, 16(%rbp)
- movq %r15, 24(%rbp)
- movq 104(%rsp), %rsi
- # Double
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- addq %r8, %r8
- movq 16(%rsi), %r10
- adcq %r9, %r9
- movq 24(%rsi), %rdx
- adcq %r10, %r10
- movq $-19, %rcx
- adcq %rdx, %rdx
- movq $0x7fffffffffffffff, %rax
- movq %rdx, %r11
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rdi), %r8
- movq %r9, %r13
- adcq 8(%rdi), %r9
- movq %r10, %r14
- adcq 16(%rdi), %r10
- movq %rdx, %r15
- adcq 24(%rdi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rdi), %r12
- movq $0x00, %rdx
- sbbq 8(%rdi), %r13
- movq $-19, %rcx
- sbbq 16(%rdi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rbx)
- movq %r13, 8(%rbx)
- movq %r14, 16(%rbx)
- movq %r15, 24(%rbx)
- addq $48, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- repz retq
-#ifndef __APPLE__
-.size fe_ge_msub_avx2,.-fe_ge_msub_avx2
-#endif /* __APPLE__ */
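Each # Multiply block above forms the full 512-bit product with MULX feeding the two independent ADCX/ADOX carry chains; the # Reduce, # Overflow and # Reduce if top bit set steps then fold it back below 2^255 using 2^255 ≡ 19 (mod p): the shldq chain moves the bits from 255 upward into t4-t7, those limbs are multiplied by 19 and added into t0-t3, and the same fold is repeated for the one or two bits that can carry back out. A short Python sketch of the fold (illustration only; assumes both inputs are below 2^255):

    import random

    P = 2**255 - 19
    LOW255 = 2**255 - 1

    def fe_mul_reduce(a, b):
        t = a * b                             # 512-bit product (t0..t7)
        t = (t & LOW255) + 19 * (t >> 255)    # "Multiply top half by 19"
        t = (t & LOW255) + 19 * (t >> 255)    # "Overflow": fold the carry out of bit 255
        t = (t & LOW255) + 19 * (t >> 255)    # "Reduce if top bit set"
        return t                              # below 2^255 and congruent to a*b mod p

    a, b = random.randrange(2**255), random.randrange(2**255)
    assert fe_mul_reduce(a, b) % P == (a * b) % P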
-#ifndef __APPLE__
-.text
-.globl fe_ge_add_avx2
-.type fe_ge_add_avx2,@function
-.align 4
-fe_ge_add_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_add_avx2
-.p2align 2
-_fe_ge_add_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq 8(%rsp), %rsi
- movq 40(%rsp), %rbx
- movq 32(%rsp), %rbp
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rbp), %r8
- movq %r9, %r13
- adcq 8(%rbp), %r9
- movq %r10, %r14
- adcq 16(%rbp), %r10
- movq %rdx, %r15
- adcq 24(%rbp), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbp), %r12
- movq $0x00, %rdx
- sbbq 8(%rbp), %r13
- movq $-19, %rcx
- sbbq 16(%rbp), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbp), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 16(%rsp), %rbx
- movq 168(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rdi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rdi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rdi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rdi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rdi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rdi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rdi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rdi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rdi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rdi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rdi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rdi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 176(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 24(%rsp), %rsi
- movq 160(%rsp), %rbx
- movq 144(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rbx), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rbx), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rbx), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rbx), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rbx), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rbx), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rbx), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rbx), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rbx), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rbx), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rbx), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rbx), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rbx), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rbx), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 136(%rsp), %rsi
- movq 152(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rsi
- # Double
- movq (%rdi), %r8
- movq 8(%rdi), %r9
- addq %r8, %r8
- movq 16(%rdi), %r10
- adcq %r9, %r9
- movq 24(%rdi), %rdx
- adcq %r10, %r10
- movq $-19, %rcx
- adcq %rdx, %rdx
- movq $0x7fffffffffffffff, %rax
- movq %rdx, %r11
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 8(%rsp), %rbx
- movq 16(%rsp), %rbp
- # Add
- movq (%rbp), %r8
- movq 8(%rbp), %r9
- movq 16(%rbp), %r10
- movq 24(%rbp), %rdx
- movq %r8, %r12
- addq (%rbx), %r8
- movq %r9, %r13
- adcq 8(%rbx), %r9
- movq %r10, %r14
- adcq 16(%rbx), %r10
- movq %rdx, %r15
- adcq 24(%rbx), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbx), %r12
- movq $0x00, %rdx
- sbbq 8(%rbx), %r13
- movq $-19, %rcx
- sbbq 16(%rbx), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbx), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq %r12, (%rdi)
- movq %r13, 8(%rdi)
- movq %r14, 16(%rdi)
- movq %r15, 24(%rdi)
- movq 24(%rsp), %rdi
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %rdx
- movq %r8, %r12
- addq (%rdi), %r8
- movq %r9, %r13
- adcq 8(%rdi), %r9
- movq %r10, %r14
- adcq 16(%rdi), %r10
- movq %rdx, %r15
- adcq 24(%rdi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rdi), %r12
- movq $0x00, %rdx
- sbbq 8(%rdi), %r13
- movq $-19, %rcx
- sbbq 16(%rdi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rbp)
- movq %r9, 8(%rbp)
- movq %r10, 16(%rbp)
- movq %r11, 24(%rbp)
- movq %r12, (%rdi)
- movq %r13, 8(%rdi)
- movq %r14, 16(%rdi)
- movq %r15, 24(%rdi)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_add_avx2,.-fe_ge_add_avx2
-#endif /* __APPLE__ */
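The routine above is the field-operation sequence of extended-coordinates point addition on the Ed25519 curve: one add/sub pair on the first input's Y and X, four multiplies against the second (cached) point, one doubling, and two final add/sub pairs. As a rough, non-authoritative sketch of that dataflow in Python -- the parameter names, their order, and the cached-point layout (Y+X, Y-X, Z, 2dT) follow the common ref10 convention and are assumptions for illustration, not something visible in this listing:

    P = 2**255 - 19

    def ge_add_sketch(pX, pY, pZ, pT, qYplusX, qYminusX, qZ, qT2d):
        # add/sub pair, four multiplies, one double, two add/sub pairs --
        # the same block structure as fe_ge_add_avx2 above
        a = (pY + pX) * qYplusX % P
        b = (pY - pX) * qYminusX % P
        c = qT2d * pT % P
        d = 2 * pZ * qZ % P
        return (a - b) % P, (a + b) % P, (d + c) % P, (d - c) % P   # X', Y', Z', T'

The mixed variants fe_ge_madd_avx2/fe_ge_msub_avx2 earlier in the file have only three multiplies because the precomputed table point is affine (Z = 1), so the pZ * qZ product reduces to doubling pZ directly.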
-#ifndef __APPLE__
-.text
-.globl fe_ge_sub_avx2
-.type fe_ge_sub_avx2,@function
-.align 4
-fe_ge_sub_avx2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_sub_avx2
-.p2align 2
-_fe_ge_sub_avx2:
-#endif /* __APPLE__ */
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x50, %rsp
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- movq 8(%rsp), %rsi
- movq 40(%rsp), %rbx
- movq 32(%rsp), %rbp
- # Add
- movq (%rbx), %r8
- movq 8(%rbx), %r9
- movq 16(%rbx), %r10
- movq 24(%rbx), %rdx
- movq %r8, %r12
- addq (%rbp), %r8
- movq %r9, %r13
- adcq 8(%rbp), %r9
- movq %r10, %r14
- adcq 16(%rbp), %r10
- movq %rdx, %r15
- adcq 24(%rbp), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbp), %r12
- movq $0x00, %rdx
- sbbq 8(%rbp), %r13
- movq $-19, %rcx
- sbbq 16(%rbp), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbp), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rsi)
- movq %r13, 8(%rsi)
- movq %r14, 16(%rsi)
- movq %r15, 24(%rsi)
- movq 16(%rsp), %rbx
- movq 176(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rdi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rdi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rdi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rdi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rdi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rdi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rdi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rdi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rdi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rdi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rdi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rdi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rdi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rdi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rdi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq 168(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 24(%rsp), %rsi
- movq 160(%rsp), %rbx
- movq 144(%rsp), %rbp
- # Multiply
- # A[0] * B[0]
- movq (%rbp), %rdx
- mulxq (%rbx), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rbx), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rbx), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbp), %rdx
- mulxq 8(%rbx), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbp), %rdx
- mulxq (%rbx), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rbx), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbp), %rdx
- mulxq 8(%rbx), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rbx), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbp), %rdx
- mulxq 8(%rbx), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbp), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rbx), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbp), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbp), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rbx), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rbx), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbp), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rbx), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbp), %rdx
- mulxq 16(%rbx), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbp), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rbx), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 136(%rsp), %rsi
- movq 152(%rsp), %rbx
- # Multiply
- # A[0] * B[0]
- movq (%rbx), %rdx
- mulxq (%rsi), %r8, %r9
- # A[2] * B[0]
- mulxq 16(%rsi), %r10, %r11
- # A[1] * B[0]
- mulxq 8(%rsi), %rcx, %rax
- xorq %r15, %r15
- adcxq %rcx, %r9
- # A[1] * B[3]
- movq 24(%rbx), %rdx
- mulxq 8(%rsi), %r12, %r13
- adcxq %rax, %r10
- # A[0] * B[1]
- movq 8(%rbx), %rdx
- mulxq (%rsi), %rcx, %rax
- adoxq %rcx, %r9
- # A[2] * B[1]
- mulxq 16(%rsi), %rcx, %r14
- adoxq %rax, %r10
- adcxq %rcx, %r11
- # A[1] * B[2]
- movq 16(%rbx), %rdx
- mulxq 8(%rsi), %rcx, %rax
- adcxq %r14, %r12
- adoxq %rcx, %r11
- adcxq %r15, %r13
- adoxq %rax, %r12
- # A[0] * B[2]
- mulxq (%rsi), %rcx, %rax
- adoxq %r15, %r13
- xorq %r14, %r14
- adcxq %rcx, %r10
- # A[1] * B[1]
- movq 8(%rbx), %rdx
- mulxq 8(%rsi), %rdx, %rcx
- adcxq %rax, %r11
- adoxq %rdx, %r10
- # A[3] * B[1]
- movq 8(%rbx), %rdx
- adoxq %rcx, %r11
- mulxq 24(%rsi), %rcx, %rax
- adcxq %rcx, %r12
- # A[2] * B[2]
- movq 16(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rax, %r13
- adoxq %rdx, %r12
- # A[3] * B[3]
- movq 24(%rbx), %rdx
- adoxq %rcx, %r13
- mulxq 24(%rsi), %rcx, %rax
- adoxq %r15, %r14
- adcxq %rcx, %r14
- # A[0] * B[3]
- mulxq (%rsi), %rdx, %rcx
- adcxq %rax, %r15
- xorq %rax, %rax
- adcxq %rdx, %r11
- # A[3] * B[0]
- movq (%rbx), %rdx
- adcxq %rcx, %r12
- mulxq 24(%rsi), %rdx, %rcx
- adoxq %rdx, %r11
- adoxq %rcx, %r12
- # A[2] * B[3]
- movq 24(%rbx), %rdx
- mulxq 16(%rsi), %rdx, %rcx
- adcxq %rdx, %r13
- # A[3] * B[2]
- movq 16(%rbx), %rdx
- adcxq %rcx, %r14
- mulxq 24(%rsi), %rcx, %rdx
- adcxq %rax, %r15
- adoxq %rcx, %r13
- adoxq %rdx, %r14
- adoxq %rax, %r15
- # Reduce
- movq $0x7fffffffffffffff, %rax
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- shldq $0x01, %r11, %r12
- andq %rax, %r11
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rax, %rax
- mulxq %r12, %rcx, %r12
- adcxq %rcx, %r8
- adoxq %r12, %r9
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %r15, %rdx
- adcxq %r15, %r11
- adoxq %rax, %rdx
- adcxq %rax, %rdx
- # Overflow
- shldq $0x01, %r11, %rdx
- movq $0x7fffffffffffffff, %rax
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Reduce if top bit set
- movq %r11, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rax, %r11
- addq %rcx, %r8
- adcq $0x00, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- # Store
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- leaq 48(%rsp), %rsi
- # Double
- movq (%rdi), %r8
- movq 8(%rdi), %r9
- addq %r8, %r8
- movq 16(%rdi), %r10
- adcq %r9, %r9
- movq 24(%rdi), %rdx
- adcq %r10, %r10
- movq $-19, %rcx
- adcq %rdx, %rdx
- movq $0x7fffffffffffffff, %rax
- movq %rdx, %r11
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- movq %r8, (%rsi)
- movq %r9, 8(%rsi)
- movq %r10, 16(%rsi)
- movq %r11, 24(%rsi)
- movq 8(%rsp), %rbx
- movq 16(%rsp), %rbp
- # Add
- movq (%rbp), %r8
- movq 8(%rbp), %r9
- movq 16(%rbp), %r10
- movq 24(%rbp), %rdx
- movq %r8, %r12
- addq (%rbx), %r8
- movq %r9, %r13
- adcq 8(%rbx), %r9
- movq %r10, %r14
- adcq 16(%rbx), %r10
- movq %rdx, %r15
- adcq 24(%rbx), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rbx), %r12
- movq $0x00, %rdx
- sbbq 8(%rbx), %r13
- movq $-19, %rcx
- sbbq 16(%rbx), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rbx), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rbx)
- movq %r9, 8(%rbx)
- movq %r10, 16(%rbx)
- movq %r11, 24(%rbx)
- movq %r12, (%rdi)
- movq %r13, 8(%rdi)
- movq %r14, 16(%rdi)
- movq %r15, 24(%rdi)
- movq 24(%rsp), %rdi
- # Add
- movq (%rsi), %r8
- movq 8(%rsi), %r9
- movq 16(%rsi), %r10
- movq 24(%rsi), %rdx
- movq %r8, %r12
- addq (%rdi), %r8
- movq %r9, %r13
- adcq 8(%rdi), %r9
- movq %r10, %r14
- adcq 16(%rdi), %r10
- movq %rdx, %r15
- adcq 24(%rdi), %rdx
- movq $-19, %rcx
- movq %rdx, %r11
- movq $0x7fffffffffffffff, %rax
- sarq $63, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Sub modulus (if overflow)
- subq %rcx, %r8
- sbbq %rdx, %r9
- sbbq %rdx, %r10
- sbbq %rax, %r11
- # Sub
- subq (%rdi), %r12
- movq $0x00, %rdx
- sbbq 8(%rdi), %r13
- movq $-19, %rcx
- sbbq 16(%rdi), %r14
- movq $0x7fffffffffffffff, %rax
- sbbq 24(%rdi), %r15
- sbbq $0x00, %rdx
- # Mask the modulus
- andq %rdx, %rcx
- andq %rdx, %rax
- # Add modulus (if underflow)
- addq %rcx, %r12
- adcq %rdx, %r13
- adcq %rdx, %r14
- adcq %rax, %r15
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, (%rbp)
- movq %r13, 8(%rbp)
- movq %r14, 16(%rbp)
- movq %r15, 24(%rbp)
- addq $0x50, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
- repz retq
-#ifndef __APPLE__
-.size fe_ge_sub_avx2,.-fe_ge_sub_avx2
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
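fe_ge_sub_avx2 reuses the same block structure as fe_ge_add_avx2 with two differences that are visible in the listing: the first two multiplies take the 168(%rsp)/176(%rsp) operands in the opposite order (the cached point's Y+X and Y-X swap roles), and the final add/sub pair writes its sum and difference to swapped destinations (the Z'/T' combination is inverted). This corresponds to adding the negated cached point; schematically, with the same illustrative naming assumptions as the sketch above:

    P = 2**255 - 19

    def ge_sub_sketch(pX, pY, pZ, pT, qYplusX, qYminusX, qZ, qT2d):
        a = (pY + pX) * qYminusX % P      # Y+X paired with the *other* cached term
        b = (pY - pX) * qYplusX % P
        c = qT2d * pT % P
        d = 2 * pZ * qZ % P
        return (a - b) % P, (a + b) % P, (d - c) % P, (d + c) % P   # X', Y', Z', T'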