Diffstat (limited to 'client/wolfssl/wolfcrypt/src/fe_x25519_asm.S')
| -rw-r--r-- | client/wolfssl/wolfcrypt/src/fe_x25519_asm.S | 16542 |
1 file changed, 0 insertions, 16542 deletions
diff --git a/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S b/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S
deleted file mode 100644
index 6d0f638..0000000
--- a/client/wolfssl/wolfcrypt/src/fe_x25519_asm.S
+++ /dev/null
@@ -1,16542 +0,0 @@
-/* fe_x25519_asm
- *
- * Copyright (C) 2006-2020 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifndef __APPLE__
-.text
-.globl fe_init
-.type fe_init,@function
-.align 4
-fe_init:
-#else
-.section __TEXT,__text
-.globl _fe_init
-.p2align 2
-_fe_init:
-#endif /* __APPLE__ */
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
- movq cpuFlagsSet@GOTPCREL(%rip), %rax
- movl (%rax), %eax
-#else
- movl _cpuFlagsSet(%rip), %eax
-#endif /* __APPLE__ */
- testl %eax, %eax
- je L_fe_init_get_flags
- repz retq
-L_fe_init_get_flags:
-#ifndef __APPLE__
- callq cpuid_get_flags@plt
-#else
- callq _cpuid_get_flags
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq intelFlags@GOTPCREL(%rip), %rdx
- movl %eax, (%rdx)
-#else
- movl %eax, _intelFlags(%rip)
-#endif /* __APPLE__ */
- andl $0x50, %eax
- cmpl $0x50, %eax
- jne L_fe_init_flags_done
-#ifndef __APPLE__
- movq fe_mul_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_mul_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_sq_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_mul121666_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_mul121666_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq2_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_sq2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_sq2_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_invert_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_invert_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_invert_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq curve25519_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _curve25519_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq curve25519_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_pow22523_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_pow22523_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_to_p2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_to_p3_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_to_p3_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_dbl_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_dbl_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_madd_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_madd_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_msub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_msub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_add_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_add_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_add_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
-#else
- leaq _fe_ge_sub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
-#else
- movq %rax, _fe_ge_sub_p(%rip)
-#endif /* __APPLE__ */
-L_fe_init_flags_done:
-#ifndef __APPLE__
- movq cpuFlagsSet@GOTPCREL(%rip), %rdx
- movl $0x1, (%rdx)
-#else
- movl $0x1, _cpuFlagsSet(%rip)
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
- repz retq
-#ifndef __APPLE__
-.size fe_init,.-fe_init
-#endif /* __APPLE__ */
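(For orientation: fe_init is one-shot function-pointer dispatch. The CPU flags are read once, and every fe_* entry point is repointed at its AVX2 variant only when both feature bits checked by the andl/cmpl pair are present. A minimal C sketch of the same pattern, using fe_mul as the representative and declaring the symbols only for the sketch, not wolfSSL's actual C code:)

#include <stdint.h>

extern uint32_t cpuid_get_flags(void);
extern void fe_mul_x64(uint64_t *r, const uint64_t *a, const uint64_t *b);
extern void fe_mul_avx2(uint64_t *r, const uint64_t *a, const uint64_t *b);

static int cpuFlagsSet = 0;
static uint32_t intelFlags = 0;
void (*fe_mul_p)(uint64_t *, const uint64_t *, const uint64_t *) = fe_mul_x64;

void fe_init_sketch(void)
{
    if (cpuFlagsSet)
        return;                        /* flags already probed */
    intelFlags = cpuid_get_flags();
    if ((intelFlags & 0x50) == 0x50)   /* the two feature bits tested above */
        fe_mul_p = fe_mul_avx2;        /* ...likewise fe_sq_p, fe_invert_p, etc. */
    cpuFlagsSet = 1;
}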
-#ifndef __APPLE__
-.text
-.globl fe_frombytes
-.type fe_frombytes,@function
-.align 4
-fe_frombytes:
-#else
-.section __TEXT,__text
-.globl _fe_frombytes
-.p2align 2
-_fe_frombytes:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r9
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- andq %r9, %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_frombytes,.-fe_frombytes
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_tobytes
-.type fe_tobytes,@function
-.align 4
-fe_tobytes:
-#else
-.section __TEXT,__text
-.globl _fe_tobytes
-.p2align 2
-_fe_tobytes:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r10
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- addq $19, %rdx
- adcq $0x00, %rax
- adcq $0x00, %rcx
- adcq $0x00, %r8
- shrq $63, %r8
- imulq $19, %r8, %r9
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- addq %r9, %rdx
- adcq $0x00, %rax
- adcq $0x00, %rcx
- adcq $0x00, %r8
- andq %r10, %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_tobytes,.-fe_tobytes
-#endif /* __APPLE__ */
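(fe_tobytes produces the canonical representative mod p = 2^255 - 19: it trial-adds 19, uses bit 255 of that sum to decide whether the input was >= p, then adds 19 for real and drops bit 255. A C sketch of the same reduction, assuming the input is already below 2^255 as the arithmetic in this file maintains; the same trick reappears in fe_isnonzero and fe_isnegative further down:)

#include <stdint.h>

void fe_tobytes_sketch(uint64_t out[4], const uint64_t h[4])
{
    unsigned __int128 t = (unsigned __int128)h[0] + 19;
    uint64_t fix;
    int i;

    for (i = 1; i < 4; i++)          /* propagate the trial carry */
        t = (t >> 64) + h[i];
    fix = (uint64_t)(t >> 63) * 19;  /* 19 iff h + 19 reached 2^255, i.e. h >= p */
    t = (unsigned __int128)h[0] + fix;
    out[0] = (uint64_t)t;
    for (i = 1; i < 4; i++) {
        t = (t >> 64) + h[i];
        out[i] = (uint64_t)t;
    }
    out[3] &= 0x7fffffffffffffffULL; /* drop bit 255 */
}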
-#ifndef __APPLE__
-.text
-.globl fe_1
-.type fe_1,@function
-.align 4
-fe_1:
-#else
-.section __TEXT,__text
-.globl _fe_1
-.p2align 2
-_fe_1:
-#endif /* __APPLE__ */
- # Set one
- movq $0x01, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_1,.-fe_1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_0
-.type fe_0,@function
-.align 4
-fe_0:
-#else
-.section __TEXT,__text
-.globl _fe_0
-.p2align 2
-_fe_0:
-#endif /* __APPLE__ */
- # Set zero
- movq $0x00, (%rdi)
- movq $0x00, 8(%rdi)
- movq $0x00, 16(%rdi)
- movq $0x00, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_0,.-fe_0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_copy
-.type fe_copy,@function
-.align 4
-fe_copy:
-#else
-.section __TEXT,__text
-.globl _fe_copy
-.p2align 2
-_fe_copy:
-#endif /* __APPLE__ */
- # Copy
- movq (%rsi), %rdx
- movq 8(%rsi), %rax
- movq 16(%rsi), %rcx
- movq 24(%rsi), %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_copy,.-fe_copy
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sub
-.type fe_sub,@function
-.align 4
-fe_sub:
-#else
-.section __TEXT,__text
-.globl _fe_sub
-.p2align 2
-_fe_sub:
-#endif /* __APPLE__ */
- pushq %r12
- # Sub
- movq (%rsi), %rax
- movq 8(%rsi), %rcx
- movq 16(%rsi), %r8
- movq 24(%rsi), %r9
- subq (%rdx), %rax
- movq $0x00, %r10
- sbbq 8(%rdx), %rcx
- movq $-19, %r11
- sbbq 16(%rdx), %r8
- movq $0x7fffffffffffffff, %r12
- sbbq 24(%rdx), %r9
- sbbq $0x00, %r10
- # Mask the modulus
- andq %r10, %r11
- andq %r10, %r12
- # Add modulus (if underflow)
- addq %r11, %rax
- adcq %r10, %rcx
- adcq %r10, %r8
- adcq %r12, %r9
- movq %rax, (%rdi)
- movq %rcx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_sub,.-fe_sub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_add
-.type fe_add,@function
-.align 4
-fe_add:
-#else
-.section __TEXT,__text
-.globl _fe_add
-.p2align 2
-_fe_add:
-#endif /* __APPLE__ */
- pushq %r12
- # Add
- movq (%rsi), %rax
- movq 8(%rsi), %rcx
- addq (%rdx), %rax
- movq 16(%rsi), %r8
- adcq 8(%rdx), %rcx
- movq 24(%rsi), %r10
- adcq 16(%rdx), %r8
- movq $-19, %r11
- adcq 24(%rdx), %r10
- movq $0x7fffffffffffffff, %r12
- movq %r10, %r9
- sarq $63, %r10
- # Mask the modulus
- andq %r10, %r11
- andq %r10, %r12
- # Sub modulus (if overflow)
- subq %r11, %rax
- sbbq %r10, %rcx
- sbbq %r10, %r8
- sbbq %r12, %r9
- movq %rax, (%rdi)
- movq %rcx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_add,.-fe_add
-#endif /* __APPLE__ */
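(fe_add and fe_sub never branch on the data: they build an all-ones mask from the overflow/underflow bit, AND the limbs of p = 2^255 - 19 with it, and apply the masked correction unconditionally. A C sketch of fe_add's version, not wolfSSL's code; fe_sub mirrors it with a masked *addition* of p after the borrow:)

#include <stdint.h>

void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    unsigned __int128 t = 0;
    uint64_t mask, p[4], borrow = 0;
    int i;

    for (i = 0; i < 4; i++) {
        t += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }
    mask = (uint64_t)((int64_t)r[3] >> 63); /* all-ones iff bit 255 set */
    p[0] = mask & 0xffffffffffffffedULL;    /* low limb of p, i.e. -19 */
    p[1] = mask;
    p[2] = mask;
    p[3] = mask & 0x7fffffffffffffffULL;
    for (i = 0; i < 4; i++) {               /* subtract p under the mask */
        unsigned __int128 d = (unsigned __int128)r[i] - p[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}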
-#ifndef __APPLE__
-.text
-.globl fe_neg
-.type fe_neg,@function
-.align 4
-fe_neg:
-#else
-.section __TEXT,__text
-.globl _fe_neg
-.p2align 2
-_fe_neg:
-#endif /* __APPLE__ */
- movq $-19, %rdx
- movq $-1, %rax
- movq $-1, %rcx
- movq $0x7fffffffffffffff, %r8
- subq (%rsi), %rdx
- sbbq 8(%rsi), %rax
- sbbq 16(%rsi), %rcx
- sbbq 24(%rsi), %r8
- movq %rdx, (%rdi)
- movq %rax, 8(%rdi)
- movq %rcx, 16(%rdi)
- movq %r8, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_neg,.-fe_neg
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_cmov
-.type fe_cmov,@function
-.align 4
-fe_cmov:
-#else
-.section __TEXT,__text
-.globl _fe_cmov
-.p2align 2
-_fe_cmov:
-#endif /* __APPLE__ */
- cmpl $0x01, %edx
- movq (%rdi), %rcx
- movq 8(%rdi), %r8
- movq 16(%rdi), %r9
- movq 24(%rdi), %r10
- cmoveq (%rsi), %rcx
- cmoveq 8(%rsi), %r8
- cmoveq 16(%rsi), %r9
- cmoveq 24(%rsi), %r10
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- repz retq
-#ifndef __APPLE__
-.size fe_cmov,.-fe_cmov
-#endif /* __APPLE__ */
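(fe_cmov selects between two field elements without a data-dependent branch: it compares the flag against 1 and uses cmove so that both elements are read and all four limbs are written regardless of the flag. A mask-based C sketch of the same contract:)

#include <stdint.h>

/* b must be 0 or 1; f becomes g exactly when b == 1. Every limb is
 * rewritten either way, so the memory access pattern and instruction
 * stream do not depend on b. */
void fe_cmov_sketch(uint64_t f[4], const uint64_t g[4], uint64_t b)
{
    uint64_t mask = (uint64_t)0 - b;   /* 0 or all-ones */
    int i;

    for (i = 0; i < 4; i++)
        f[i] ^= mask & (f[i] ^ g[i]);
}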
-#ifndef __APPLE__
-.text
-.globl fe_isnonzero
-.type fe_isnonzero,@function
-.align 4
-fe_isnonzero:
-#else
-.section __TEXT,__text
-.globl _fe_isnonzero
-.p2align 2
-_fe_isnonzero:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r10
- movq (%rdi), %rax
- movq 8(%rdi), %rdx
- movq 16(%rdi), %rcx
- movq 24(%rdi), %r8
- addq $19, %rax
- adcq $0x00, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- shrq $63, %r8
- imulq $19, %r8, %r9
- movq (%rdi), %rax
- movq 8(%rdi), %rdx
- movq 16(%rdi), %rcx
- movq 24(%rdi), %r8
- addq %r9, %rax
- adcq $0x00, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- andq %r10, %r8
- orq %rdx, %rax
- orq %rcx, %rax
- orq %r8, %rax
- repz retq
-#ifndef __APPLE__
-.size fe_isnonzero,.-fe_isnonzero
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_isnegative
-.type fe_isnegative,@function
-.align 4
-fe_isnegative:
-#else
-.section __TEXT,__text
-.globl _fe_isnegative
-.p2align 2
-_fe_isnegative:
-#endif /* __APPLE__ */
- movq $0x7fffffffffffffff, %r11
- movq (%rdi), %rdx
- movq 8(%rdi), %rcx
- movq 16(%rdi), %r8
- movq 24(%rdi), %r9
- movq %rdx, %rax
- addq $19, %rdx
- adcq $0x00, %rcx
- adcq $0x00, %r8
- adcq $0x00, %r9
- shrq $63, %r9
- imulq $19, %r9, %r10
- addq %r10, %rax
- andq $0x01, %rax
- repz retq
-#ifndef __APPLE__
-.size fe_isnegative,.-fe_isnegative
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_cmov_table
-.type fe_cmov_table,@function
-.align 4
-fe_cmov_table:
-#else
-.section __TEXT,__text
-.globl _fe_cmov_table
-.p2align 2
-_fe_cmov_table:
-#endif /* __APPLE__ */
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rdx, %rcx
- movsbq %cl, %rax
- cdq
- xorb %dl, %al
- subb %dl, %al
- movb %al, %r15b
- movq $0x01, %rax
- xorq %rdx, %rdx
- xorq %r8, %r8
- xorq %r9, %r9
- movq $0x01, %r10
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- cmpb $0x01, %r15b
- movq (%rsi), %r14
- cmoveq %r14, %rax
- movq 8(%rsi), %r14
- cmoveq %r14, %rdx
- movq 16(%rsi), %r14
- cmoveq %r14, %r8
- movq 24(%rsi), %r14
- cmoveq %r14, %r9
- movq 32(%rsi), %r14
- cmoveq %r14, %r10
- movq 40(%rsi), %r14
- cmoveq %r14, %r11
- movq 48(%rsi), %r14
- cmoveq %r14, %r12
- movq 56(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $2, %r15b
- movq 96(%rsi), %r14
- cmoveq %r14, %rax
- movq 104(%rsi), %r14
- cmoveq %r14, %rdx
- movq 112(%rsi), %r14
- cmoveq %r14, %r8
- movq 120(%rsi), %r14
- cmoveq %r14, %r9
- movq 128(%rsi), %r14
- cmoveq %r14, %r10
- movq 136(%rsi), %r14
- cmoveq %r14, %r11
- movq 144(%rsi), %r14
- cmoveq %r14, %r12
- movq 152(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $3, %r15b
- movq 192(%rsi), %r14
- cmoveq %r14, %rax
- movq 200(%rsi), %r14
- cmoveq %r14, %rdx
- movq 208(%rsi), %r14
- cmoveq %r14, %r8
- movq 216(%rsi), %r14
- cmoveq %r14, %r9
- movq 224(%rsi), %r14
- cmoveq %r14, %r10
- movq 232(%rsi), %r14
- cmoveq %r14, %r11
- movq 240(%rsi), %r14
- cmoveq %r14, %r12
- movq 248(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $4, %r15b
- movq 288(%rsi), %r14
- cmoveq %r14, %rax
- movq 296(%rsi), %r14
- cmoveq %r14, %rdx
- movq 304(%rsi), %r14
- cmoveq %r14, %r8
- movq 312(%rsi), %r14
- cmoveq %r14, %r9
- movq 320(%rsi), %r14
- cmoveq %r14, %r10
- movq 328(%rsi), %r14
- cmoveq %r14, %r11
- movq 336(%rsi), %r14
- cmoveq %r14, %r12
- movq 344(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $5, %r15b
- movq 384(%rsi), %r14
- cmoveq %r14, %rax
- movq 392(%rsi), %r14
- cmoveq %r14, %rdx
- movq 400(%rsi), %r14
- cmoveq %r14, %r8
- movq 408(%rsi), %r14
- cmoveq %r14, %r9
- movq 416(%rsi), %r14
- cmoveq %r14, %r10
- movq 424(%rsi), %r14
- cmoveq %r14, %r11
- movq 432(%rsi), %r14
- cmoveq %r14, %r12
- movq 440(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $6, %r15b
- movq 480(%rsi), %r14
- cmoveq %r14, %rax
- movq 488(%rsi), %r14
- cmoveq %r14, %rdx
- movq 496(%rsi), %r14
- cmoveq %r14, %r8
- movq 504(%rsi), %r14
- cmoveq %r14, %r9
- movq 512(%rsi), %r14
- cmoveq %r14, %r10
- movq 520(%rsi), %r14
- cmoveq %r14, %r11
- movq 528(%rsi), %r14
- cmoveq %r14, %r12
- movq 536(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $7, %r15b
- movq 576(%rsi), %r14
- cmoveq %r14, %rax
- movq 584(%rsi), %r14
- cmoveq %r14, %rdx
- movq 592(%rsi), %r14
- cmoveq %r14, %r8
- movq 600(%rsi), %r14
- cmoveq %r14, %r9
- movq 608(%rsi), %r14
- cmoveq %r14, %r10
- movq 616(%rsi), %r14
- cmoveq %r14, %r11
- movq 624(%rsi), %r14
- cmoveq %r14, %r12
- movq 632(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $8, %r15b
- movq 672(%rsi), %r14
- cmoveq %r14, %rax
- movq 680(%rsi), %r14
- cmoveq %r14, %rdx
- movq 688(%rsi), %r14
- cmoveq %r14, %r8
- movq 696(%rsi), %r14
- cmoveq %r14, %r9
- movq 704(%rsi), %r14
- cmoveq %r14, %r10
- movq 712(%rsi), %r14
- cmoveq %r14, %r11
- movq 720(%rsi), %r14
- cmoveq %r14, %r12
- movq 728(%rsi), %r14
- cmoveq %r14, %r13
- cmpb $0x00, %cl
- movq %rax, %r14
- cmovlq %r10, %rax
- cmovlq %r14, %r10
- movq %rdx, %r14
- cmovlq %r11, %rdx
- cmovlq %r14, %r11
- movq %r8, %r14
- cmovlq %r12, %r8
- cmovlq %r14, %r12
- movq %r9, %r14
- cmovlq %r13, %r9
- cmovlq %r14, %r13
- movq %rax, (%rdi)
- movq %rdx, 8(%rdi)
- movq %r8, 16(%rdi)
- movq %r9, 24(%rdi)
- movq %r10, 32(%rdi)
- movq %r11, 40(%rdi)
- movq %r12, 48(%rdi)
- movq %r13, 56(%rdi)
- xorq %rax, %rax
- xorq %rdx, %rdx
- xorq %r8, %r8
- xorq %r9, %r9
- cmpb $0x01, %r15b
- movq 64(%rsi), %r14
- cmoveq %r14, %rax
- movq 72(%rsi), %r14
- cmoveq %r14, %rdx
- movq 80(%rsi), %r14
- cmoveq %r14, %r8
- movq 88(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $2, %r15b
- movq 160(%rsi), %r14
- cmoveq %r14, %rax
- movq 168(%rsi), %r14
- cmoveq %r14, %rdx
- movq 176(%rsi), %r14
- cmoveq %r14, %r8
- movq 184(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $3, %r15b
- movq 256(%rsi), %r14
- cmoveq %r14, %rax
- movq 264(%rsi), %r14
- cmoveq %r14, %rdx
- movq 272(%rsi), %r14
- cmoveq %r14, %r8
- movq 280(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $4, %r15b
- movq 352(%rsi), %r14
- cmoveq %r14, %rax
- movq 360(%rsi), %r14
- cmoveq %r14, %rdx
- movq 368(%rsi), %r14
- cmoveq %r14, %r8
- movq 376(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $5, %r15b
- movq 448(%rsi), %r14
- cmoveq %r14, %rax
- movq 456(%rsi), %r14
- cmoveq %r14, %rdx
- movq 464(%rsi), %r14
- cmoveq %r14, %r8
- movq 472(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $6, %r15b
- movq 544(%rsi), %r14
- cmoveq %r14, %rax
- movq 552(%rsi), %r14
- cmoveq %r14, %rdx
- movq 560(%rsi), %r14
- cmoveq %r14, %r8
- movq 568(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $7, %r15b
- movq 640(%rsi), %r14
- cmoveq %r14, %rax
- movq 648(%rsi), %r14
- cmoveq %r14, %rdx
- movq 656(%rsi), %r14
- cmoveq %r14, %r8
- movq 664(%rsi), %r14
- cmoveq %r14, %r9
- cmpb $8, %r15b
- movq 736(%rsi), %r14
- cmoveq %r14, %rax
- movq 744(%rsi), %r14
- cmoveq %r14, %rdx
- movq 752(%rsi), %r14
- cmoveq %r14, %r8
- movq 760(%rsi), %r14
- cmoveq %r14, %r9
- movq $-19, %r10
- movq $-1, %r11
- movq $-1, %r12
- movq $0x7fffffffffffffff, %r13
- subq %rax, %r10
- sbbq %rdx, %r11
- sbbq %r8, %r12
- sbbq %r9, %r13
- cmpb $0x00, %cl
- cmovlq %r10, %rax
- cmovlq %r11, %rdx
- cmovlq %r12, %r8
- cmovlq %r13, %r9
- movq %rax, 64(%rdi)
- movq %rdx, 72(%rdi)
- movq %r8, 80(%rdi)
- movq %r9, 88(%rdi)
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- repz retq
-#ifndef __APPLE__
-.size fe_cmov_table,.-fe_cmov_table
-#endif /* __APPLE__ */
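(fe_cmov_table reads all eight table entries and keeps entry |b| via cmove, then swaps the first two field elements and negates the third when b is negative, so neither the addresses touched nor the branches taken depend on the secret index. The 96-byte stride and the swap/negate step suggest the usual ed25519 precomputed-point layout of three field elements per entry; that layout, and this C rendering, are assumptions for the sketch, not wolfSSL's code:)

#include <stdint.h>
#include <string.h>

typedef struct { uint64_t yplusx[4], yminusx[4], xy2d[4]; } ge_precomp;

void cmov_table_sketch(ge_precomp *r, const ge_precomp table[8], signed char b)
{
    int8_t s = b >> 7;                      /* -1 if b < 0, else 0 */
    uint8_t babs = (uint8_t)((b ^ s) - s);  /* |b|, branch-free */
    uint64_t neg = (uint64_t)(int64_t)s;    /* all-ones if b < 0 */
    ge_precomp t;
    int i, j;

    memset(&t, 0, sizeof(t));
    t.yplusx[0] = t.yminusx[0] = 1;         /* neutral entry for b == 0 */
    for (i = 0; i < 8; i++) {               /* scan every entry */
        uint64_t m = (uint64_t)0 - (uint64_t)(babs == (uint8_t)(i + 1));
        for (j = 0; j < 4; j++) {
            t.yplusx[j] ^= m & (t.yplusx[j] ^ table[i].yplusx[j]);
            t.yminusx[j] ^= m & (t.yminusx[j] ^ table[i].yminusx[j]);
            t.xy2d[j] ^= m & (t.xy2d[j] ^ table[i].xy2d[j]);
        }
    }
    for (j = 0; j < 4; j++) {               /* b < 0: swap y+x and y-x */
        uint64_t w = neg & (t.yplusx[j] ^ t.yminusx[j]);
        t.yplusx[j] ^= w;
        t.yminusx[j] ^= w;
    }
    /* b < 0 also replaces xy2d by p - xy2d (the subq/sbbq block above);
     * omitted here to keep the sketch short. */
    *r = t;
}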
-#ifndef __APPLE__
-.text
-.globl fe_mul
-.type fe_mul,@function
-.align 4
-fe_mul:
-#else
-.section __TEXT,__text
-.globl _fe_mul
-.p2align 2
-_fe_mul:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_mul_p(%rip)
-#else
- jmpq *_fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_mul,.-fe_mul
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq
-.type fe_sq,@function
-.align 4
-fe_sq:
-#else
-.section __TEXT,__text
-.globl _fe_sq
-.p2align 2
-_fe_sq:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_sq_p(%rip)
-#else
- jmpq *_fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_sq,.-fe_sq
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_mul121666
-.type fe_mul121666,@function
-.align 4
-fe_mul121666:
-#else
-.section __TEXT,__text
-.globl _fe_mul121666
-.p2align 2
-_fe_mul121666:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_mul121666_p(%rip)
-#else
- jmpq *_fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_mul121666,.-fe_mul121666
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_sq2
-.type fe_sq2,@function
-.align 4
-fe_sq2:
-#else
-.section __TEXT,__text
-.globl _fe_sq2
-.p2align 2
-_fe_sq2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_sq2_p(%rip)
-#else
- jmpq *_fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_sq2,.-fe_sq2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_invert
-.type fe_invert,@function
-.align 4
-fe_invert:
-#else
-.section __TEXT,__text
-.globl _fe_invert
-.p2align 2
-_fe_invert:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_invert_p(%rip)
-#else
- jmpq *_fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_invert,.-fe_invert
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl curve25519
-.type curve25519,@function
-.align 4
-curve25519:
-#else
-.section __TEXT,__text
-.globl _curve25519
-.p2align 2
-_curve25519:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *curve25519_p(%rip)
-#else
- jmpq *_curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size curve25519,.-curve25519
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_pow22523
-.type fe_pow22523,@function
-.align 4
-fe_pow22523:
-#else
-.section __TEXT,__text
-.globl _fe_pow22523
-.p2align 2
-_fe_pow22523:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_pow22523_p(%rip)
-#else
- jmpq *_fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_pow22523,.-fe_pow22523
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p2
-.type fe_ge_to_p2,@function
-.align 4
-fe_ge_to_p2:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p2
-.p2align 2
-_fe_ge_to_p2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
- jmpq *fe_ge_to_p2_p(%rip)
-#else
- jmpq *_fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size fe_ge_to_p2,.-fe_ge_to_p2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl fe_ge_to_p3
-.type fe_ge_to_p3,@function
-.align 4
-fe_ge_to_p3:
-#else
-.section __TEXT,__text
-.globl _fe_ge_to_p3
-.p2align 2
-_fe_ge_to_p3:
-#endif /* __APPLE__ */
-#ifndef __APPLE__ - jmpq *fe_ge_to_p3_p(%rip) -#else - jmpq *_fe_ge_to_p3_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_to_p3,.-fe_ge_to_p3 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_dbl -.type fe_ge_dbl,@function -.align 4 -fe_ge_dbl: -#else -.section __TEXT,__text -.globl _fe_ge_dbl -.p2align 2 -_fe_ge_dbl: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_ge_dbl_p(%rip) -#else - jmpq *_fe_ge_dbl_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_dbl,.-fe_ge_dbl -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_madd -.type fe_ge_madd,@function -.align 4 -fe_ge_madd: -#else -.section __TEXT,__text -.globl _fe_ge_madd -.p2align 2 -_fe_ge_madd: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_ge_madd_p(%rip) -#else - jmpq *_fe_ge_madd_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_madd,.-fe_ge_madd -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_msub -.type fe_ge_msub,@function -.align 4 -fe_ge_msub: -#else -.section __TEXT,__text -.globl _fe_ge_msub -.p2align 2 -_fe_ge_msub: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_ge_msub_p(%rip) -#else - jmpq *_fe_ge_msub_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_msub,.-fe_ge_msub -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_add -.type fe_ge_add,@function -.align 4 -fe_ge_add: -#else -.section __TEXT,__text -.globl _fe_ge_add -.p2align 2 -_fe_ge_add: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_ge_add_p(%rip) -#else - jmpq *_fe_ge_add_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_add,.-fe_ge_add -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_sub -.type fe_ge_sub,@function -.align 4 -fe_ge_sub: -#else -.section __TEXT,__text -.globl _fe_ge_sub -.p2align 2 -_fe_ge_sub: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_ge_sub_p(%rip) -#else - jmpq *_fe_ge_sub_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_ge_sub,.-fe_ge_sub -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type cpuFlagsSet, @object -.size cpuFlagsSet,4 -cpuFlagsSet: - .long 0 -#else -.section __DATA,__data -.p2align 2 -_cpuFlagsSet: - .long 0 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type intelFlags, @object -.size intelFlags,4 -intelFlags: - .long 0 -#else -.section __DATA,__data -.p2align 2 -_intelFlags: - .long 0 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_mul_p, @object -.size fe_mul_p,8 -fe_mul_p: - .quad fe_mul_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_mul_p: - .quad _fe_mul_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_sq_p, @object -.size fe_sq_p,8 -fe_sq_p: - .quad fe_sq_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_sq_p: - .quad _fe_sq_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_mul121666_p, @object -.size fe_mul121666_p,8 -fe_mul121666_p: - .quad fe_mul121666_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_mul121666_p: - .quad _fe_mul121666_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_sq2_p, @object -.size fe_sq2_p,8 -fe_sq2_p: - .quad fe_sq2_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_sq2_p: - .quad _fe_sq2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_invert_p, @object -.size fe_invert_p,8 -fe_invert_p: - .quad fe_invert_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_invert_p: - .quad _fe_invert_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type curve25519_p, @object -.size curve25519_p,8 -curve25519_p: - .quad 
curve25519_x64 -#else -.section __DATA,__data -.p2align 2 -_curve25519_p: - .quad _curve25519_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_pow22523_p, @object -.size fe_pow22523_p,8 -fe_pow22523_p: - .quad fe_pow22523_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_pow22523_p: - .quad _fe_pow22523_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_to_p2_p, @object -.size fe_ge_to_p2_p,8 -fe_ge_to_p2_p: - .quad fe_ge_to_p2_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_to_p2_p: - .quad _fe_ge_to_p2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_to_p3_p, @object -.size fe_ge_to_p3_p,8 -fe_ge_to_p3_p: - .quad fe_ge_to_p3_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_to_p3_p: - .quad _fe_ge_to_p3_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_dbl_p, @object -.size fe_ge_dbl_p,8 -fe_ge_dbl_p: - .quad fe_ge_dbl_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_dbl_p: - .quad _fe_ge_dbl_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_madd_p, @object -.size fe_ge_madd_p,8 -fe_ge_madd_p: - .quad fe_ge_madd_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_madd_p: - .quad _fe_ge_madd_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_msub_p, @object -.size fe_ge_msub_p,8 -fe_ge_msub_p: - .quad fe_ge_msub_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_msub_p: - .quad _fe_ge_msub_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_add_p, @object -.size fe_ge_add_p,8 -fe_ge_add_p: - .quad fe_ge_add_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_add_p: - .quad _fe_ge_add_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data -.type fe_ge_sub_p, @object -.size fe_ge_sub_p,8 -fe_ge_sub_p: - .quad fe_ge_sub_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_ge_sub_p: - .quad _fe_ge_sub_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_mul_x64 -.type fe_mul_x64,@function -.align 4 -fe_mul_x64: -#else -.section __TEXT,__text -.globl _fe_mul_x64 -.p2align 2 -_fe_mul_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - movq %rdx, %rcx - # Multiply - # A[0] * B[0] - movq (%rcx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rcx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rcx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rcx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rcx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rcx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rcx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rcx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rcx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rcx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rcx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rcx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rcx), %rax - mulq 
24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rcx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rcx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rcx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_mul_x64,.-fe_mul_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq_x64 -.type fe_sq_x64,@function -.align 4 -fe_sq_x64: -#else -.section __TEXT,__text -.globl _fe_sq_x64 -.p2align 2 -_fe_sq_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - # Square - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r13, %r13 - addq %rax, %r12 - adcq %rdx, %r13 - # Double - xorq %r14, %r14 - addq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq $0x00, %r14 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %r15 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %r15, %r8 - adcq %rax, %r9 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %r15, %r10 - adcq %rax, %r11 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r13 - adcq %rdx, %r14 - addq %r15, %r12 - adcq $0x00, %r13 - adcq $0x00, %r14 - # Reduce - movq $0x7fffffffffffffff, %r15 - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - shldq $0x01, %r10, %r11 - andq %r15, %r10 - # Multiply top half by 19 - movq $19, %rax - mulq %r11 - xorq %r11, %r11 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r11 - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - 
xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - # Add remaining product results in - addq %r11, %r8 - adcq %r12, %r9 - adcq %r13, %r10 - adcq %rax, %r10 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r10, %rdx - imulq $19, %rdx, %rax - andq %r15, %r10 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Reduce if top bit set - movq %r10, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %r15, %r10 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Store - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_sq_x64,.-fe_sq_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq_n_x64 -.type fe_sq_n_x64,@function -.align 4 -fe_sq_n_x64: -#else -.section __TEXT,__text -.globl _fe_sq_n_x64 -.p2align 2 -_fe_sq_n_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - movq %rdx, %rcx -L_fe_sq_n_x64: - # Square - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %r8 - movq %rdx, %rbx - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rbx, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rbx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rbx, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rbx - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rbx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - decb %cl - jnz L_fe_sq_n_x64 - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_sq_n_x64,.-fe_sq_n_x64 
-#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_mul121666_x64 -.type fe_mul121666_x64,@function -.align 4 -fe_mul121666_x64: -#else -.section __TEXT,__text -.globl _fe_mul121666_x64 -.p2align 2 -_fe_mul121666_x64: -#endif /* __APPLE__ */ - pushq %r12 - # Multiply by 121666 - movq $0x1db42, %rax - mulq (%rsi) - xorq %r10, %r10 - movq %rax, %r8 - movq %rdx, %r9 - movq $0x1db42, %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - movq $0x1db42, %rax - mulq 16(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - movq $0x1db42, %rax - mulq 24(%rsi) - movq $0x7fffffffffffffff, %rcx - addq %rax, %r11 - adcq %rdx, %r12 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - movq $19, %rax - mulq %r12 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_mul121666_x64,.-fe_mul121666_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq2_x64 -.type fe_sq2_x64,@function -.align 4 -fe_sq2_x64: -#else -.section __TEXT,__text -.globl _fe_sq2_x64 -.p2align 2 -_fe_sq2_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r13, %r13 - addq %rax, %r12 - adcq %rdx, %r13 - # Double - xorq %r14, %r14 - addq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq $0x00, %r14 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %r15 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %r15, %r8 - adcq %rax, %r9 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %r15, %r10 - adcq %rax, %r11 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r13 - adcq %rdx, %r14 - addq %r15, %r12 - adcq $0x00, %r13 - adcq $0x00, %r14 - # Reduce - movq $0x7fffffffffffffff, %rbx - xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $3, %r14, %rax - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $2, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shldq $0x01, %rcx, %r8 - shlq $0x01, %rcx - andq %rbx, %r10 - # Two out left, one in right - andq %rbx, %r14 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %r15 - # Multiply top half by 19 - movq $19, %rax - mulq %r11 - xorq %r11, %r11 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r11 - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - # Add remaining produce results in - addq %r15, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - adcq %r13, %r10 - adcq %rax, %r10 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r10, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r10 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 
- adcq $0x00, %r10 - # Reduce if top bit set - movq %r10, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r10 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Store - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_sq2_x64,.-fe_sq2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_invert_x64 -.type fe_invert_x64,@function -.align 4 -fe_invert_x64: -#else -.section __TEXT,__text -.globl _fe_invert_x64 -.p2align 2 -_fe_invert_x64: -#endif /* __APPLE__ */ - subq $0x90, %rsp - # Invert - movq %rdi, 128(%rsp) - movq %rsi, 136(%rsp) - movq %rsp, %rdi - movq 136(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq 136(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $19, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 
32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $0x63, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - movq 128(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - addq $0x90, %rsp - repz retq -#ifndef __APPLE__ -.text -.globl curve25519_x64 -.type curve25519_x64,@function -.align 4 -curve25519_x64: -#else -.section __TEXT,__text -.globl _curve25519_x64 -.p2align 2 -_curve25519_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - pushq %rbp - movq %rdx, %r8 - subq $0xb8, %rsp - xorq %rbx, %rbx - movq %rdi, 176(%rsp) - # Set one - movq $0x01, (%rdi) - movq $0x00, 8(%rdi) - movq $0x00, 16(%rdi) - movq $0x00, 24(%rdi) - # Set zero - movq $0x00, (%rsp) - movq $0x00, 8(%rsp) - movq $0x00, 16(%rsp) - movq $0x00, 24(%rsp) - # Set one - movq $0x01, 32(%rsp) - movq $0x00, 40(%rsp) - movq $0x00, 48(%rsp) - movq $0x00, 56(%rsp) - # Copy - movq (%r8), %rcx - movq 8(%r8), %r9 - movq 16(%r8), %r10 - movq 24(%r8), %r11 - movq %rcx, 64(%rsp) - movq %r9, 72(%rsp) - movq %r10, 80(%rsp) - movq %r11, 88(%rsp) - movb $62, 168(%rsp) - movq $3, 160(%rsp) -L_curve25519_x64_words: -L_curve25519_x64_bits: - movq 160(%rsp), %r9 - movb 168(%rsp), %cl - movq (%rsi,%r9,8), %rbp - shrq %cl, %rbp - andq $0x01, %rbp - xorq %rbp, %rbx - negq %rbx - # Conditional Swap - movq (%rdi), %rcx - movq 8(%rdi), %r9 - movq 16(%rdi), %r10 - movq 24(%rdi), %r11 - xorq 64(%rsp), %rcx - xorq 72(%rsp), %r9 - xorq 80(%rsp), %r10 - xorq 88(%rsp), %r11 - andq %rbx, %rcx - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - xorq %rcx, (%rdi) - xorq %r9, 8(%rdi) - xorq %r10, 16(%rdi) - xorq %r11, 24(%rdi) - xorq %rcx, 64(%rsp) - xorq %r9, 72(%rsp) - xorq %r10, 80(%rsp) - xorq %r11, 88(%rsp) - # Conditional Swap - movq (%rsp), %rcx - movq 8(%rsp), %r9 - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - xorq 32(%rsp), %rcx - xorq 40(%rsp), %r9 - xorq 48(%rsp), %r10 - xorq 56(%rsp), %r11 
- andq %rbx, %rcx - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - xorq %rcx, (%rsp) - xorq %r9, 8(%rsp) - xorq %r10, 16(%rsp) - xorq %r11, 24(%rsp) - xorq %rcx, 32(%rsp) - xorq %r9, 40(%rsp) - xorq %r10, 48(%rsp) - xorq %r11, 56(%rsp) - movq %rbp, %rbx - # Add - movq (%rdi), %rcx - movq 8(%rdi), %r9 - movq 16(%rdi), %r10 - movq 24(%rdi), %rbp - movq %rcx, %r12 - addq (%rsp), %rcx - movq %r9, %r13 - adcq 8(%rsp), %r9 - movq %r10, %r14 - adcq 16(%rsp), %r10 - movq %rbp, %r15 - adcq 24(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 - movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 - # Sub - subq (%rsp), %r12 - movq $0x00, %rbp - sbbq 8(%rsp), %r13 - movq $-19, %rax - sbbq 16(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 - movq %rcx, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, 128(%rsp) - movq %r13, 136(%rsp) - movq %r14, 144(%rsp) - movq %r15, 152(%rsp) - # Add - movq 64(%rsp), %rcx - movq 72(%rsp), %r9 - movq 80(%rsp), %r10 - movq 88(%rsp), %rbp - movq %rcx, %r12 - addq 32(%rsp), %rcx - movq %r9, %r13 - adcq 40(%rsp), %r9 - movq %r10, %r14 - adcq 48(%rsp), %r10 - movq %rbp, %r15 - adcq 56(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 - movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 - # Sub - subq 32(%rsp), %r12 - movq $0x00, %rbp - sbbq 40(%rsp), %r13 - movq $-19, %rax - sbbq 48(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx - sbbq 56(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - movq %r12, 96(%rsp) - movq %r13, 104(%rsp) - movq %r14, 112(%rsp) - movq %r15, 120(%rsp) - # Multiply - # A[0] * B[0] - movq (%rdi), %rax - mulq 96(%rsp) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rdi), %rax - mulq 96(%rsp) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rdi), %rax - mulq 104(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rdi), %rax - mulq 96(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rdi), %rax - mulq 104(%rsp) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rdi), %rax - mulq 112(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rdi), %rax - mulq 96(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rdi), %rax - mulq 104(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rdi), %rax - mulq 112(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rdi), %rax - mulq 120(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rdi), %rax - mulq 104(%rsp) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 
16(%rdi), %rax - mulq 112(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rdi), %rax - mulq 120(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rdi), %rax - mulq 112(%rsp) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rdi), %rax - mulq 120(%rsp) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rdi), %rax - mulq 120(%rsp) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) - # Multiply - # A[0] * B[0] - movq 128(%rsp), %rax - mulq (%rsp) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 136(%rsp), %rax - mulq (%rsp) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq 128(%rsp), %rax - mulq 8(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 144(%rsp), %rax - mulq (%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 136(%rsp), %rax - mulq 8(%rsp) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq 128(%rsp), %rax - mulq 16(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 152(%rsp), %rax - mulq (%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 144(%rsp), %rax - mulq 8(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 136(%rsp), %rax - mulq 16(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq 128(%rsp), %rax - mulq 24(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 152(%rsp), %rax - mulq 8(%rsp) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 144(%rsp), %rax - mulq 16(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 136(%rsp), %rax - mulq 24(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 152(%rsp), %rax - mulq 16(%rsp) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 144(%rsp), %rax - mulq 24(%rsp) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 152(%rsp), %rax - mulq 24(%rsp) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - 
shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - # Square - # A[0] * A[1] - movq 128(%rsp), %rax - mulq 136(%rsp) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq 128(%rsp), %rax - mulq 144(%rsp) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq 128(%rsp), %rax - mulq 152(%rsp) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 136(%rsp), %rax - mulq 144(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 136(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 144(%rsp), %rax - mulq 152(%rsp) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq 128(%rsp), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %rbp - # A[1] * A[1] - movq 136(%rsp), %rax - mulq %rax - addq %rbp, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[2] * A[2] - movq 144(%rsp), %rax - mulq %rax - addq %rbp, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[3] * A[3] - movq 152(%rsp), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rbp, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, 96(%rsp) - movq %r9, 104(%rsp) - movq %r10, 112(%rsp) - movq %r11, 120(%rsp) - # Square - # A[0] * A[1] - movq (%rdi), %rax - mulq 8(%rdi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rdi), %rax - mulq 16(%rdi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rdi), %rax - mulq 24(%rdi) - 
xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rdi), %rax - mulq 16(%rdi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rdi), %rax - mulq 24(%rdi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rdi), %rax - mulq 24(%rdi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rdi), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %rbp - # A[1] * A[1] - movq 8(%rdi), %rax - mulq %rax - addq %rbp, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[2] * A[2] - movq 16(%rdi), %rax - mulq %rax - addq %rbp, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[3] * A[3] - movq 24(%rdi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rbp, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) - # Add - movq 32(%rsp), %rcx - movq 40(%rsp), %r9 - movq 48(%rsp), %r10 - movq 56(%rsp), %rbp - movq %rcx, %r12 - addq (%rsp), %rcx - movq %r9, %r13 - adcq 8(%rsp), %r9 - movq %r10, %r14 - adcq 16(%rsp), %r10 - movq %rbp, %r15 - adcq 24(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 - movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 - # Sub - subq (%rsp), %r12 - movq $0x00, %rbp - sbbq 8(%rsp), %r13 - movq $-19, %rax - sbbq 16(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 - movq %rcx, 64(%rsp) - movq %r9, 72(%rsp) - movq %r10, 80(%rsp) - movq %r11, 88(%rsp) - movq %r12, (%rsp) - movq %r13, 8(%rsp) - movq %r14, 16(%rsp) - movq %r15, 24(%rsp) - # Multiply - # A[0] * B[0] - movq 96(%rsp), %rax - mulq 128(%rsp) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 104(%rsp), %rax - mulq 128(%rsp) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq 96(%rsp), %rax - mulq 136(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 112(%rsp), %rax - mulq 128(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 
104(%rsp), %rax - mulq 136(%rsp) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq 96(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 120(%rsp), %rax - mulq 128(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 112(%rsp), %rax - mulq 136(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 104(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq 96(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 120(%rsp), %rax - mulq 136(%rsp) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 112(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 104(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 120(%rsp), %rax - mulq 144(%rsp) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 112(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 120(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - # Sub - movq 128(%rsp), %rcx - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 - subq 96(%rsp), %rcx - movq $0x00, %rbp - sbbq 104(%rsp), %r9 - movq $-19, %rax - sbbq 112(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 120(%rsp), %r11 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) - # Square - # A[0] * A[1] - movq (%rsp), %rax - mulq 8(%rsp) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsp), %rax - mulq 16(%rsp) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsp), %rax - mulq 24(%rsp) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsp), %rax - mulq 16(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsp), %rax - mulq 24(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsp), %rax - mulq 24(%rsp) - xorq 
%r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsp), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %rbp - # A[1] * A[1] - movq 8(%rsp), %rax - mulq %rax - addq %rbp, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[2] * A[2] - movq 16(%rsp), %rax - mulq %rax - addq %rbp, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[3] * A[3] - movq 24(%rsp), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rbp, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - # Multiply by 121666 - movq $0x1db42, %rax - mulq 128(%rsp) - xorq %r10, %r10 - movq %rax, %rcx - movq %rdx, %r9 - movq $0x1db42, %rax - mulq 136(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - movq $0x1db42, %rax - mulq 144(%rsp) - xorq %r13, %r13 - addq %rax, %r10 - adcq %rdx, %r11 - movq $0x1db42, %rax - mulq 152(%rsp) - movq $0x7fffffffffffffff, %r12 - addq %rax, %r11 - adcq %rdx, %r13 - shldq $0x01, %r11, %r13 - andq %r12, %r11 - movq $19, %rax - mulq %r13 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - movq %rcx, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) - # Square - # A[0] * A[1] - movq 64(%rsp), %rax - mulq 72(%rsp) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq 64(%rsp), %rax - mulq 80(%rsp) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq 64(%rsp), %rax - mulq 88(%rsp) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 72(%rsp), %rax - mulq 80(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 72(%rsp), %rax - mulq 88(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 80(%rsp), %rax - mulq 88(%rsp) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq 64(%rsp), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %rbp - # A[1] * A[1] - movq 72(%rsp), %rax - mulq %rax - addq %rbp, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[2] * A[2] - movq 80(%rsp), %rax - mulq %rax - addq %rbp, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rbp - # A[3] * A[3] - movq 88(%rsp), 
%rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rbp, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, 64(%rsp) - movq %r9, 72(%rsp) - movq %r10, 80(%rsp) - movq %r11, 88(%rsp) - # Add - movq 96(%rsp), %rcx - movq 104(%rsp), %r9 - addq 32(%rsp), %rcx - movq 112(%rsp), %r10 - adcq 40(%rsp), %r9 - movq 120(%rsp), %rbp - adcq 48(%rsp), %r10 - movq $-19, %rax - adcq 56(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx - movq %rbp, %r11 - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 - movq %rcx, 96(%rsp) - movq %r9, 104(%rsp) - movq %r10, 112(%rsp) - movq %r11, 120(%rsp) - # Multiply - # A[0] * B[0] - movq (%rsp), %rax - mulq (%r8) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rsp), %rax - mulq (%r8) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rsp), %rax - mulq 8(%r8) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rsp), %rax - mulq (%r8) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rsp), %rax - mulq 8(%r8) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rsp), %rax - mulq 16(%r8) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rsp), %rax - mulq (%r8) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rsp), %rax - mulq 8(%r8) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rsp), %rax - mulq 16(%r8) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rsp), %rax - mulq 24(%r8) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rsp), %rax - mulq 8(%r8) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rsp), %rax - mulq 16(%r8) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rsp), %rax - mulq 24(%r8) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rsp), %rax - mulq 16(%r8) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rsp), %rax - mulq 24(%r8) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rsp), %rax - mulq 24(%r8) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from 
t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) - # Multiply - # A[0] * B[0] - movq 96(%rsp), %rax - mulq 128(%rsp) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 104(%rsp), %rax - mulq 128(%rsp) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq 96(%rsp), %rax - mulq 136(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 112(%rsp), %rax - mulq 128(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 104(%rsp), %rax - mulq 136(%rsp) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq 96(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 120(%rsp), %rax - mulq 128(%rsp) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 112(%rsp), %rax - mulq 136(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 104(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq 96(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 120(%rsp), %rax - mulq 136(%rsp) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 112(%rsp), %rax - mulq 144(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 104(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 120(%rsp), %rax - mulq 144(%rsp) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 112(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 120(%rsp), %rax - mulq 152(%rsp) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - 
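A short distance back, the "# Multiply by 121666" block scales by the Montgomery ladder constant a24 = (A + 2)/4 = (486662 + 2)/4 = 121666 = 0x1db42 for curve25519. Because the multiplier fits in 17 bits, one pass of 64x64 multiplies plus a single fold of everything above bit 254 by 19 is enough, with no second overflow pass. Sketched in C under the same limb assumptions (fe_mul121666 is a hypothetical name):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static void fe_mul121666(uint64_t r[4], const uint64_t a[4])
    {
        const uint64_t mask = 0x7fffffffffffffffULL;
        u128 c = 0;
        uint64_t top;
        int i;

        for (i = 0; i < 4; i++) {              /* r = a * 0x1db42 */
            c += (u128)a[i] * 121666;
            r[i] = (uint64_t)c;
            c >>= 64;
        }
        /* fold bits 255.. back in: ((carry:r3) >> 63) times 19 */
        top  = ((uint64_t)c << 1) | (r[3] >> 63);
        r[3] &= mask;
        c = (u128)r[0] + (u128)19 * top;
        r[0] = (uint64_t)c; c >>= 64;
        for (i = 1; i < 4; i++) {
            c += r[i];
            r[i] = (uint64_t)c;
            c >>= 64;
        }
    }

Like the assembly, this leaves the result only weakly reduced.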
adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - decb 168(%rsp) - jge L_curve25519_x64_bits - movq $63, 168(%rsp) - decb 160(%rsp) - jge L_curve25519_x64_words - # Invert - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - movq %rsp, %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 128(%rsp), %rsi - movq $19, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 128(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - 
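The "# Invert" call sequence running through here (and continuing below) is Fermat inversion: since p = 2^255 - 19 is prime, z^(p - 2) = z^(-1) (mod p), and p - 2 = 2^255 - 21. The fe_sq_n_x64 counts (4, 9, 19, 9, 49, 0x63 = 99, 49, 4) are the standard curve25519 addition chain, 254 squarings and 11 multiplies in all. A C transcription of the complete chain, against hypothetical prototypes shaped like the _x64 helpers (fe_invert_sketch is not a name from this file):

    #include <stdint.h>

    typedef uint64_t fe[4];

    /* hypothetical prototypes mirroring fe_sq_x64 / fe_sq_n_x64 / fe_mul_x64 */
    extern void fe_sq  (fe r, const fe a);
    extern void fe_sq_n(fe r, const fe a, int n);
    extern void fe_mul (fe r, const fe a, const fe b);

    /* z^(p-2) = z^(2^255 - 21) mod p */
    static void fe_invert_sketch(fe r, const fe z)
    {
        fe t0, t1, t2, t3;

        fe_sq(t0, z);                        /* z^2               */
        fe_sq(t1, t0);  fe_sq(t1, t1);       /* z^8               */
        fe_mul(t1, z,  t1);                  /* z^9               */
        fe_mul(t0, t0, t1);                  /* z^11              */
        fe_sq(t2, t0);                       /* z^22              */
        fe_mul(t1, t1, t2);                  /* z^31 = z^(2^5-1)  */
        fe_sq(t2, t1);  fe_sq_n(t2, t2, 4);  /* z^(2^10-2^5)      */
        fe_mul(t1, t2, t1);                  /* z^(2^10-1)        */
        fe_sq(t2, t1);  fe_sq_n(t2, t2, 9);  /* z^(2^20-2^10)     */
        fe_mul(t2, t2, t1);                  /* z^(2^20-1)        */
        fe_sq(t3, t2);  fe_sq_n(t3, t3, 19); /* z^(2^40-2^20)     */
        fe_mul(t2, t3, t2);                  /* z^(2^40-1)        */
        fe_sq(t2, t2);  fe_sq_n(t2, t2, 9);  /* z^(2^50-2^10)     */
        fe_mul(t1, t2, t1);                  /* z^(2^50-1)        */
        fe_sq(t2, t1);  fe_sq_n(t2, t2, 49); /* z^(2^100-2^50)    */
        fe_mul(t2, t2, t1);                  /* z^(2^100-1)       */
        fe_sq(t3, t2);  fe_sq_n(t3, t3, 99); /* z^(2^200-2^100)   */
        fe_mul(t2, t3, t2);                  /* z^(2^200-1)       */
        fe_sq(t2, t2);  fe_sq_n(t2, t2, 49); /* z^(2^250-2^50)    */
        fe_mul(t1, t2, t1);                  /* z^(2^250-1)       */
        fe_sq(t1, t1);  fe_sq_n(t1, t1, 4);  /* z^(2^255-2^5)     */
        fe_mul(r, t1, t0);                   /* z^(2^255-21)      */
    }

The exponent comments make it easy to check the last step: multiplying z^(2^255 - 32) by z^11 gives z^(2^255 - 21) = z^(p - 2).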
leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 128(%rsp), %rsi - movq $0x63, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 128(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq 176(%rsp), %rdi - # Multiply - # A[0] * B[0] - movq (%rsp), %rax - mulq (%rdi) - movq %rax, %rcx - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rsp), %rax - mulq (%rdi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rsp), %rax - mulq 8(%rdi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rsp), %rax - mulq (%rdi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rsp), %rax - mulq 8(%rdi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rsp), %rax - mulq 16(%rdi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rsp), %rax - mulq (%rdi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rsp), %rax - mulq 8(%rdi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rsp), %rax - mulq 16(%rdi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rsp), %rax - mulq 24(%rdi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rsp), %rax - mulq 8(%rdi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rsp), %rax - mulq 16(%rdi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rsp), %rax - mulq 24(%rdi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rsp), %rax - mulq 16(%rdi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rsp), %rax - mulq 24(%rdi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rsp), %rax - mulq 24(%rdi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rbp, %r11 - # Multiply top half by 19 - movq $19, 
%rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %rcx - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %rcx, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - xorq %rax, %rax - addq $0xb8, %rsp - popq %rbp - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size curve25519_x64,.-curve25519_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_pow22523_x64 -.type fe_pow22523_x64,@function -.align 4 -fe_pow22523_x64: -#else -.section __TEXT,__text -.globl _fe_pow22523_x64 -.p2align 2 -_fe_pow22523_x64: -#endif /* __APPLE__ */ - subq $0x70, %rsp - # pow22523 - movq %rdi, 96(%rsp) - movq %rsi, 104(%rsp) - movq %rsp, %rdi - movq 104(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq 104(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $19, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef 
__APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $0x63, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq _fe_sq_n_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - movq 96(%rsp), %rdi - movq %rsp, %rsi - movq 104(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq 104(%rsp), %rsi - movq 96(%rsp), %rdi - addq $0x70, %rsp - repz retq -#ifndef __APPLE__ -.text -.globl fe_ge_to_p2_x64 -.type fe_ge_to_p2_x64,@function -.align 4 -fe_ge_to_p2_x64: -#else -.section __TEXT,__text -.globl _fe_ge_to_p2_x64 -.p2align 2 -_fe_ge_to_p2_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 16(%rsp), %rsi - movq 88(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - 
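The fe_pow22523_x64 routine that finishes just above raises its input to (p - 5)/8. Worked out from p = 2^255 - 19:

    (p - 5)/8 = (2^255 - 24)/8 = 2^252 - 3

so the chain reuses the same ladder up to z^(2^250 - 1), squares twice to reach z^(2^252 - 4), and one final multiply by z lands on z^(2^252 - 3), since 4*(2^250 - 1) + 1 = 2^252 - 3. This is the exponent used to take the square root needed when decompressing Ed25519 points.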
adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq 
%rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 
- addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_to_p3_x64 -.type fe_ge_to_p3_x64,@function -.align 4 -fe_ge_to_p3_x64: -#else -.section __TEXT,__text -.globl _fe_ge_to_p3_x64 -.p2align 2 -_fe_ge_to_p3_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 24(%rsp), %rsi - movq 96(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq 
%rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq 
%r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 88(%rsp), %rsi - movq 96(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * 
B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_dbl_x64 -.type fe_ge_dbl_x64,@function -.align 4 -fe_ge_dbl_x64: -#else -.section __TEXT,__text -.globl _fe_ge_dbl_x64 -.p2align 2 -_fe_ge_dbl_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 32(%rsp), %rsi - # Square - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, 
%r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %r8 - movq %rdx, %rcx - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 40(%rsp), %rsi - # Square - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %r8 - movq %rdx, %rcx - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product 
results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 128(%rsp), %rsi - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %r8 - movq %rdx, %rcx - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 and double - shldq $3, %r15, %rax - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rbx, %r11 - # Two out left, one in right - andq %rbx, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %rcx - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %rcx, %r8 - adcq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 32(%rsp), %rsi - movq 40(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if
overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 8(%rsp), %rsi - # Square - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r12, %r12 - addq %rax, %r11 - adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double - xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq %r14, %r14 - adcq $0x00, %r15 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %r8 - movq %rdx, %rcx - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 - adcq $0x00, %rdx - movq %rdx, %rcx - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - 
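
[Editor's note: the "Add"/"Sub" blocks throughout this file are branch-free. After a raw 256-bit add or subtract, the carry/borrow out of the top limb is smeared into a mask (sarq/sbbq), and the mask selects either zero or the modulus p = 2^255 - 19 for the correction; one register serves both as the mask and as limbs 1-2 of the masked modulus, since those limbs of p are all-ones. A minimal C sketch of the subtract side, assuming GCC/Clang's unsigned __int128 and little-endian 4x64-bit limbs (fe64 and fe64_sub_sketch are illustrative names, not wolfSSL API):

    #include <stdint.h>

    typedef uint64_t fe64[4];

    /* r = a - b (mod 2^255 - 19): subtract, then add p back under a
     * mask derived from the final borrow; no data-dependent branch. */
    static void fe64_sub_sketch(fe64 r, const fe64 a, const fe64 b)
    {
        unsigned __int128 t;
        uint64_t l[4], borrow = 0;
        for (int i = 0; i < 4; i++) {
            t = (unsigned __int128)a[i] - b[i] - borrow;
            l[i] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;      /* 1 on underflow */
        }
        uint64_t mask = 0 - borrow;                /* 0 or all-ones  */
        /* p = { 0xffffffffffffffed, ~0, ~0, 0x7fffffffffffffff } */
        unsigned __int128 c = (unsigned __int128)l[0] +
                              (mask & 0xffffffffffffffedULL);
        r[0] = (uint64_t)c; c >>= 64;
        c += (unsigned __int128)l[1] + mask; r[1] = (uint64_t)c; c >>= 64;
        c += (unsigned __int128)l[2] + mask; r[2] = (uint64_t)c; c >>= 64;
        c += (unsigned __int128)l[3] + (mask & 0x7fffffffffffffffULL);
        r[3] = (uint64_t)c;
    }

The add side is symmetric: add, derive the mask from the top bit of the sum, and conditionally subtract p.]
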
adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - leaq 48(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_dbl_x64,.-fe_ge_dbl_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_madd_x64 -.type fe_ge_madd_x64,@function -.align 4 -fe_ge_madd_x64: -#else -.section __TEXT,__text -.globl _fe_ge_madd_x64 -.p2align 2 -_fe_ge_madd_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 152(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - 
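
[Editor's note: each fe_ge_*_x64 routine takes more pointer arguments than the six System V AMD64 integer registers can carry. The prologue stores rdi/rsi/rdx/rcx/r8/r9 into the 0x50-byte local area at (%rsp)..40(%rsp); the remaining pointers are read from the caller's stack, which, after the five pushes, the subq $0x50, and the return address, starts at 128(%rsp). A small C sketch that reproduces those offsets (purely illustrative):

    #include <stdio.h>

    int main(void)
    {
        const int locals = 0x50;   /* subq $0x50, %rsp */
        const int pushes = 5 * 8;  /* rbx, r12..r15    */
        const int ret    = 8;      /* return address   */
        for (int n = 7; n <= 12; n++)
            printf("arg%d -> %d(%%rsp)\n", n,
                   locals + pushes + ret + (n - 7) * 8);
        return 0;
    }

This prints 128 through 168 in steps of 8, matching the 128(%rsp)..168(%rsp) operands used by the multiplies below.]
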
adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 160(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq 
%r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 144(%rsp), %rsi - movq 136(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, 
%rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 128(%rsp), %rsi - movq 128(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), 
%r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_madd_x64,.-fe_ge_madd_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_msub_x64 -.type fe_ge_msub_x64,@function -.align 4 -fe_ge_msub_x64: -#else -.section __TEXT,__text -.globl _fe_ge_msub_x64 -.p2align 2 -_fe_ge_msub_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 160(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 
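
[Editor's note: every "Multiply" block above and below is a plain 4x4 schoolbook product: the sixteen A[i]*B[j] partial products are accumulated by limb position i+j, with the adcq $0x00 instructions capturing carries into the next column, yielding the full 512-bit result in r8..r15 before reduction. An equivalent C sketch, assuming unsigned __int128 (mul256_sketch is an illustrative name):

    #include <stdint.h>

    /* t[0..7] = a[0..3] * b[0..3], full 512-bit schoolbook product */
    static void mul256_sketch(uint64_t t[8],
                              const uint64_t a[4], const uint64_t b[4])
    {
        unsigned __int128 acc;
        for (int i = 0; i < 8; i++) t[i] = 0;
        for (int i = 0; i < 4; i++) {
            uint64_t carry = 0;
            for (int j = 0; j < 4; j++) {
                acc = (unsigned __int128)a[i] * b[j] + t[i + j] + carry;
                t[i + j] = (uint64_t)acc;
                carry = (uint64_t)(acc >> 64);
            }
            t[i + 4] = carry;
        }
    }

The assembly visits the partial products column by column rather than row by row, which keeps each carry chain short, but the arithmetic is the same.]
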
- # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 152(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, 
%rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 144(%rsp), %rsi - movq 136(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq 
$0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 128(%rsp), %rsi - movq 128(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_msub_x64,.-fe_ge_msub_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_add_x64 -.type fe_ge_add_x64,@function -.align 4 -fe_ge_add_x64: -#else -.section __TEXT,__text -.globl _fe_ge_add_x64 -.p2align 2 -_fe_ge_add_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), 
%rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 160(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, 
%r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 168(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 152(%rsp), %rsi - movq 136(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 
8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 128(%rsp), %rsi - movq 144(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - 
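
[Editor's note: the "Reduce" step rests on 2^255 ≡ 19 (mod p) for p = 2^255 - 19. The shldq chain extracts bits 255..510 of the 512-bit product, that top half is multiplied by 19 and folded into the low 255 bits, and the small residue above bit 255 is folded back the same way twice more (the "Overflow" and "Reduce if top bit set" steps). A C sketch of the same fold, assuming unsigned __int128 (reduce_sketch is an illustrative name):

    #include <stdint.h>

    /* r = t reduced to 255 bits modulo 2^255 - 19; the result is not
     * necessarily fully reduced below p. */
    static void reduce_sketch(uint64_t r[4], const uint64_t t[8])
    {
        const uint64_t M = 0x7fffffffffffffffULL;
        unsigned __int128 acc;
        uint64_t hi[4], c, over;

        /* split at bit 255: hi = bits 255..510, r = low 255 bits */
        hi[0] = (t[3] >> 63) | (t[4] << 1);
        hi[1] = (t[4] >> 63) | (t[5] << 1);
        hi[2] = (t[5] >> 63) | (t[6] << 1);
        hi[3] = (t[6] >> 63) | (t[7] << 1);
        r[0] = t[0]; r[1] = t[1]; r[2] = t[2]; r[3] = t[3] & M;

        c = 0;                         /* r += 19 * hi */
        for (int i = 0; i < 4; i++) {
            acc = (unsigned __int128)19 * hi[i] + r[i] + c;
            r[i] = (uint64_t)acc;
            c   = (uint64_t)(acc >> 64);
        }
        for (int k = 0; k < 2; k++) {  /* fold overflow above bit 255 */
            over = (c << 1) | (r[3] >> 63);
            r[3] &= M;
            acc = (unsigned __int128)19 * over + r[0];
            r[0] = (uint64_t)acc; c = (uint64_t)(acc >> 64);
            for (int i = 1; i < 4; i++) {
                acc = (unsigned __int128)r[i] + c;
                r[i] = (uint64_t)acc; c = (uint64_t)(acc >> 64);
            }
        }
    }

The "Square * 2" variant seen earlier doubles the value while shifting and folds the bits that land above 2^256 with an extra multiply by 19*19 (the 0x169 immediate), but the idea is the same.]
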
# A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq (%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, 
%rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_add_x64,.-fe_ge_add_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_sub_x64 -.type fe_ge_sub_x64,@function -.align 4 -fe_ge_sub_x64: -#else -.section __TEXT,__text -.globl _fe_ge_sub_x64 -.p2align 2 -_fe_ge_sub_x64: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 168(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] 
* B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 160(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 
- adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 152(%rsp), %rsi - movq 136(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 
24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 128(%rsp), %rsi - movq 144(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r13 - adcq %rdx, %r14 - adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq 
%r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq (%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_sub_x64,.-fe_ge_sub_x64 -#endif /* __APPLE__ */ -#ifdef HAVE_INTEL_AVX2 -#ifndef __APPLE__ -.text -.globl fe_mul_avx2 -.type 
fe_mul_avx2,@function -.align 4 -fe_mul_avx2: -#else -.section __TEXT,__text -.globl _fe_mul_avx2 -.p2align 2 -_fe_mul_avx2: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - movq %rdx, %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 - adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rax, %r12 - mulxq 24(%rsi), %rdx, %rax - adoxq %rdx, %r11 - adoxq %rax, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rax, %r14 - mulxq 24(%rsi), %rax, %rdx - adcxq %rcx, %r15 - adoxq %rax, %r13 - adoxq %rdx, %r14 - adoxq %rcx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_mul_avx2,.-fe_mul_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq_avx2 -.type fe_sq_avx2,@function -.align 4 -fe_sq_avx2: -#else -.section __TEXT,__text -.globl _fe_sq_avx2 -.p2align 2 -_fe_sq_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - # Square - # A[0] * A[1] - movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rbx, %r12 - # A[2] * A[0] - 
mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 - adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 - adoxq %rbx, %r11 - adcxq %r13, %r13 - adoxq %rax, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rax, %r14 - adoxq %rbx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_sq_avx2,.-fe_sq_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq_n_avx2 -.type fe_sq_n_avx2,@function -.align 4 -fe_sq_n_avx2: -#else -.section __TEXT,__text -.globl _fe_sq_n_avx2 -.p2align 2 -_fe_sq_n_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rdx, %rbp -L_fe_sq_n_avx2: - # Square - # A[0] * A[1] - movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 - adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 - adoxq %rbx, %r11 - adcxq %r13, %r13 - adoxq %rax, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rax, %r14 - adoxq %rbx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, 
%r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - decb %bpl - jnz L_fe_sq_n_avx2 - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_sq_n_avx2,.-fe_sq_n_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_mul121666_avx2 -.type fe_mul121666_avx2,@function -.align 4 -fe_mul121666_avx2: -#else -.section __TEXT,__text -.globl _fe_mul121666_avx2 -.p2align 2 -_fe_mul121666_avx2: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - movq $0x1db42, %rdx - mulxq (%rsi), %rax, %r13 - mulxq 8(%rsi), %rcx, %r12 - mulxq 16(%rsi), %r8, %r11 - mulxq 24(%rsi), %r9, %r10 - addq %r13, %rcx - adcq %r12, %r8 - adcq %r11, %r9 - adcq $0x00, %r10 - movq $0x7fffffffffffffff, %r13 - shldq $0x01, %r9, %r10 - andq %r13, %r9 - imulq $19, %r10, %r10 - addq %r10, %rax - adcq $0x00, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - movq %rax, (%rdi) - movq %rcx, 8(%rdi) - movq %r8, 16(%rdi) - movq %r9, 24(%rdi) - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_mul121666_avx2,.-fe_mul121666_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_sq2_avx2 -.type fe_sq2_avx2,@function -.align 4 -fe_sq2_avx2: -#else -.section __TEXT,__text -.globl _fe_sq2_avx2 -.p2align 2 -_fe_sq2_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 - adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 - adoxq %rbx, %r11 - adcxq %r13, %r13 - adoxq %rax, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rax, %r14 - adoxq %rbx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 and double - shldq $3, %r15, %rax - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rbx, %r11 - # Two out left, 
one in right - andq %rbx, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %rcx - xorq %rbx, %rbx - # Multiply top half by 19 - movq $19, %rdx - adoxq %rcx, %r8 - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_sq2_avx2,.-fe_sq2_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_invert_avx2 -.type fe_invert_avx2,@function -.align 4 -fe_invert_avx2: -#else -.section __TEXT,__text -.globl _fe_invert_avx2 -.p2align 2 -_fe_invert_avx2: -#endif /* __APPLE__ */ - subq $0x90, %rsp - # Invert - movq %rdi, 128(%rsp) - movq %rsi, 136(%rsp) - movq %rsp, %rdi - movq 136(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq 136(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $19, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - 
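The fe_sq_avx2/fe_sq_n_avx2/fe_mul_avx2 call chain above and below is a field inversion by Fermat's little theorem: it computes z^(p-2) mod p for p = 2^255 - 19, and the fe_sq_n repeat counts (4, 9, 19, 49, 0x63 = 99, 49, 4) are the classic curve25519 addition chain over blocks of exponent bits. A C sketch of the same chain, assuming illustrative fe_sq/fe_sq_n/fe_mul helpers with the obvious signatures:

#include <stdint.h>

typedef uint64_t fe[4];
/* illustrative prototypes matching the asm entry points above */
void fe_sq(fe r, const fe a);
void fe_sq_n(fe r, const fe a, uint64_t n);
void fe_mul(fe r, const fe a, const fe b);

/* r = z^(p-2) = z^(2^255 - 21) mod p: invert by Fermat's little theorem. */
static void fe_invert_sketch(fe r, const fe z)
{
    fe t0, t1, t2, t3;

    fe_sq(t0, z);                        /* z^2           */
    fe_sq(t1, t0);  fe_sq(t1, t1);       /* z^8           */
    fe_mul(t1, z, t1);                   /* z^9           */
    fe_mul(t0, t0, t1);                  /* z^11          */
    fe_sq(t2, t0);                       /* z^22          */
    fe_mul(t1, t1, t2);                  /* z^(2^5-1)     */
    fe_sq(t2, t1);  fe_sq_n(t2, t2, 4);  /* 5 squarings   */
    fe_mul(t1, t2, t1);                  /* z^(2^10-1)    */
    fe_sq(t2, t1);  fe_sq_n(t2, t2, 9);  /* 10 squarings  */
    fe_mul(t2, t2, t1);                  /* z^(2^20-1)    */
    fe_sq(t3, t2);  fe_sq_n(t3, t3, 19); /* 20 squarings  */
    fe_mul(t2, t3, t2);                  /* z^(2^40-1)    */
    fe_sq(t2, t2);  fe_sq_n(t2, t2, 9);  /* 10 squarings  */
    fe_mul(t1, t2, t1);                  /* z^(2^50-1)    */
    fe_sq(t2, t1);  fe_sq_n(t2, t2, 49); /* 50 squarings  */
    fe_mul(t2, t2, t1);                  /* z^(2^100-1)   */
    fe_sq(t3, t2);  fe_sq_n(t3, t3, 99); /* 100 squarings */
    fe_mul(t2, t3, t2);                  /* z^(2^200-1)   */
    fe_sq(t2, t2);  fe_sq_n(t2, t2, 49); /* 50 squarings  */
    fe_mul(t1, t2, t1);                  /* z^(2^250-1)   */
    fe_sq(t1, t1);  fe_sq_n(t1, t1, 4);  /* 5 squarings   */
    fe_mul(r, t1, t0);                   /* z^(2^255-21)  */
}

Each fe_sq_n(x, x, n) stands for n further squarings after the preceding fe_sq, so each pair extends an all-ones exponent block before the next fe_mul stitches two blocks together.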
leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $0x63, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - movq 128(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq 136(%rsp), %rsi - movq 128(%rsp), %rdi - addq $0x90, %rsp - repz retq -#ifndef __APPLE__ -.text -.globl curve25519_avx2 -.type curve25519_avx2,@function -.align 4 -curve25519_avx2: -#else -.section __TEXT,__text -.globl _curve25519_avx2 -.p2align 2 -_curve25519_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rdx, %r8 - subq $0xc0, %rsp - movq $0x00, 184(%rsp) - movq %rdi, 176(%rsp) - # Set one - movq $0x01, (%rdi) - movq $0x00, 8(%rdi) - movq $0x00, 16(%rdi) - movq $0x00, 24(%rdi) - # Set zero - movq $0x00, (%rsp) - movq $0x00, 8(%rsp) - movq $0x00, 16(%rsp) - movq $0x00, 24(%rsp) - # Set one - movq $0x01, 32(%rsp) - movq $0x00, 40(%rsp) - movq $0x00, 48(%rsp) - movq $0x00, 56(%rsp) - # Copy - movq (%r8), %r9 - movq 8(%r8), %r10 - movq 16(%r8), %r11 - movq 24(%r8), %r12 - movq %r9, 64(%rsp) - movq %r10, 72(%rsp) - movq %r11, 80(%rsp) - movq %r12, 88(%rsp) - movb $62, 168(%rsp) - movq $3, 160(%rsp) -L_curve25519_avx2_words: -L_curve25519_avx2_bits: - movq 184(%rsp), %rbx - movq 160(%rsp), %r9 - movb 168(%rsp), %cl - movq 
(%rsi,%r9,8), %rax - shrq %cl, %rax - andq $0x01, %rax - xorq %rax, %rbx - negq %rbx - # Conditional Swap - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - movq 24(%rdi), %r12 - xorq 64(%rsp), %r9 - xorq 72(%rsp), %r10 - xorq 80(%rsp), %r11 - xorq 88(%rsp), %r12 - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - andq %rbx, %r12 - xorq %r9, (%rdi) - xorq %r10, 8(%rdi) - xorq %r11, 16(%rdi) - xorq %r12, 24(%rdi) - xorq %r9, 64(%rsp) - xorq %r10, 72(%rsp) - xorq %r11, 80(%rsp) - xorq %r12, 88(%rsp) - # Conditional Swap - movq (%rsp), %r9 - movq 8(%rsp), %r10 - movq 16(%rsp), %r11 - movq 24(%rsp), %r12 - xorq 32(%rsp), %r9 - xorq 40(%rsp), %r10 - xorq 48(%rsp), %r11 - xorq 56(%rsp), %r12 - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - andq %rbx, %r12 - xorq %r9, (%rsp) - xorq %r10, 8(%rsp) - xorq %r11, 16(%rsp) - xorq %r12, 24(%rsp) - xorq %r9, 32(%rsp) - xorq %r10, 40(%rsp) - xorq %r11, 48(%rsp) - xorq %r12, 56(%rsp) - movq %rax, 184(%rsp) - # Add - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - movq 24(%rdi), %rax - movq %r9, %r13 - addq (%rsp), %r9 - movq %r10, %r14 - adcq 8(%rsp), %r10 - movq %r11, %r15 - adcq 16(%rsp), %r11 - movq %rax, %rbp - adcq 24(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 - movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 - # Sub - subq (%rsp), %r13 - movq $0x00, %rax - sbbq 8(%rsp), %r14 - movq $-19, %rcx - sbbq 16(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) - movq %r13, 128(%rsp) - movq %r14, 136(%rsp) - movq %r15, 144(%rsp) - movq %rbp, 152(%rsp) - # Add - movq 64(%rsp), %r9 - movq 72(%rsp), %r10 - movq 80(%rsp), %r11 - movq 88(%rsp), %rax - movq %r9, %r13 - addq 32(%rsp), %r9 - movq %r10, %r14 - adcq 40(%rsp), %r10 - movq %r11, %r15 - adcq 48(%rsp), %r11 - movq %rax, %rbp - adcq 56(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 - movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 - # Sub - subq 32(%rsp), %r13 - movq $0x00, %rax - sbbq 40(%rsp), %r14 - movq $-19, %rcx - sbbq 48(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx - sbbq 56(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) - movq %r13, 96(%rsp) - movq %r14, 104(%rsp) - movq %r15, 112(%rsp) - movq %rbp, 120(%rsp) - # Multiply - # A[0] * B[0] - movq (%rdi), %rdx - mulxq 96(%rsp), %r9, %r10 - # A[2] * B[0] - mulxq 112(%rsp), %r11, %r12 - # A[1] * B[0] - mulxq 104(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 104(%rsp), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq 96(%rsp), %rcx, %rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 112(%rsp), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 104(%rsp), %rcx, %rbx - adcxq %r15, %r13 - 
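The branch-free "Conditional Swap" blocks above are how the ladder selects between its two working points in constant time: the scalar bit is turned into an all-zeros or all-ones mask (xorq with the previous bit, then negq), and the mask gates an XOR swap. A minimal C sketch of the same masking idea, with an illustrative fe type rather than wolfSSL's internal one:

#include <stdint.h>

typedef uint64_t fe[4];

/* Swap a and b iff swap == 1, in constant time: the same masked-XOR
 * pattern as the "Conditional Swap" blocks above. */
void fe_cswap_sketch(fe a, fe b, uint64_t swap)
{
    uint64_t mask = (uint64_t)0 - swap;      /* 0 or all ones (the negq) */
    for (int i = 0; i < 4; i++) {
        uint64_t t = (a[i] ^ b[i]) & mask;   /* zero when swap == 0 */
        a[i] ^= t;
        b[i] ^= t;
    }
}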
adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq 96(%rsp), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 - # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 104(%rsp), %rdx, %rcx - adcxq %rbx, %r12 - adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rdi), %rdx - adoxq %rcx, %r12 - mulxq 120(%rsp), %rcx, %rbx - adcxq %rcx, %r13 - # A[2] * B[2] - movq 16(%rdi), %rdx - mulxq 112(%rsp), %rdx, %rcx - adcxq %rbx, %r14 - adoxq %rdx, %r13 - # A[3] * B[3] - movq 24(%rdi), %rdx - adoxq %rcx, %r14 - mulxq 120(%rsp), %rcx, %rbx - adoxq %rbp, %r15 - adcxq %rcx, %r15 - # A[0] * B[3] - mulxq 96(%rsp), %rdx, %rcx - adcxq %rbx, %rbp - xorq %rbx, %rbx - adcxq %rdx, %r12 - # A[3] * B[0] - movq (%rdi), %rdx - adcxq %rcx, %r13 - mulxq 120(%rsp), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 - # A[2] * B[3] - movq 24(%rdi), %rdx - mulxq 112(%rsp), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 16(%rdi), %rdx - adcxq %rcx, %r15 - mulxq 120(%rsp), %rcx, %rdx - adcxq %rbx, %rbp - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rbx, %rbx - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %rcx, %r15 - adcxq %rcx, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, 32(%rsp) - movq %r10, 40(%rsp) - movq %r11, 48(%rsp) - movq %r12, 56(%rsp) - # Multiply - # A[0] * B[0] - movq 128(%rsp), %rdx - mulxq (%rsp), %r9, %r10 - # A[2] * B[0] - mulxq 16(%rsp), %r11, %r12 - # A[1] * B[0] - mulxq 8(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 152(%rsp), %rdx - mulxq 8(%rsp), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 136(%rsp), %rdx - mulxq (%rsp), %rcx, %rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 16(%rsp), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 144(%rsp), %rdx - mulxq 8(%rsp), %rcx, %rbx - adcxq %r15, %r13 - adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq (%rsp), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 - # A[1] * B[1] - movq 136(%rsp), %rdx - mulxq 8(%rsp), %rdx, %rcx - adcxq %rbx, %r12 - adoxq %rdx, %r11 - # A[3] * B[1] - movq 136(%rsp), %rdx - adoxq %rcx, %r12 - mulxq 24(%rsp), %rcx, %rbx - adcxq %rcx, %r13 - # A[2] * B[2] - movq 144(%rsp), %rdx - mulxq 16(%rsp), %rdx, %rcx - adcxq %rbx, %r14 - adoxq %rdx, %r13 - # A[3] * B[3] - movq 152(%rsp), %rdx - adoxq %rcx, %r14 - mulxq 24(%rsp), %rcx, %rbx - adoxq %rbp, %r15 - adcxq %rcx, %r15 - # A[0] * B[3] - mulxq (%rsp), %rdx, %rcx - adcxq %rbx, %rbp - xorq %rbx, %rbx - adcxq %rdx, %r12 - # A[3] * B[0] - movq 128(%rsp), %rdx - adcxq %rcx, %r13 - mulxq 24(%rsp), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 - # A[2] * B[3] - movq 152(%rsp), %rdx - mulxq 16(%rsp), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * 
B[2] - movq 144(%rsp), %rdx - adcxq %rcx, %r15 - mulxq 24(%rsp), %rcx, %rdx - adcxq %rbx, %rbp - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rbx, %rbx - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %rcx, %r15 - adcxq %rcx, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) - # Square - # A[0] * A[1] - movq 128(%rsp), %rdx - mulxq 136(%rsp), %r10, %r11 - # A[0] * A[3] - mulxq 152(%rsp), %r12, %r13 - # A[2] * A[1] - movq 144(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adoxq %rcx, %r12 - # A[2] * A[3] - mulxq 152(%rsp), %r14, %r15 - adoxq %rbx, %r13 - # A[2] * A[0] - mulxq 128(%rsp), %rcx, %rbx - adoxq %rbp, %r14 - adcxq %rcx, %r11 - adoxq %rbp, %r15 - # A[1] * A[3] - movq 136(%rsp), %rdx - mulxq 152(%rsp), %rax, %r9 - adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp - # A[0] * A[0] - movq 128(%rsp), %rdx - mulxq %rdx, %r9, %rax - adcxq %r10, %r10 - # A[1] * A[1] - movq 136(%rsp), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 - adoxq %rcx, %r11 - # A[2] * A[2] - movq 144(%rsp), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 - adoxq %rbx, %r12 - adcxq %r14, %r14 - adoxq %rax, %r13 - # A[3] * A[3] - movq 152(%rsp), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 - adoxq %rcx, %r14 - adcxq %rbp, %rbp - adoxq %rax, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, 96(%rsp) - movq %r10, 104(%rsp) - movq %r11, 112(%rsp) - movq %r12, 120(%rsp) - # Square - # A[0] * A[1] - movq (%rdi), %rdx - mulxq 8(%rdi), %r10, %r11 - # A[0] * A[3] - mulxq 24(%rdi), %r12, %r13 - # A[2] * A[1] - movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rbx - xorq %rbp, %rbp - adoxq %rcx, %r12 - # A[2] * A[3] - mulxq 24(%rdi), 
%r14, %r15 - adoxq %rbx, %r13 - # A[2] * A[0] - mulxq (%rdi), %rcx, %rbx - adoxq %rbp, %r14 - adcxq %rcx, %r11 - adoxq %rbp, %r15 - # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rax, %r9 - adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp - # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r9, %rax - adcxq %r10, %r10 - # A[1] * A[1] - movq 8(%rdi), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 - adoxq %rcx, %r11 - # A[2] * A[2] - movq 16(%rdi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 - adoxq %rbx, %r12 - adcxq %r14, %r14 - adoxq %rax, %r13 - # A[3] * A[3] - movq 24(%rdi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 - adoxq %rcx, %r14 - adcxq %rbp, %rbp - adoxq %rax, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, 128(%rsp) - movq %r10, 136(%rsp) - movq %r11, 144(%rsp) - movq %r12, 152(%rsp) - # Add - movq 32(%rsp), %r9 - movq 40(%rsp), %r10 - movq 48(%rsp), %r11 - movq 56(%rsp), %rax - movq %r9, %r13 - addq (%rsp), %r9 - movq %r10, %r14 - adcq 8(%rsp), %r10 - movq %r11, %r15 - adcq 16(%rsp), %r11 - movq %rax, %rbp - adcq 24(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 - movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 - # Sub - subq (%rsp), %r13 - movq $0x00, %rax - sbbq 8(%rsp), %r14 - movq $-19, %rcx - sbbq 16(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp - movq %r9, 64(%rsp) - movq %r10, 72(%rsp) - movq %r11, 80(%rsp) - movq %r12, 88(%rsp) - movq %r13, (%rsp) - movq %r14, 8(%rsp) - movq %r15, 16(%rsp) - movq %rbp, 24(%rsp) - # Multiply - # A[0] * B[0] - movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 - # A[2] * B[0] - mulxq 144(%rsp), %r11, %r12 - # A[1] * B[0] - mulxq 136(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 120(%rsp), %rdx - mulxq 136(%rsp), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 104(%rsp), %rdx - mulxq 128(%rsp), %rcx, %rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 144(%rsp), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 112(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx - adcxq %r15, %r13 - adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 
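Each "# Multiply" block in this routine is a 4x4-limb schoolbook product: the "# A[i] * B[j]" partial products from mulxq are accumulated on two independent carry chains (adcxq and adoxq) so the two carry flags never collide. A portable C sketch of the same 512-bit product, using an unsigned __int128 accumulator in place of the paired carry flags:

#include <stdint.h>

typedef unsigned __int128 uint128_t;

/* t[0..7] = a * b for little-endian 4x64-bit elements: the same schoolbook
 * partial products A[i] * B[j] as the asm, with a 128-bit accumulator in
 * place of the interleaved adcxq/adoxq carry chains. */
static void fe_mul_wide_sketch(uint64_t t[8],
                               const uint64_t a[4], const uint64_t b[4])
{
    for (int k = 0; k < 8; k++)
        t[k] = 0;
    for (int i = 0; i < 4; i++) {
        uint128_t acc = 0;                    /* carries at most 64 bits */
        for (int j = 0; j < 4; j++) {
            acc += (uint128_t)a[i] * b[j] + t[i + j];
            t[i + j] = (uint64_t)acc;
            acc >>= 64;
        }
        t[i + 4] = (uint64_t)acc;
    }
}

The accumulator never overflows: a[i]*b[j] + t[i+j] + carry is at most 2^128 - 1.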
- adcxq %rcx, %r11
- # A[1] * B[1]
- movq 104(%rsp), %rdx
- mulxq 136(%rsp), %rdx, %rcx
- adcxq %rbx, %r12
- adoxq %rdx, %r11
- # A[3] * B[1]
- movq 104(%rsp), %rdx
- adoxq %rcx, %r12
- mulxq 152(%rsp), %rcx, %rbx
- adcxq %rcx, %r13
- # A[2] * B[2]
- movq 112(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rbx, %r14
- adoxq %rdx, %r13
- # A[3] * B[3]
- movq 120(%rsp), %rdx
- adoxq %rcx, %r14
- mulxq 152(%rsp), %rcx, %rbx
- adoxq %rbp, %r15
- adcxq %rcx, %r15
- # A[0] * B[3]
- mulxq 128(%rsp), %rdx, %rcx
- adcxq %rbx, %rbp
- xorq %rbx, %rbx
- adcxq %rdx, %r12
- # A[3] * B[0]
- movq 96(%rsp), %rdx
- adcxq %rcx, %r13
- mulxq 152(%rsp), %rdx, %rcx
- adoxq %rdx, %r12
- adoxq %rcx, %r13
- # A[2] * B[3]
- movq 120(%rsp), %rdx
- mulxq 144(%rsp), %rdx, %rcx
- adcxq %rdx, %r14
- # A[3] * B[2]
- movq 112(%rsp), %rdx
- adcxq %rcx, %r15
- mulxq 152(%rsp), %rcx, %rdx
- adcxq %rbx, %rbp
- adoxq %rcx, %r14
- adoxq %rdx, %r15
- adoxq %rbx, %rbp
- # Reduce
- movq $0x7fffffffffffffff, %rbx
- # Move top half into t4-t7 and remove top bit from t3
- shldq $0x01, %r15, %rbp
- shldq $0x01, %r14, %r15
- shldq $0x01, %r13, %r14
- shldq $0x01, %r12, %r13
- andq %rbx, %r12
- # Multiply top half by 19
- movq $19, %rdx
- xorq %rbx, %rbx
- mulxq %r13, %rcx, %r13
- adcxq %rcx, %r9
- adoxq %r13, %r10
- mulxq %r14, %rcx, %r14
- adcxq %rcx, %r10
- adoxq %r14, %r11
- mulxq %r15, %rcx, %r15
- adcxq %rcx, %r11
- adoxq %r15, %r12
- mulxq %rbp, %rbp, %rdx
- adcxq %rbp, %r12
- adoxq %rbx, %rdx
- adcxq %rbx, %rdx
- # Overflow
- shldq $0x01, %r12, %rdx
- movq $0x7fffffffffffffff, %rbx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Reduce if top bit set
- movq %r12, %rdx
- shrq $63, %rdx
- imulq $19, %rdx, %rcx
- andq %rbx, %r12
- addq %rcx, %r9
- adcq $0x00, %r10
- adcq $0x00, %r11
- adcq $0x00, %r12
- # Store
- movq %r9, (%rdi)
- movq %r10, 8(%rdi)
- movq %r11, 16(%rdi)
- movq %r12, 24(%rdi)
- # Sub
- movq 128(%rsp), %r9
- movq 136(%rsp), %r10
- movq 144(%rsp), %r11
- movq 152(%rsp), %r12
- subq 96(%rsp), %r9
- movq $0x00, %rax
- sbbq 104(%rsp), %r10
- movq $-19, %rcx
- sbbq 112(%rsp), %r11
- movq $0x7fffffffffffffff, %rbx
- sbbq 120(%rsp), %r12
- sbbq $0x00, %rax
- # Mask the modulus
- andq %rax, %rcx
- andq %rax, %rbx
- # Add modulus (if underflow)
- addq %rcx, %r9
- adcq %rax, %r10
- adcq %rax, %r11
- adcq %rbx, %r12
- movq %r9, 128(%rsp)
- movq %r10, 136(%rsp)
- movq %r11, 144(%rsp)
- movq %r12, 152(%rsp)
- # Square
- # A[0] * A[1]
- movq (%rsp), %rdx
- mulxq 8(%rsp), %r10, %r11
- # A[0] * A[3]
- mulxq 24(%rsp), %r12, %r13
- # A[2] * A[1]
- movq 16(%rsp), %rdx
- mulxq 8(%rsp), %rcx, %rbx
- xorq %rbp, %rbp
- adoxq %rcx, %r12
- # A[2] * A[3]
- mulxq 24(%rsp), %r14, %r15
- adoxq %rbx, %r13
- # A[2] * A[0]
- mulxq (%rsp), %rcx, %rbx
- adoxq %rbp, %r14
- adcxq %rcx, %r11
- adoxq %rbp, %r15
- # A[1] * A[3]
- movq 8(%rsp), %rdx
- mulxq 24(%rsp), %rax, %r9
- adcxq %rbx, %r12
- adcxq %rax, %r13
- adcxq %r9, %r14
- adcxq %rbp, %r15
- # Double with Carry Flag
- xorq %rbp, %rbp
- # A[0] * A[0]
- movq (%rsp), %rdx
- mulxq %rdx, %r9, %rax
- adcxq %r10, %r10
- # A[1] * A[1]
- movq 8(%rsp), %rdx
- mulxq %rdx, %rcx, %rbx
- adcxq %r11, %r11
- adoxq %rax, %r10
- adcxq %r12, %r12
- adoxq %rcx, %r11
- # A[2] * A[2]
- movq 16(%rsp), %rdx
- mulxq %rdx, %rax, %rcx
- adcxq %r13, %r13
- adoxq %rbx, %r12
- adcxq %r14, %r14
- adoxq %rax, %r13
- # A[3] * A[3]
- movq 24(%rsp), %rdx
- mulxq %rdx, %rax, %rbx
- adcxq %r15, %r15
- adoxq %rcx, %r14
- adcxq %rbp, %rbp
- adoxq %rax, %r15
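The "# Reduce" blocks that follow every product and square (one appears just below) fold the 512-bit intermediate t[0..7] back to four limbs using 2^255 = 19 (mod p). A C sketch of that folding, reusing the uint128_t typedef from the previous sketch; fe_fold19_sketch is an illustrative helper, not a wolfSSL function:

#include <stdint.h>

typedef unsigned __int128 uint128_t;

/* r += 19 * o with full carry propagation (the addq/adcq $0x00 chains). */
void fe_fold19_sketch(uint64_t r[4], uint64_t o)
{
    uint128_t acc = (uint128_t)r[0] + (uint128_t)19 * o;
    r[0] = (uint64_t)acc;
    for (int i = 1; i < 4; i++) {
        acc = (acc >> 64) + r[i];
        r[i] = (uint64_t)acc;
    }
}

/* Fold t[0..7] to r[0..3] mod p = 2^255 - 19, mirroring the asm's steps. */
static void fe_reduce_sketch(uint64_t r[4], const uint64_t t[8])
{
    const uint64_t topmask = 0x7fffffffffffffffULL;
    uint64_t hi[4], o;
    uint128_t acc = 0;

    /* hi = floor(t / 2^255): the shldq $0x01 ladder. For reduced inputs the
     * product stays below 2^511, so bit 511 of t is clear and nothing is
     * lost by the one-bit shift. */
    hi[0] = (t[4] << 1) | (t[3] >> 63);
    hi[1] = (t[5] << 1) | (t[4] >> 63);
    hi[2] = (t[6] << 1) | (t[5] >> 63);
    hi[3] = (t[7] << 1) | (t[6] >> 63);

    /* r = (t mod 2^255) + 19 * hi, since 2^255 = 19 (mod p). */
    for (int i = 0; i < 4; i++) {
        acc += (uint128_t)19 * hi[i] + (i == 3 ? (t[3] & topmask) : t[i]);
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    /* "Overflow": fold the carry word and bit 255 back in. */
    o = ((uint64_t)acc << 1) | (r[3] >> 63);
    r[3] &= topmask;
    fe_fold19_sketch(r, o);
    /* "Reduce if top bit set": one more fold; the result is reduced enough
     * for the next operation, and canonical reduction happens at
     * serialisation time. */
    o = r[3] >> 63;
    r[3] &= topmask;
    fe_fold19_sketch(r, o);
}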
- adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) - movq $0x1db42, %rdx - mulxq 128(%rsp), %r9, %rbp - mulxq 136(%rsp), %r10, %r15 - mulxq 144(%rsp), %r11, %r14 - mulxq 152(%rsp), %r12, %r13 - addq %rbp, %r10 - adcq %r15, %r11 - adcq %r14, %r12 - adcq $0x00, %r13 - movq $0x7fffffffffffffff, %rbp - shldq $0x01, %r12, %r13 - andq %rbp, %r12 - imulq $19, %r13, %r13 - addq %r13, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - movq %r9, 32(%rsp) - movq %r10, 40(%rsp) - movq %r11, 48(%rsp) - movq %r12, 56(%rsp) - # Square - # A[0] * A[1] - movq 64(%rsp), %rdx - mulxq 72(%rsp), %r10, %r11 - # A[0] * A[3] - mulxq 88(%rsp), %r12, %r13 - # A[2] * A[1] - movq 80(%rsp), %rdx - mulxq 72(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adoxq %rcx, %r12 - # A[2] * A[3] - mulxq 88(%rsp), %r14, %r15 - adoxq %rbx, %r13 - # A[2] * A[0] - mulxq 64(%rsp), %rcx, %rbx - adoxq %rbp, %r14 - adcxq %rcx, %r11 - adoxq %rbp, %r15 - # A[1] * A[3] - movq 72(%rsp), %rdx - mulxq 88(%rsp), %rax, %r9 - adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp - # A[0] * A[0] - movq 64(%rsp), %rdx - mulxq %rdx, %r9, %rax - adcxq %r10, %r10 - # A[1] * A[1] - movq 72(%rsp), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 - adoxq %rcx, %r11 - # A[2] * A[2] - movq 80(%rsp), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 - adoxq %rbx, %r12 - adcxq %r14, %r14 - adoxq %rax, %r13 - # A[3] * A[3] - movq 88(%rsp), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 - adoxq %rcx, %r14 - adcxq %rbp, %rbp - adoxq %rax, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, 
64(%rsp) - movq %r10, 72(%rsp) - movq %r11, 80(%rsp) - movq %r12, 88(%rsp) - # Add - movq 96(%rsp), %r9 - movq 104(%rsp), %r10 - addq 32(%rsp), %r9 - movq 112(%rsp), %r11 - adcq 40(%rsp), %r10 - movq 120(%rsp), %rax - adcq 48(%rsp), %r11 - movq $-19, %rcx - adcq 56(%rsp), %rax - movq $0x7fffffffffffffff, %rbx - movq %rax, %r12 - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 - movq %r9, 96(%rsp) - movq %r10, 104(%rsp) - movq %r11, 112(%rsp) - movq %r12, 120(%rsp) - # Multiply - # A[0] * B[0] - movq (%rsp), %rdx - mulxq (%r8), %r9, %r10 - # A[2] * B[0] - mulxq 16(%r8), %r11, %r12 - # A[1] * B[0] - mulxq 8(%r8), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rsp), %rdx - mulxq 8(%r8), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 8(%rsp), %rdx - mulxq (%r8), %rcx, %rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 16(%r8), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 16(%rsp), %rdx - mulxq 8(%r8), %rcx, %rbx - adcxq %r15, %r13 - adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq (%r8), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 - # A[1] * B[1] - movq 8(%rsp), %rdx - mulxq 8(%r8), %rdx, %rcx - adcxq %rbx, %r12 - adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rsp), %rdx - adoxq %rcx, %r12 - mulxq 24(%r8), %rcx, %rbx - adcxq %rcx, %r13 - # A[2] * B[2] - movq 16(%rsp), %rdx - mulxq 16(%r8), %rdx, %rcx - adcxq %rbx, %r14 - adoxq %rdx, %r13 - # A[3] * B[3] - movq 24(%rsp), %rdx - adoxq %rcx, %r14 - mulxq 24(%r8), %rcx, %rbx - adoxq %rbp, %r15 - adcxq %rcx, %r15 - # A[0] * B[3] - mulxq (%r8), %rdx, %rcx - adcxq %rbx, %rbp - xorq %rbx, %rbx - adcxq %rdx, %r12 - # A[3] * B[0] - movq (%rsp), %rdx - adcxq %rcx, %r13 - mulxq 24(%r8), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 - # A[2] * B[3] - movq 24(%rsp), %rdx - mulxq 16(%r8), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 16(%rsp), %rdx - adcxq %rcx, %r15 - mulxq 24(%r8), %rcx, %rdx - adcxq %rbx, %rbp - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rbx, %rbx - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %rcx, %r15 - adcxq %rcx, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, 32(%rsp) - movq %r10, 40(%rsp) - movq %r11, 48(%rsp) - movq %r12, 56(%rsp) - # Multiply - # A[0] * B[0] - movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 - # A[2] * B[0] - mulxq 144(%rsp), %r11, %r12 - # A[1] * B[0] - mulxq 136(%rsp), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 120(%rsp), %rdx - mulxq 136(%rsp), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 104(%rsp), %rdx - mulxq 128(%rsp), %rcx, 
%rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 144(%rsp), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 112(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx - adcxq %r15, %r13 - adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 - # A[1] * B[1] - movq 104(%rsp), %rdx - mulxq 136(%rsp), %rdx, %rcx - adcxq %rbx, %r12 - adoxq %rdx, %r11 - # A[3] * B[1] - movq 104(%rsp), %rdx - adoxq %rcx, %r12 - mulxq 152(%rsp), %rcx, %rbx - adcxq %rcx, %r13 - # A[2] * B[2] - movq 112(%rsp), %rdx - mulxq 144(%rsp), %rdx, %rcx - adcxq %rbx, %r14 - adoxq %rdx, %r13 - # A[3] * B[3] - movq 120(%rsp), %rdx - adoxq %rcx, %r14 - mulxq 152(%rsp), %rcx, %rbx - adoxq %rbp, %r15 - adcxq %rcx, %r15 - # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx - adcxq %rbx, %rbp - xorq %rbx, %rbx - adcxq %rdx, %r12 - # A[3] * B[0] - movq 96(%rsp), %rdx - adcxq %rcx, %r13 - mulxq 152(%rsp), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 - # A[2] * B[3] - movq 120(%rsp), %rdx - mulxq 144(%rsp), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 112(%rsp), %rdx - adcxq %rcx, %r15 - mulxq 152(%rsp), %rcx, %rdx - adcxq %rbx, %rbp - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rbx, %rbx - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %rcx, %r15 - adcxq %rcx, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) - decb 168(%rsp) - jge L_curve25519_avx2_bits - movq $63, 168(%rsp) - decb 160(%rsp) - jge L_curve25519_avx2_words - # Invert - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - movq %rsp, %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq 
_fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 128(%rsp), %rsi - movq $19, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 128(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $9, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 128(%rsp), %rdi - leaq 128(%rsp), %rsi - movq $0x63, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 128(%rsp), %rsi - leaq 96(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - movq $49, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 64(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - 
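Stepping back: curve25519_avx2 is a Montgomery ladder over scalar bits 254..0 (the word/bit counters at 160(%rsp) and 168(%rsp)), starting from (x2, z2) = (1, 0) and (x3, z3) = (point, 1) as set up in the prologue, followed by the inversion chain above and one final multiply to recover the affine x-coordinate. A C sketch of the loop skeleton, reusing fe_cswap_sketch from earlier; ladder_step is an illustrative stand-in for the Add/Sub/Multiply/Square block in the loop body:

#include <stdint.h>

typedef uint64_t fe[4];
void fe_cswap_sketch(fe a, fe b, uint64_t swap);            /* sketched earlier */
void ladder_step(fe x2, fe z2, fe x3, fe z3, const fe x1);  /* illustrative */

/* Ladder skeleton: scan the clamped scalar n from bit 254 down to bit 0,
 * swapping the working points only when the bit value changes. */
static void curve25519_ladder_sketch(fe x2, fe z2, fe x3, fe z3,
                                     const uint64_t n[4], const fe x1)
{
    uint64_t prev = 0;
    for (int i = 254; i >= 0; i--) {
        uint64_t bit  = (n[i / 64] >> (i % 64)) & 1;
        uint64_t swap = prev ^ bit;        /* the xorq/negq mask setup */
        fe_cswap_sketch(x2, x3, swap);
        fe_cswap_sketch(z2, z3, swap);
        prev = bit;
        ladder_step(x2, z2, x3, z3, x1);   /* Add/Sub/Multiply/Square block */
    }
    /* A final swap by 'prev' would be needed in general; clamping clears
     * bit 0 of the scalar, so it is a no-op here and the asm omits it. */
}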
movq 176(%rsp), %rdi - # Multiply - # A[0] * B[0] - movq (%rsp), %rdx - mulxq (%rdi), %r9, %r10 - # A[2] * B[0] - mulxq 16(%rdi), %r11, %r12 - # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rbx - xorq %rbp, %rbp - adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rsp), %rdx - mulxq 8(%rdi), %r13, %r14 - adcxq %rbx, %r11 - # A[0] * B[1] - movq 8(%rsp), %rdx - mulxq (%rdi), %rcx, %rbx - adoxq %rcx, %r10 - # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r15 - adoxq %rbx, %r11 - adcxq %rcx, %r12 - # A[1] * B[2] - movq 16(%rsp), %rdx - mulxq 8(%rdi), %rcx, %rbx - adcxq %r15, %r13 - adoxq %rcx, %r12 - adcxq %rbp, %r14 - adoxq %rbx, %r13 - # A[0] * B[2] - mulxq (%rdi), %rcx, %rbx - adoxq %rbp, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 - # A[1] * B[1] - movq 8(%rsp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rbx, %r12 - adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rsp), %rdx - adoxq %rcx, %r12 - mulxq 24(%rdi), %rcx, %rbx - adcxq %rcx, %r13 - # A[2] * B[2] - movq 16(%rsp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rbx, %r14 - adoxq %rdx, %r13 - # A[3] * B[3] - movq 24(%rsp), %rdx - adoxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rbx - adoxq %rbp, %r15 - adcxq %rcx, %r15 - # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rbx, %rbp - xorq %rbx, %rbx - adcxq %rdx, %r12 - # A[3] * B[0] - movq (%rsp), %rdx - adcxq %rcx, %r13 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 - # A[2] * B[3] - movq 24(%rsp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 16(%rsp), %rdx - adcxq %rcx, %r15 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rbx, %rbp - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %rbx, %rbp - # Reduce - movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx - xorq %rbx, %rbx - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %rcx, %r15 - adcxq %rcx, %r11 - adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) - xorq %rax, %rax - addq $0xc0, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size curve25519_avx2,.-curve25519_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_pow22523_avx2 -.type fe_pow22523_avx2,@function -.align 4 -fe_pow22523_avx2: -#else -.section __TEXT,__text -.globl _fe_pow22523_avx2 -.p2align 2 -_fe_pow22523_avx2: -#endif /* __APPLE__ */ - subq $0x70, %rsp - # pow22523 - movq %rdi, 96(%rsp) - movq %rsi, 104(%rsp) - movq %rsp, %rdi - movq 104(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 
32(%rsp), %rdi - movq 104(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movb $4, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movb $9, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movb $19, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movb $9, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movb $49, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 64(%rsp), %rdi - leaq 64(%rsp), %rsi - movb $0x63, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 64(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movb $49, %dl -#ifndef __APPLE__ - callq fe_sq_n_avx2@plt -#else - callq _fe_sq_n_avx2 
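The chain in fe_pow22523_avx2 follows the same z^(2^250 - 1) ladder as the inversion but ends differently: two squarings and one multiply by z give z^(2^252 - 3), hence the function name. This is the exponent used in Ed25519 point decompression; with p = 2^255 - 19, the standard identities (not taken from this file) are:

    (p - 5)/8 = (2^255 - 24)/8 = 2^252 - 3

    x = u * v^3 * (u * v^7)^((p-5)/8),  with  v * x^2 = +/- u

When v * x^2 = -u, the candidate root is multiplied by sqrt(-1) = 2^((p-1)/4) to fix the sign.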
-#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_avx2@plt -#else - callq _fe_sq_avx2 -#endif /* __APPLE__ */ - movq 96(%rsp), %rdi - movq %rsp, %rsi - movq 104(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_avx2@plt -#else - callq _fe_mul_avx2 -#endif /* __APPLE__ */ - movq 104(%rsp), %rsi - movq 96(%rsp), %rdi - addq $0x70, %rsp - repz retq -#ifndef __APPLE__ -.text -.globl fe_ge_to_p2_avx2 -.type fe_ge_to_p2_avx2,@function -.align 4 -fe_ge_to_p2_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_to_p2_avx2 -.p2align 2 -_fe_ge_to_p2_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 16(%rsp), %rsi - movq 88(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq 
%rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 88(%rsp), %rsi - # Multiply - # A[0] * B[0] - movq (%rsi), %rdx - mulxq (%rbx), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rsi), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rsi), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rsi), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, 
%r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rsi), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rsi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rsi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rsi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rsi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_to_p3_avx2 -.type fe_ge_to_p3_avx2,@function -.align 4 -fe_ge_to_p3_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_to_p3_avx2 -.p2align 2 -_fe_ge_to_p3_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 24(%rsp), %rsi - movq 96(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 
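The fe_ge_to_p2_avx2 routine completed above is the standard conversion out of the point-addition intermediate representation: three field multiplications computing (X, Y, Z) = (X·T, Y·Z, Z·T), the same shape as ref10's ge_p1p1_to_p2 (fe_ge_to_p3_avx2, which begins here, adds a fourth multiply for T = X·Y). Each multiply ends in the `# Reduce` sequence that recurs throughout this file, built on the identity 2^255 ≡ 19 (mod p) for p = 2^255 - 19, applied twice and then once more for a possible leftover top bit. A minimal C sketch of that reduction follows, assuming the file's representation of a field element as four little-endian 64-bit limbs; the function name and the unsigned __int128 accumulator are illustrative only, not wolfSSL's API.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: fold a 512-bit product t[0..7] back below 2^255,
     * giving r = t mod (2^255 - 19) up to one final canonical
     * subtraction (deferred until the element is serialized).   */
    static void fe_reduce_sketch(uint64_t r[4], const uint64_t t[8])
    {
        const uint64_t mask = 0x7fffffffffffffffULL;
        uint64_t h[5], d;
        u128 a;

        /* "Move top half into t4-t7 and remove top bit from t3":
         * h = floor(t / 2^255), a 257-bit value (the shldq ladder). */
        h[0] = (t[4] << 1) | (t[3] >> 63);
        h[1] = (t[5] << 1) | (t[4] >> 63);
        h[2] = (t[6] << 1) | (t[5] >> 63);
        h[3] = (t[7] << 1) | (t[6] >> 63);
        h[4] =  t[7] >> 63;

        /* "Multiply top half by 19": r = (t mod 2^255) + 19*h. */
        a = (u128)t[0] + (u128)19 * h[0];
        r[0] = (uint64_t)a;
        a = (u128)t[1] + (uint64_t)(a >> 64) + (u128)19 * h[1];
        r[1] = (uint64_t)a;
        a = (u128)t[2] + (uint64_t)(a >> 64) + (u128)19 * h[2];
        r[2] = (uint64_t)a;
        a = (u128)(t[3] & mask) + (uint64_t)(a >> 64) + (u128)19 * h[3];
        r[3] = (uint64_t)a;
        d = (uint64_t)(a >> 64) + 19 * h[4];   /* overflow into a fifth limb */

        /* "Overflow": fold bits 255 and up once more, again via 19. */
        d = (d << 1) | (r[3] >> 63);
        r[3] &= mask;
        a = (u128)r[0] + (u128)19 * d;  r[0] = (uint64_t)a;
        a = (u128)r[1] + (a >> 64);     r[1] = (uint64_t)a;
        a = (u128)r[2] + (a >> 64);     r[2] = (uint64_t)a;
        r[3] += (uint64_t)(a >> 64);

        /* "Reduce if top bit set": one last fold of bit 255. */
        d = r[3] >> 63;
        r[3] &= mask;
        a = (u128)r[0] + 19 * d;        r[0] = (uint64_t)a;
        a = (u128)r[1] + (a >> 64);     r[1] = (uint64_t)a;
        a = (u128)r[2] + (a >> 64);     r[2] = (uint64_t)a;
        r[3] += (uint64_t)(a >> 64);
    }

After the last fold the value fits in four limbs below 2^255; it may still equal or exceed p, so the final subtraction of p happens only when the element leaves this representation.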
- adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq 
%rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 96(%rsp), %rsi - # Multiply - # A[0] * B[0] - movq (%rsi), %rdx - mulxq (%rbx), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rsi), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rsi), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rsi), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rsi), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rsi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rsi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rsi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rsi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 
16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_dbl_avx2 -.type fe_ge_dbl_avx2,@function -.align 4 -fe_ge_dbl_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_dbl_avx2 -.p2align 2 -_fe_ge_dbl_avx2: -#endif /* __APPLE__ */ - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $48, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 32(%rsp), %rsi - # Square - # A[0] * A[1] - movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rax, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 
8(%rsi), %rdx - mulxq 24(%rsi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rbp, %rcx - adcxq %r12, %r12 - adoxq %rax, %r11 - adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rbp, %rax - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 40(%rsp), %rbx - # Square - # A[0] * A[1] - movq (%rbx), %rdx - mulxq 8(%rbx), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rbx), %r11, %r12 - # A[2] * A[1] - movq 16(%rbx), %rdx - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rbx), %r13, %r14 - adoxq %rax, %r12 - # A[2] * A[0] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rbx), %rdx - mulxq 24(%rbx), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rbx), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rbx), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rbx), %rdx - mulxq %rdx, %rbp, %rcx - adcxq %r12, %r12 - adoxq %rax, %r11 - adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rbx), %rdx - mulxq %rdx, %rbp, %rax - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # 
Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rdx - adcq 16(%rbx), %r10 - movq $-19, %rcx - adcq 24(%rbx), %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rsi - # Square - # A[0] * A[1] - movq (%rdi), %rdx - mulxq 8(%rdi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rdi), %r11, %r12 - # A[2] * A[1] - movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rdi), %r13, %r14 - adoxq %rax, %r12 - # A[2] * A[0] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rdi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rdi), %rdx - mulxq %rdx, %rbp, %rcx - adcxq %r12, %r12 - adoxq %rax, %r11 - adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rdi), %rdx - mulxq %rdx, %rbp, %rax - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 - movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq 
$0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 24(%rsp), %rsi - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rdi), %r8 - movq $0x00, %rdx - sbbq 8(%rdi), %r9 - movq $-19, %rcx - sbbq 16(%rdi), %r10 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r11 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r8 - adcq %rdx, %r9 - adcq %rdx, %r10 - adcq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 104(%rsp), %rdi - # Square * 2 - # A[0] * A[1] - movq (%rdi), %rdx - mulxq 8(%rdi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rdi), %r11, %r12 - # A[2] * A[1] - movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rdi), %r13, %r14 - adoxq %rax, %r12 - # A[2] * A[0] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rdi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rdi), %rdx - mulxq %rdx, %rbp, %rcx - adcxq %r12, %r12 - adoxq %rax, %r11 - adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rdi), %rdx - mulxq %rdx, %rbp, %rax - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - xorq %rbp, %rbp - # Move top half into t4-t7 and remove top bit from t3 and double - shldq $3, %r15, %rbp - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rax, %r11 - # Two out left, one in right - andq %rax, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rbp, %rcx - xorq %rax, %rax - # Multiply top half by 19 - movq $19, %rdx - adoxq %rcx, %r8 - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rbp - andq %rax, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rbp - andq %rax, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 16(%rsp), %rdi - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rdi), %r8 - movq $0x00, %rdx - sbbq 8(%rdi), %r9 - movq $-19, %rcx - sbbq 16(%rdi), %r10 - movq $0x7fffffffffffffff, %rax - sbbq 
24(%rdi), %r11 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r8 - adcq %rdx, %r9 - adcq %rdx, %r10 - adcq %rax, %r11 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - addq $48, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - repz retq -#ifndef __APPLE__ -.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_madd_avx2 -.type fe_ge_madd_avx2,@function -.align 4 -fe_ge_madd_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_madd_avx2 -.p2align 2 -_fe_ge_madd_avx2: -#endif /* __APPLE__ */ - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $48, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 128(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq 
%rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 136(%rsp), %rdi - # Multiply - # A[0] * B[0] - movq (%rdi), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rdi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rdi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rdi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rdi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 
- adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rdi - movq 120(%rsp), %rsi - movq 112(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq (%rsp), %rsi - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 - movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - 
andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 104(%rsp), %rdi - # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, %r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 24(%rsp), %rdi - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 - movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - addq $48, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - repz retq -#ifndef __APPLE__ -.size fe_ge_madd_avx2,.-fe_ge_madd_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_msub_avx2 -.type fe_ge_msub_avx2,@function -.align 4 -fe_ge_msub_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_msub_avx2 -.p2align 2 -_fe_ge_msub_avx2: -#endif /* __APPLE__ */ - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $48, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 136(%rsp), %rbp - 
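fe_ge_madd_avx2 above, and fe_ge_msub_avx2 just begun, are the mixed addition and subtraction with a precomputed point, on the familiar ref10 ge_madd/ge_msub schedule: one combined pass forms Y1+X1 and Y1-X1, those are multiplied by the precomputed y+x and y-x values (madd and msub simply swap which one each product uses), T1 is multiplied by the precomputed 2dT term, Z1 is doubled, and a final combined add/sub pass assembles the result. Throughout, the `# Add` and `# Sub` blocks stay constant-time by converting the top limb's carry or borrow into an all-ones/all-zero mask (the sarq $63 and sbbq $0x00 lines) and applying the modulus under that mask rather than branching. Below is a short C sketch of the addition half, assuming both inputs are already below 2^255 as the assembly does; all names in it are illustrative.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: r = a + b (mod 2^255 - 19), constant time.
     * Mirrors "# Add" then "Mask the modulus" / "Sub modulus
     * (if overflow)"; assumes a, b < 2^255.                  */
    static void fe_add_sketch(uint64_t r[4], const uint64_t a[4],
                              const uint64_t b[4])
    {
        uint64_t s[4], m, bw = 0;
        u128 c = 0;

        /* Plain 256-bit addition (the addq/adcq chain). */
        for (int i = 0; i < 4; i++) {
            c = (u128)a[i] + b[i] + (uint64_t)(c >> 64);
            s[i] = (uint64_t)c;
        }

        /* m = all-ones iff bit 255 of the sum is set (sarq $63). */
        m = (uint64_t)0 - (s[3] >> 63);

        /* Conditionally subtract p = 2^255 - 19, whose limbs are
         * (-19, -1, -1, 2^63 - 1), each ANDed with the mask.     */
        const uint64_t p[4] = { 0xffffffffffffffedULL & m, m, m,
                                0x7fffffffffffffffULL & m };
        for (int i = 0; i < 4; i++) {
            u128 d = (u128)s[i] - p[i] - bw;
            r[i] = (uint64_t)d;
            bw = (uint64_t)(d >> 64) & 1;        /* borrow out */
        }
    }

The subtraction half is the mirror image: compute a - b with a borrow chain, let the trailing sbbq $0x00 capture the final borrow as the mask, and add p back under that mask ("Add modulus (if underflow)").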
# Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 128(%rsp), %rdi - # Multiply - # A[0] * B[0] - movq (%rdi), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rdi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rdi), 
%rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rdi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rdi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rdi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rdi - movq 120(%rsp), %rsi - movq 112(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, 
%r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rsi - movq (%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rsi), %r8 - movq %r9, %r13 - adcq 8(%rsi), %r9 - movq %r10, %r14 - adcq 16(%rsi), %r10 - movq %rdx, %r15 - adcq 24(%rsi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rsi), %r12 - movq $0x00, %rdx - sbbq 8(%rsi), %r13 - movq $-19, %rcx - sbbq 16(%rsi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rsi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq %r12, (%rbp) - movq %r13, 8(%rbp) - movq %r14, 16(%rbp) - movq %r15, 24(%rbp) - movq 104(%rsp), %rsi - # Double - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq %r8, %r8 - movq 16(%rsi), %r10 - adcq %r9, %r9 - movq 24(%rsi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 - movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rbx) - movq %r13, 8(%rbx) - movq %r14, 16(%rbx) - movq %r15, 24(%rbx) - addq $48, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - repz retq -#ifndef __APPLE__ -.size fe_ge_msub_avx2,.-fe_ge_msub_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text 
-.globl fe_ge_add_avx2 -.type fe_ge_add_avx2,@function -.align 4 -fe_ge_add_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_add_avx2 -.p2align 2 -_fe_ge_add_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 168(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - 
adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 176(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rsi - movq 160(%rsp), %rbx - movq 144(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rbx), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 - 
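[Editor's note] Each "# Multiply" block above is a plain 4x4 schoolbook product yielding eight limbs before reduction. The instruction choice is what makes it fast: mulxq writes no flags, adcxq reads and writes only CF, and adoxq reads and writes only OF, so two independent carry chains run through the partial products at once. A column-wise C sketch of the same 256x256 -> 512-bit multiply, reusing the limb conventions of the earlier sketch:

static void mul256(uint64_t t[8], const uint64_t a[4], const uint64_t b[4])
{
    unsigned __int128 acc = 0, carry = 0;
    int i, k;

    for (k = 0; k < 7; k++) {                /* result column t[k]      */
        for (i = 0; i < 4; i++) {
            int j = k - i;
            if (j < 0 || j > 3)
                continue;                    /* A[i] * B[j], i + j == k */
            unsigned __int128 m = (unsigned __int128)a[i] * b[j];
            acc   += (uint64_t)m;            /* low half: this column   */
            carry += (uint64_t)(m >> 64);    /* high half: next column  */
        }
        t[k]  = (uint64_t)acc;
        acc   = (acc >> 64) + carry;
        carry = 0;
    }
    t[7] = (uint64_t)acc;
}

The assembly orders the same partial products around port pressure rather than by column, but the accumulated sum is identical.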
# A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 136(%rsp), %rsi - movq 152(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - 
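[Editor's note] The "# Reduce" blocks shrink the eight-limb product back below 2^255 using the identity 2^255 = 19 (mod p): the shldq chain extracts the bits at and above position 255, one mulxq pass (with %rdx = 19) multiplies them by 19 and folds them into the low half, and the "# Overflow" and "# Reduce if top bit set" steps repeat the fold for the few bits that spill past 2^255 again. The same three-stage fold in C, continuing the sketches above:

static void reduce512(uint64_t r[4], const uint64_t t[8])
{
    const uint64_t MASK63 = 0x7fffffffffffffffULL;
    uint64_t h[4], e;
    unsigned __int128 acc = 0;
    int i;

    /* "Move top half": h = bits 255..510 of t (the shldq chain). */
    for (i = 0; i < 4; i++)
        h[i] = (t[i + 3] >> 63) | (t[i + 4] << 1);

    /* "Multiply top half by 19" and add it to the low 255 bits. */
    for (i = 0; i < 4; i++) {
        acc += (unsigned __int128)19 * h[i];
        acc += (i < 3) ? t[i] : (t[3] & MASK63);
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    /* Bits 256 and up: the loop's carry plus 19 * bit 511 of t. */
    e = (uint64_t)acc + 19 * (t[7] >> 63);

    /* "# Overflow": fold everything at or above bit 255 once more. */
    e = (e << 1) | (r[3] >> 63);
    r[3] &= MASK63;
    acc = (unsigned __int128)19 * e;
    for (i = 0; i < 4; i++) {
        acc += r[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    /* "# Reduce if top bit set": one last fold of bit 255. */
    e = r[3] >> 63;
    r[3] &= MASK63;
    acc = (unsigned __int128)19 * e;
    for (i = 0; i < 4; i++) {
        acc += r[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
}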
xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rsi - # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, %r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 8(%rsp), %rbx - movq 16(%rsp), %rbp - # Add - movq (%rbp), %r8 - movq 8(%rbp), %r9 - movq 16(%rbp), %r10 - movq 24(%rbp), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 - movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - movq 24(%rsp), %rdi - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 - movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, 
%rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbp) - movq %r9, 8(%rbp) - movq %r10, 16(%rbp) - movq %r11, 24(%rbp) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_add_avx2,.-fe_ge_add_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_sub_avx2 -.type fe_ge_sub_avx2,@function -.align 4 -fe_ge_sub_avx2: -#else -.section __TEXT,__text -.globl _fe_ge_sub_avx2 -.p2align 2 -_fe_ge_sub_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $0x50, %rsp - movq %rdi, (%rsp) - movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 176(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 
16(%rdi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 168(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - 
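[Editor's note] Apart from bookkeeping, fe_ge_sub_avx2 repeats fe_ge_add_avx2 with two visible differences: the first two multiplies consume 176(%rsp) before 168(%rsp) (the cached point's y-x and y+x, under the assumed ref10 naming), and the final combine swaps which of t0 +/- T becomes Z and T, since subtracting a point negates its x and t coordinates. Both then match the familiar ref10 addition of a point to a cached point. A hedged C reconstruction built from the helper sketches above:

static void fe64_mul(fe64 r, const fe64 a, const fe64 b)
{
    uint64_t t[8];
    mul256(t, a, b);
    reduce512(r, t);
}

/* sub == 0: fe_ge_add shape; sub != 0: fe_ge_sub shape. */
static void ge_add_sub(fe64 rx, fe64 ry, fe64 rz, fe64 rt,
                       const fe64 px, const fe64 py, const fe64 pz,
                       const fe64 pt, const fe64 qz, const fe64 qt2d,
                       const fe64 qyplusx, const fe64 qyminusx, int sub)
{
    fe64 t0;

    fe64_add_reduce(rx, py, px);                   /* Y1 + X1           */
    fe64_sub_reduce(ry, py, px);                   /* Y1 - X1           */
    fe64_mul(rz, rx, sub ? qyminusx : qyplusx);    /* operands swapped  */
    fe64_mul(ry, ry, sub ? qyplusx : qyminusx);    /*   in the sub case */
    fe64_mul(rt, qt2d, pt);                        /* 2d * T1 * T2      */
    fe64_mul(rx, pz, qz);                          /* Z1 * Z2           */
    fe64_add_reduce(t0, rx, rx);                   /* the "# Double"    */
    fe64_sub_reduce(rx, rz, ry);
    fe64_add_reduce(ry, rz, ry);
    if (!sub) {
        fe64_add_reduce(rz, t0, rt);
        fe64_sub_reduce(rt, t0, rt);
    } else {
        fe64_sub_reduce(rz, t0, rt);
        fe64_add_reduce(rt, t0, rt);
    }
}

fe_ge_msub_avx2, whose tail opens this hunk, appears to be the analogous mixed subtraction against a precomputed (y+x, y-x, 2dxy) triple, which needs no Z1*Z2 multiply.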
adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rsi - movq 160(%rsp), %rbx - movq 144(%rsp), %rbp - # Multiply - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rbx), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 136(%rsp), %rsi - movq 152(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, 
%r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - shrq $63, %rdx - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rsi - # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, %r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 8(%rsp), %rbx - movq 16(%rsp), %rbp - # Add - movq (%rbp), %r8 - movq 8(%rbp), %r9 - movq 16(%rbp), %r10 - movq 24(%rbp), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 - movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 
8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - movq 24(%rsp), %rdi - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 - movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rbp) - movq %r13, 8(%rbp) - movq %r14, 16(%rbp) - movq %r15, 24(%rbp) - addq $0x50, %rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_ge_sub_avx2,.-fe_ge_sub_avx2 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */
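[Editor's note] Taken together, the sketches above form a tiny, unoptimized model of the deleted field arithmetic. One way to sanity-check such a reconstruction is to verify an algebraic identity such as (a+b)*c == a*c + b*c (mod p) after reducing both sides to canonical form; this harness does that with the helpers defined earlier (GCC/Clang only, because of unsigned __int128; it is a test of the sketches, not of wolfSSL):

#include <stdio.h>
#include <string.h>

/* Full reduction to [0, p): one conditional subtract suffices, since
 * the helpers above keep every value below 2^255 < 2p. */
static void fe64_canon(fe64 r)
{
    uint64_t t[4], sel, brw = 0;
    unsigned __int128 d;
    int i;

    for (i = 0; i < 4; i++) {           /* t = r - p                    */
        d = (unsigned __int128)r[i] - P[i] - brw;
        t[i] = (uint64_t)d;
        brw = (uint64_t)(d >> 64) & 1;
    }
    sel = brw - 1;                      /* all-ones when r >= p         */
    for (i = 0; i < 4; i++)             /* constant-time select         */
        r[i] = (t[i] & sel) | (r[i] & ~sel);
}

int main(void)
{
    fe64 a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, c = {9, 10, 11, 12};
    fe64 s, lhs, t1, t2, rhs;

    fe64_add_reduce(s, a, b);
    fe64_mul(lhs, s, c);                /* (a + b) * c                  */
    fe64_mul(t1, a, c);
    fe64_mul(t2, b, c);
    fe64_add_reduce(rhs, t1, t2);       /* a*c + b*c                    */
    fe64_canon(lhs);
    fe64_canon(rhs);
    printf("%s\n", memcmp(lhs, rhs, sizeof(fe64)) ? "FAIL" : "PASS");
    return 0;
}

Expected output is PASS; any discrepancy would point at a bug in one of the sketches rather than in the deleted assembly.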