| field | value | date |
|---|---|---|
| author | auth12 <[email protected]> | 2020-07-19 11:57:04 -0700 |
| committer | GitHub <[email protected]> | 2020-07-19 11:57:04 -0700 |
| commit | 1bae439a35a3aadca6772716aaeea8c8a0991114 (patch) | |
| tree | f8eab7a7bae237ad697feecfae26b17bab91b16e /client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c | |
| parent | More placeholders and general plan. (diff) | |
| parent | Merge branch 'master' into windows (diff) | |
| download | loader-1bae439a35a3aadca6772716aaeea8c8a0991114.tar.xz, loader-1bae439a35a3aadca6772716aaeea8c8a0991114.zip | |
Merge pull request #1 from auth12/windows
Windows
Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c')
| mode | path | insertions |
|---|---|---|
| -rw-r--r-- | client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c | 5581 |

1 file changed, 5581 insertions(+), 0 deletions(-)
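The new file implements the Curve25519 field arithmetic (arithmetic mod p = 2^255 - 19) as inline 32-bit ARM assembly. As a quick orientation before the raw diff, here is a small illustrative sketch (editorial, not part of the commit) of how the `fe_*` primitives added below are typically composed. It assumes a 32-bit ARM build with `WOLFSSL_ARMASM` defined so that these implementations are the ones linked in, and it uses only types and signatures that appear in the diff itself; the helper name `fe_square_bytes` is hypothetical.

```c
#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/fe_operations.h>

/* Editorial sketch: square a serialized field element using the
 * primitives added by this file (fe, fe_frombytes, fe_sq, fe_tobytes). */
void fe_square_bytes(unsigned char out[32], const unsigned char in[32])
{
    fe t;                 /* one field element (32 bytes for these routines) */
    fe_frombytes(t, in);  /* load little-endian bytes, mask off bit 255 */
    fe_sq(t, t);          /* t = t^2 mod 2^255 - 19 */
    fe_tobytes(out, t);   /* fully reduce into [0, p) and serialize */
}
```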
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
new file mode 100644
index 0000000..f7ef379
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
@@ -0,0 +1,5581 @@
+/* armv8-32-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
+ */
+
+#ifndef __aarch64__
+
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/fe_operations.h>
+#include <stdint.h>
+
+void fe_init()
+{
+    __asm__ __volatile__ (
+        "\n\t"
+        :
+        :
+        : "memory"
+    );
+}
+
+void fe_frombytes(fe out, const unsigned char* in)
+{
+    __asm__ __volatile__ (
+        "ldrd r2, r3, [%[in]]\n\t"
+        "ldrd r12, lr, [%[in], #8]\n\t"
+        "ldrd r4, r5, [%[in], #16]\n\t"
+        "ldrd r6, r7, [%[in], #24]\n\t"
+        "and r7, r7, #0x7fffffff\n\t"
+        "strd r2, r3, [%[out]]\n\t"
+        "strd r12, lr, [%[out], #8]\n\t"
+        "strd r4, r5, [%[out], #16]\n\t"
+        "strd r6, r7, [%[out], #24]\n\t"
+        : [out] "+r" (out), [in] "+r" (in)
+        :
+        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7"
+    );
+}
+
+void fe_tobytes(unsigned char* out, const fe n)
+{
+    __asm__ __volatile__ (
+        "ldrd r2, r3, [%[n]]\n\t"
+        "ldrd r12, lr, [%[n], #8]\n\t"
+        "ldrd r4, r5, [%[n], #16]\n\t"
+        "ldrd r6, r7, [%[n], #24]\n\t"
+        "adds r8, r2, #19\n\t"
+        "adcs r8, r3, #0\n\t"
+        "adcs r8, r12, #0\n\t"
+        "adcs r8, lr, #0\n\t"
+        "adcs r8, r4, #0\n\t"
+        "adcs r8, r5, #0\n\t"
+        "adcs r8, r6, #0\n\t"
+        "adc r8, r7, #0\n\t"
+        "asr r8, r8, #31\n\t"
+        "and r8, r8, #19\n\t"
+        "adds r2, r2, r8\n\t"
+        "adcs r3, r3, #0\n\t"
+        "adcs r12, r12, #0\n\t"
+        "adcs lr, lr, #0\n\t"
+        "adcs r4, r4, #0\n\t"
+        "adcs r5, r5, #0\n\t"
+        "adcs r6, r6, #0\n\t"
+        "adc r7, r7, #0\n\t"
+        "and r7, r7, #0x7fffffff\n\t"
+        "strd r2, r3, [%[out]]\n\t"
+        "strd r12, lr, [%[out], #8]\n\t"
+        "strd r4, r5, [%[out], #16]\n\t"
+        "strd r6, r7, [%[out], #24]\n\t"
+        : [out] "+r" (out), [n] "+r" (n)
+        :
+        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+    );
+}
+
+void fe_1(fe n)
+{
+    __asm__ __volatile__ (
+        /* Set one */
+        "mov r2, #1\n\t"
+        "mov r1, #0\n\t"
+        "strd r2, r1, [%[n]]\n\t"
+        "strd r1, r1, [%[n], #8]\n\t"
+        "strd r1, r1, [%[n], #16]\n\t"
+        "strd r1, r1, [%[n], #24]\n\t"
+        : [n] "+r" (n)
+        :
+        : "memory", "r1", "r2"
+    );
+}
+
+void fe_0(fe n)
+{
+    __asm__ __volatile__ (
+        /* Set zero */
+        "mov r1, #0\n\t"
+        "strd r1, r1, [%[n]]\n\t"
+        "strd r1, r1, [%[n], #8]\n\t"
+        "strd r1, r1, [%[n], #16]\n\t"
+        "strd r1, r1, [%[n], #24]\n\t"
+        : [n] "+r" (n)
+        :
+        : "memory", "r1"
+    );
+}
+
+void fe_copy(fe r, const fe a)
+{
+    __asm__ __volatile__ (
+        /* Copy */
+        "ldrd r2, r3, [%[a]]\n\t"
+        "ldrd r12, lr, [%[a], #8]\n\t"
+        "strd r2, r3, [%[r]]\n\t"
+        "strd r12, lr, [%[r], #8]\n\t"
+        "ldrd r2, r3, [%[a], #16]\n\t"
+        "ldrd r12, lr, [%[a], #24]\n\t"
+        "strd r2, r3, [%[r], #16]\n\t"
+        "strd r12, lr, [%[r], #24]\n\t"
+        : [r] "+r" (r), [a] "+r" (a)
+        :
+        : "memory", "r2", "r3", "r12", "lr"
+    );
+}
+
+void fe_sub(fe r, const fe a, const fe b)
+{
+    __asm__ __volatile__ (
+        /* Sub */
+        "ldrd r12, lr, [%[a]]\n\t"
+        "ldrd r4, r5, [%[a], #8]\n\t"
+        "ldrd r6, r7, [%[b]]\n\t"
+        "ldrd r8, r9, [%[b], #8]\n\t"
+        "subs r6, r12, r6\n\t"
+        "sbcs r7, lr, r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "strd r6, r7, [%[r]]\n\t"
+        "strd r8, r9, [%[r], #8]\n\t"
+        "ldrd r12, lr, [%[a], #16]\n\t"
+        "ldrd r4, r5, [%[a], #24]\n\t"
+        "ldrd r6, r7, [%[b], #16]\n\t"
+        "ldrd r8, r9, [%[b], #24]\n\t"
+        "sbcs r6, r12, r6\n\t"
+        "sbcs r7, lr, r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbc r9, r5, r9\n\t"
+        "mov r10, #-19\n\t"
+        "asr r3, r9, #31\n\t"
+        /* Mask the modulus */
+        "and r10, r3, r10\n\t"
+        "and r11, r3, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd r12, lr, [%[r]]\n\t"
+        "ldrd r4, r5, [%[r], #8]\n\t"
+        "adds r12, r12, r10\n\t"
+        "adcs lr, lr, r3\n\t"
+        "adcs r4, r4, r3\n\t"
+        "adcs r5, r5, r3\n\t"
+        "adcs r6, r6, r3\n\t"
+        "adcs r7, r7, r3\n\t"
+        "adcs r8, r8, r3\n\t"
+        "adc r9, r9, r11\n\t"
+        "strd r12, lr, [%[r]]\n\t"
+        "strd r4, r5, [%[r], #8]\n\t"
+        "strd r6, r7, [%[r], #16]\n\t"
+        "strd r8, r9, [%[r], #24]\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+}
+
+void fe_add(fe r, const fe a, const fe b)
+{
+    __asm__ __volatile__ (
+        /* Add */
+        "ldrd r12, lr, [%[a]]\n\t"
+        "ldrd r4, r5, [%[a], #8]\n\t"
+        "ldrd r6, r7, [%[b]]\n\t"
+        "ldrd r8, r9, [%[b], #8]\n\t"
+        "adds r6, r12, r6\n\t"
+        "adcs r7, lr, r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "strd r6, r7, [%[r]]\n\t"
+        "strd r8, r9, [%[r], #8]\n\t"
+        "ldrd r12, lr, [%[a], #16]\n\t"
+        "ldrd r4, r5, [%[a], #24]\n\t"
+        "ldrd r6, r7, [%[b], #16]\n\t"
+        "ldrd r8, r9, [%[b], #24]\n\t"
+        "adcs r6, r12, r6\n\t"
+        "adcs r7, lr, r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adc r9, r5, r9\n\t"
+        "mov r10, #-19\n\t"
+        "asr r3, r9, #31\n\t"
+        /* Mask the modulus */
+        "and r10, r3, r10\n\t"
+        "and r11, r3, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd r12, lr, [%[r]]\n\t"
+        "ldrd r4, r5, [%[r], #8]\n\t"
+        "subs r12, r12, r10\n\t"
+        "sbcs lr, lr, r3\n\t"
+        "sbcs r4, r4, r3\n\t"
+        "sbcs r5, r5, r3\n\t"
+        "sbcs r6, r6, r3\n\t"
+        "sbcs r7, r7, r3\n\t"
+        "sbcs r8, r8, r3\n\t"
+        "sbc r9, r9, r11\n\t"
+        "strd r12, lr, [%[r]]\n\t"
+        "strd r4, r5, [%[r], #8]\n\t"
+        "strd r6, r7, [%[r], #16]\n\t"
+        "strd r8, r9, [%[r], #24]\n\t"
+        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+        :
+        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+}
+
+void fe_neg(fe r, const fe a)
+{
+    __asm__ __volatile__ (
+        "mov r5, #-1\n\t"
+        "mov r4, #-19\n\t"
+        "ldrd r2, r3, [%[a]]\n\t"
+        "ldrd r12, lr, [%[a], #8]\n\t"
+        "subs r2, r4, r2\n\t"
+        "sbcs r3, r5, r3\n\t"
+        "sbcs r12, r5, r12\n\t"
+        "sbcs lr, r5, lr\n\t"
+        "strd r2, r3, [%[r]]\n\t"
+        "strd r12, lr, [%[r], #8]\n\t"
+        "mov r4, #0x7fffffff\n\t"
+        "ldrd r2, r3, [%[a], #16]\n\t"
+        "ldrd r12, lr, [%[a], #24]\n\t"
+        "sbcs r2, r5, r2\n\t"
+        "sbcs r3, r5, r3\n\t"
+        "sbcs r12, r5, r12\n\t"
+        "sbc lr, r4, lr\n\t"
+        "strd r2, r3, [%[r], #16]\n\t"
+        "strd r12, lr, [%[r], #24]\n\t"
+        : [r] "+r" (r), [a] "+r" (a)
+        :
+        : "memory", "r2", "r3", "r12", "lr", "r4", "r5"
+    );
+}
+
+int fe_isnonzero(const fe a)
+{
+    __asm__ __volatile__ (
+        "ldrd r2, r3, [%[a]]\n\t"
+        "ldrd r12, lr, [%[a], #8]\n\t"
+        "ldrd r4, r5, [%[a], #16]\n\t"
+        "ldrd r6, r7, [%[a], #24]\n\t"
+        "adds r1, r2, #19\n\t"
+        "adcs r1, r3, #0\n\t"
+        "adcs r1, r12, #0\n\t"
+        "adcs r1, lr, #0\n\t"
+        "adcs r1, r4, #0\n\t"
+        "adcs r1, r5, #0\n\t"
+        "adcs r1, r6, #0\n\t"
+        "adc r1, r7, #0\n\t"
+        "asr r1, r1, #31\n\t"
+        "and r1, r1, #19\n\t"
+        "adds r2, r2, r1\n\t"
+        "adcs r3, r3, #0\n\t"
+        "adcs r12, r12, #0\n\t"
+        "adcs lr, lr, #0\n\t"
+        "adcs r4, r4, #0\n\t"
+        "adcs r5, r5, #0\n\t"
+        "adcs r6, r6, #0\n\t"
+        "adc r7, r7, #0\n\t"
+        "and r7, r7, #0x7fffffff\n\t"
+        "orr r2, r2, r3\n\t"
+        "orr r12, r12, lr\n\t"
+        "orr r4, r4, r5\n\t"
+        "orr r6, r6, r7\n\t"
+        "orr r12, r12, r4\n\t"
+        "orr r2, r2, r6\n\t"
+        "orr %[a], r2, r12\n\t"
+        : [a] "+r" (a)
+        :
+        : "memory", "r1", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+    );
+    return (uint32_t)(size_t)a;
+}
+
+int fe_isnegative(const fe a)
+{
+    __asm__ __volatile__ (
+        "ldrd r2, r3, [%[a]]\n\t"
+        "ldrd r12, lr, [%[a], #8]\n\t"
+        "adds r1, r2, #19\n\t"
+        "adcs r1, r3, #0\n\t"
+        "adcs r1, r12, #0\n\t"
+        "adcs r1, lr, #0\n\t"
+        "ldrd r2, r3, [%[a], #16]\n\t"
+        "ldrd r12, lr, [%[a], #24]\n\t"
+        "adcs r1, r2, #0\n\t"
+        "adcs r1, r3, #0\n\t"
+        "adcs r1, r12, #0\n\t"
+        "ldr r2, [%[a]]\n\t"
+        "adc r1, lr, #0\n\t"
+        "and %[a], r2, #1\n\t"
+        "lsr r1, r1, #31\n\t"
+        "eor %[a], %[a], r1\n\t"
+        : [a] "+r" (a)
+        :
+        : "memory", "r1", "r2", "r3", "r12", "lr"
+    );
+    return (uint32_t)(size_t)a;
+}
+
+void fe_cmov_table(fe* r, fe* base, signed char b)
+{
+    __asm__ __volatile__ (
+        "sxtb %[b], %[b]\n\t"
+        "sbfx r7, %[b], #7, #1\n\t"
+        "eor r10, %[b], r7\n\t"
+        "sub r10, r10, r7\n\t"
+        "mov r3, #1\n\t"
+        "mov r12, #0\n\t"
+        "mov lr, #1\n\t"
+        "mov r4, #0\n\t"
+        "mov r5, #0\n\t"
+        "mov r6, #0\n\t"
+        "mov r7, #0x80000000\n\t"
+        "ror r7, r7, #31\n\t"
+        "ror r7, r7, r10\n\t"
+        "asr r7, r7, #31\n\t"
+        "ldrd r8, r9, [%[base]]\n\t"
+        "eor r8, r8, r3\n\t"
+        "eor r9, r9, r12\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor r3, r3, r8\n\t"
+        "eor r12, r12, r9\n\t"
+        "ldrd r8, r9, [%[base], #32]\n\t"
+        "eor r8, r8, lr\n\t"
+        "eor r9, r9, r4\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor lr, lr, r8\n\t"
+        "eor r4, r4, r9\n\t"
+        "ldrd r8, r9, [%[base], #64]\n\t"
+        "eor r8, r8, r5\n\t"
+        "eor r9, r9, r6\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor r5, r5, r8\n\t"
+        "eor r6, r6, r9\n\t"
+        "add %[base], %[base], #0x60\n\t"
+        "mov r7, #0x80000000\n\t"
+        "ror r7, r7, #30\n\t"
+        "ror r7, r7, r10\n\t"
+        "asr r7, r7, #31\n\t"
+        "ldrd r8, r9, [%[base]]\n\t"
+        "eor r8, r8, r3\n\t"
+        "eor r9, r9, r12\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor r3, r3, r8\n\t"
+        "eor r12, r12, r9\n\t"
+        "ldrd r8, r9, [%[base], #32]\n\t"
+        "eor r8, r8, lr\n\t"
+        "eor r9, r9, r4\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor lr, lr, r8\n\t"
+        "eor r4, r4, r9\n\t"
+        "ldrd r8, r9, [%[base], #64]\n\t"
+        "eor r8, r8, r5\n\t"
+        "eor r9, r9, r6\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor r5, r5, r8\n\t"
+        "eor r6, r6, r9\n\t"
+        "add %[base], %[base], #0x60\n\t"
+        "mov r7, #0x80000000\n\t"
+        "ror r7, r7, #29\n\t"
+        "ror r7, r7, r10\n\t"
+        "asr r7, r7, #31\n\t"
+        "ldrd r8, r9, [%[base]]\n\t"
+        "eor r8, r8, r3\n\t"
+        "eor r9, r9, r12\n\t"
+        "and r8, r8, r7\n\t"
+        "and r9, r9, r7\n\t"
+        "eor r3, r3, r8\n\t"
+        "eor r12, r12, r9\n\t"
+        "ldrd r8, r9, [%[base], #32]\n\t"
+        "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor 
r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-19\n\t" + "mov r9, #-1\n\t" + "subs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r]]\n\t" + "strd lr, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #64]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" 
+ "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #8]\n\t" + "strd lr, r4, [%[r], #40]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, 
r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, 
[%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #16]\n\t" + "strd lr, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #80]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + 
"eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, 
r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #0x7fffffff\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbc r9, r9, r6\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #24]\n\t" + "strd lr, r4, [%[r], #56]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Multiply */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b]]\n\t" + "ldr lr, [%[b], #4]\n\t" + /* A[0] * B[0] = 0 */ + "umull r4, r5, r7, r9\n\t" + "str r4, [sp]\n\t" + /* A[0] * B[1] = 1 */ + "umull r3, r6, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[0] = 1 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[2] * B[0] = 2 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + /* A[1] * B[1] = 2 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * B[2] = 2 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * B[3] = 3 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[2] = 3 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[1] = 3 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[0] = 3 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[4] * B[0] = 4 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[1] = 4 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[2] = 4 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[3] = 4 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * B[4] = 4 */ + "ldr r11, [%[b], 
#16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * B[5] = 5 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[4] = 5 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[3] = 5 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[2] = 5 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[1] = 5 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[0] = 5 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[6] * B[0] = 6 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[1] = 6 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[2] = 6 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[3] = 6 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[4] = 6 */ + "ldr r10, [%[a], #8]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[5] = 6 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * B[6] = 6 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * B[7] = 7 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[6] = 7 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[5] = 7 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[4] = 7 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[3] = 7 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[2] = 7 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[1] = 7 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* 
A[7] * B[0] = 7 */ + "ldr r10, [%[a], #28]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + "ldr r7, [%[a], #24]\n\t" + "ldr r9, [%[b], #24]\n\t" + /* A[7] * B[1] = 8 */ + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[2] = 8 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[3] = 8 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[4] = 8 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[5] = 8 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[6] = 8 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[7] = 8 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r8, [%[a], #28]\n\t" + "mov lr, r11\n\t" + /* A[2] * B[7] = 9 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[6] = 9 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[5] = 9 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[4] = 9 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[3] = 9 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[7] * B[2] = 9 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + /* A[7] * B[3] = 10 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[4] = 10 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[5] = 10 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[6] = 10 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[7] = 10 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + /* A[4] * B[7] = 11 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[6] = 11 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + 
"adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[5] = 11 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[4] = 11 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[7] * B[5] = 12 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[6] = 12 */ + "umull r3, r12, r7, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[7] = 12 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * B[7] = 13 */ + "umull r3, r12, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[6] = 13 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * B[7] = 14 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r3, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov lr, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r4, r4, r3\n\t" + "mov %[b], #0\n\t" + "adcs r5, r5, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r5, r5, r3\n\t" + "mov %[b], #0\n\t" + "adcs r6, r6, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r6, r6, r3\n\t" + "mov %[b], #0\n\t" + "adcs r7, r7, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r7, r7, r3\n\t" + "mov %[b], #0\n\t" + "adcs r8, r8, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r8, r8, r3\n\t" + "mov %[b], #0\n\t" + "adcs r9, r9, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r9, r9, r3\n\t" + "mov %[b], #0\n\t" + "adcs r10, r10, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r10, r10, r3\n\t" + "mov %[b], #0\n\t" + "adcs r11, r11, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r11, r11, r3\n\t" + "adc r3, r12, %[b]\n\t" + /* Overflow */ + "lsl r3, r3, #1\n\t" + "orr r3, r3, r11, lsr #31\n\t" + "mul r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, 
r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r3, r11, #31\n\t" + "and r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, 
r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r9, r11\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] 
= 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, 
r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[a], #16]\n\t" + "ldrd r8, r9, [%[a], #24]\n\t" + "movw lr, #0xdb42\n\t" + "movt lr, #1\n\t" + "umull r2, r10, r2, lr\n\t" + "umull r3, r12, r3, lr\n\t" + "adds r3, r3, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r4, r12, r4, lr\n\t" + "adds r4, r4, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r5, r12, r5, lr\n\t" + "adds r5, r5, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r6, r12, r6, lr\n\t" + "adds r6, r6, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r7, r12, r7, lr\n\t" + "adds r7, r7, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r8, r12, r8, lr\n\t" + "adds r8, r8, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r9, r12, r9, lr\n\t" + "adds r9, r9, r10\n\t" + "adc r10, r12, #0\n\t" + "mov lr, #19\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "mul r10, r10, lr\n\t" + "and r9, r9, #0x7fffffff\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square * 2 */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, 
r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r9, r11\n\t" + "adds 
r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Double and Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #30\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, lsr #31\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "lsl r9, r9, #1\n\t" + "orr r9, r9, r8, lsr #31\n\t" + "lsl r8, r8, #1\n\t" + "orr r8, r8, r7, lsr #31\n\t" + "lsl r7, r7, #1\n\t" + "orr r7, r7, r6, lsr #31\n\t" + "lsl r6, r6, #1\n\t" + "orr r6, r6, r5, lsr #31\n\t" + "lsl r5, r5, #1\n\t" + "orr r5, r5, r4, lsr #31\n\t" + "lsl r4, r4, #1\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], 
[sp, #40]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x88\n\t" + /* Invert */ + "str %[r], [sp, #128]\n\t" + "str %[a], [sp, #132]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #132]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #132]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_invert1_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert1_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_invert2_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert2_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" 
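The eight counted squaring loops in `fe_invert` (4, 9, 19, 10, 49, 0x63, 50 and 5 iterations, several preceded by one extra `fe_sq`) encode a Fermat inversion: z^(p-2) mod p with p = 2^255 - 19, built from the classic ref10-style addition chain. A minimal C sketch of the same chain, assuming `fe` is an array typedef as in fe_operations.h and reusing the `fe_sq`/`fe_mul` routines defined earlier in this file (the temporary names are mine):

```c
static void fe_invert_sketch(fe out, const fe z)
{
    fe t0, t1, t2, t3;
    int i;

    fe_sq(t0, z);                                    /* z^2  */
    fe_sq(t1, t0); fe_sq(t1, t1);                    /* z^8  */
    fe_mul(t1, z, t1);                               /* z^9  */
    fe_mul(t0, t0, t1);                              /* z^11 */
    fe_sq(t2, t0);                                   /* z^22 */
    fe_mul(t1, t1, t2);                              /* z^(2^5 - 1)   */
    fe_sq(t2, t1);  for (i = 0; i < 4; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                              /* z^(2^10 - 1)  */
    fe_sq(t2, t1);  for (i = 0; i < 9; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);                              /* z^(2^20 - 1)  */
    fe_sq(t3, t2);  for (i = 0; i < 19; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);                              /* z^(2^40 - 1)  */
    for (i = 0; i < 10; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                              /* z^(2^50 - 1)  */
    fe_sq(t2, t1);  for (i = 0; i < 49; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);                              /* z^(2^100 - 1) */
    fe_sq(t3, t2);  for (i = 0; i < 99; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);                              /* z^(2^200 - 1) */
    for (i = 0; i < 50; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                              /* z^(2^250 - 1) */
    for (i = 0; i < 5; i++) fe_sq(t1, t1);           /* z^(2^255 - 32) */
    fe_mul(out, t1, t0);                             /* z^(2^255 - 21) = z^(p-2) */
}
```

The stack slots sp, sp+32, sp+0x40 and sp+0x60 in the assembly play the roles of t0..t3, and saving `r`/`a` at sp+128/sp+132 keeps them live across the `bl` calls.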
+ "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_invert3_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert3_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_invert4_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert4_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_invert5_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert5_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_invert6_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert6_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert7_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert8_%=\n\t" + "ldr r0, [sp, #128]\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #132]\n\t" + "ldr %[r], [sp, #128]\n\t" + "add sp, sp, #0x88\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xbc\n\t" + "str %[r], [sp, #160]\n\t" + "str %[n], [sp, #164]\n\t" + "str %[a], [sp, #168]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #172]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [%[r]]\n\t" + "strd r10, r10, [%[r], #8]\n\t" + "strd r10, r10, [%[r], #16]\n\t" + "strd r10, r10, [%[r], #24]\n\t" + /* Set zero */ + "mov r10, #0\n\t" + "strd r10, r10, [sp]\n\t" + "strd r10, r10, [sp, #8]\n\t" + "strd r10, r10, [sp, #16]\n\t" + "strd r10, r10, [sp, #24]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [sp, #32]\n\t" + "strd r10, r10, [sp, #40]\n\t" + "strd r10, r10, [sp, #48]\n\t" + "strd r10, r10, [sp, #56]\n\t" + /* Copy */ + "ldrd r4, r5, [%[a]]\n\t" + "ldrd r6, r7, [%[a], #8]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[a], #16]\n\t" + "ldrd r6, r7, [%[a], #24]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "mov %[n], #30\n\t" + "str %[n], [sp, #180]\n\t" + "mov %[a], #28\n\t" + "str %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words_%=: \n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "ldr %[n], [sp, #164]\n\t" + "ldr %[a], [%[n], r2]\n\t" + "ldr %[n], [sp, #180]\n\t" + "lsr %[a], %[a], %[n]\n\t" + "and %[a], %[a], #1\n\t" + "str %[a], [sp, #184]\n\t" + "ldr %[n], [sp, #172]\n\t" + "eor %[n], %[n], %[a]\n\t" 
+ "str %[n], [sp, #172]\n\t" + "ldr %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [sp, #64]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #16]\n\t" + "strd r6, r7, [sp, #80]\n\t" + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #24]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "ldr %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp]\n\t" + "strd r6, r7, [sp, #32]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #8]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #16]\n\t" + "strd r6, r7, [sp, #48]\n\t" + "ldrd r4, r5, [sp, #24]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #24]\n\t" + "strd r6, r7, [sp, #56]\n\t" + "ldr %[n], [sp, #184]\n\t" + "str %[n], [sp, #172]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r]]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #128]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #136]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #16]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, 
#-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #144]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [%[r]]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r], #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [%[r], #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #128]\n\t" + "ldrd r4, r5, [sp, #136]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #144]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #152]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #64]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #96]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #72]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #104]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #80]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #112]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #88]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd 
r4, r5, [sp, #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #96]\n\t" + "ldrd r4, r5, [sp, #104]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #112]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #120]\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #160]\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #32]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #64]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #40]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #72]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #8]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #48]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #80]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #16]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #56]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #64]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "ldrd r4, r5, [sp, #72]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #72]\n\t" + "ldrd r4, r5, [sp, #80]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #88]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #16]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #24]\n\t" + "add 
r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + /* Sub */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #96]\n\t" + "ldrd r10, r11, [sp, #104]\n\t" + "subs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #128]\n\t" + "strd r10, r11, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "ldrd r6, r7, [sp, #152]\n\t" + "ldrd r8, r9, [sp, #112]\n\t" + "ldrd r10, r11, [sp, #120]\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, %[a]\n\t" + "adcs r8, r8, %[a]\n\t" + "adcs r9, r9, %[a]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r4, r5, [sp, #128]\n\t" + "strd r6, r7, [sp, #136]\n\t" + "strd r8, r9, [sp, #144]\n\t" + "strd r10, r11, [sp, #152]\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_sq\n\t" + /* Multiply by 121666 */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #144]\n\t" + "ldrd r10, r11, [sp, #152]\n\t" + "movw r12, #0xdb42\n\t" + "movt r12, #1\n\t" + "umull r4, %[a], r4, r12\n\t" + "umull r5, r3, r5, r12\n\t" + "adds r5, r5, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r6, r3, r6, r12\n\t" + "adds r6, r6, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r7, r3, r7, r12\n\t" + "adds r7, r7, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r8, r3, r8, r12\n\t" + "adds r8, r8, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r9, r3, r9, r12\n\t" + "adds r9, r9, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r10, r3, r10, r12\n\t" + "adds r10, r10, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r11, r3, r11, r12\n\t" + "adds r11, r11, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "mov r12, #19\n\t" + "lsl %[a], %[a], #1\n\t" + "orr %[a], %[a], r11, lsr #31\n\t" + "mul %[a], %[a], r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, %[a]\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "strd r4, r5, [sp, #32]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "strd r8, r9, [sp, #48]\n\t" + "strd r10, r11, [sp, #56]\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq\n\t" + /* Add */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "ldrd r8, r9, [sp, #32]\n\t" + "ldrd r10, r11, [sp, #40]\n\t" + "adds r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #96]\n\t" + "strd r10, r11, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "ldrd r8, r9, [sp, #48]\n\t" + "ldrd r10, r11, [sp, #56]\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbcs r7, r7, %[a]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbcs r10, r10, %[a]\n\t" + 
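The inline "Multiply by 121666" above (movw/movt materializes 0x1db42 = 121666 = (486662 + 2)/4, the Curve25519 ladder constant) also shows this file's reduction idiom in miniature: since p = 2^255 - 19, anything at or above bit 255 is congruent to 19 times itself, so the code doubles the word-level carry, pulls in bit 255, multiplies by 19 and adds the product back at the bottom. A sketch of the same computation over a plain 8x32-bit array (hypothetical helper; like the assembly it leaves the result only partially reduced, below 2^256 rather than below p):

```c
#include <stdint.h>

static void fe_mul121666_sketch(uint32_t r[8], const uint32_t a[8])
{
    uint64_t c = 0;
    int i;

    /* schoolbook multiply by the single word 121666 */
    for (i = 0; i < 8; i++) {
        c += (uint64_t)a[i] * 121666u;
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    /* fold bits 255 and up back in: 2^255 == 19 (mod p) */
    c = (c << 1) | (r[7] >> 31);
    r[7] &= 0x7fffffffu;
    c *= 19;
    for (i = 0; i < 8 && c != 0; i++) {
        c += r[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}
```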
"sbc r11, r11, r12\n\t" + "strd r4, r5, [sp, #96]\n\t" + "strd r6, r7, [sp, #104]\n\t" + "strd r8, r9, [sp, #112]\n\t" + "strd r10, r11, [sp, #120]\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #168]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #176]\n\t" + "ldr %[n], [sp, #180]\n\t" + "subs %[n], %[n], #1\n\t" + "str %[n], [sp, #180]\n\t" + "bge L_curve25519_bits_%=\n\t" + "mov %[n], #31\n\t" + "str %[n], [sp, #180]\n\t" + "subs %[a], %[a], #4\n\t" + "str %[a], [sp, #176]\n\t" + "bge L_curve25519_words_%=\n\t" + /* Invert */ + "add r0, sp, #32\n\t" + "add r1, sp, #0\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add r0, sp, 
#0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add r0, sp, #0\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #160]\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xbc\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x68\n\t" + /* pow22523 */ + "str %[r], [sp, #96]\n\t" + "str %[a], [sp, #100]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #100]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #100]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_pow22523_1_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_1_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_pow22523_2_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_2_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_3_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_pow22523_4_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_4_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_pow22523_5_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_5_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_pow22523_6_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_6_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_7_%=\n\t" + "mov r0, 
sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "mov r4, #2\n\t" + "\n" + "L_fe_pow22523_8_%=: \n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_8_%=\n\t" + "ldr r0, [sp, #96]\n\t" + "mov r1, sp\n\t" + "ldr r2, [sp, #100]\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #100]\n\t" + "ldr %[r], [sp, #96]\n\t" + "add sp, sp, #0x68\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[px], [sp, #12]\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #32]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #92]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r2, [sp, #92]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, 
[sp, #4]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r2]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r2, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r2, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r2, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r2, [sp, #4]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + 
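Every add and subtract in `fe_ge_dbl` (and in the group operations that follow) ends with the same branch-free correction seen here: an arithmetic shift of the top word produces an all-ones mask exactly when the 256-bit result went negative (or overflowed, for adds), and that mask is ANDed with the words of p = 2^255 - 19, namely 0xffffffed, six times 0xffffffff, and 0x7fffffff, before the conditional add-back or subtract-off. A C sketch of the subtract-then-correct case, as a hypothetical helper, assuming both inputs are already below 2^255:

```c
#include <stdint.h>

static void fe_sub_sketch(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
{
    int64_t d, borrow = 0;
    uint64_t t;
    uint32_t sign;
    int i;

    for (i = 0; i < 8; i++) {                 /* raw subtract with borrow */
        d = (int64_t)a[i] - b[i] + borrow;
        r[i] = (uint32_t)d;
        borrow = d >> 32;                     /* 0 or -1 */
    }
    sign = (uint32_t)((int32_t)r[7] >> 31);   /* all-ones iff negative */
    /* add p back in, masked: p = {0xffffffed, 0xffffffff x 6, 0x7fffffff} */
    t = (uint64_t)r[0] + (sign & 0xffffffedu);
    r[0] = (uint32_t)t;  t >>= 32;
    for (i = 1; i < 7; i++) {
        t += (uint64_t)r[i] + sign;
        r[i] = (uint32_t)t;  t >>= 32;
    }
    t += (uint64_t)r[7] + (sign & 0x7fffffffu);
    r[7] = (uint32_t)t;
}
```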
"adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, [sp, #96]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq2\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "ldrd r7, r8, [r1]\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, 
#16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + 
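The "Add-Sub" blocks in `fe_ge_madd` compute t+u and t-u in a single pass over the limbs. ARM has only one carry flag, so between limb pairs the code parks the add's carry (r12) and the subtract's not-borrow (lr) as 0/1 values and rematerializes the flag with `adds rX, rX, #-1`: adding 0xffffffff to 1 wraps to 0 and sets carry, adding it to 0 gives 0xffffffff and clears it. In C the fusion looks like this (hypothetical helper; mathematical borrow convention rather than ARM's inverted carry; the sign-mask modulus correction shown earlier is omitted):

```c
#include <stdint.h>

static void fe_add_sub_sketch(uint32_t sum[8], uint32_t diff[8],
                              const uint32_t a[8], const uint32_t b[8])
{
    uint32_t carry = 0, borrow = 0;
    int i;
    for (i = 0; i < 8; i++) {
        uint64_t s = (uint64_t)a[i] + b[i] + carry;
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        sum[i]  = (uint32_t)s;
        diff[i] = (uint32_t)d;
        carry   = (uint32_t)(s >> 32);   /* 0 or 1 */
        borrow  = (uint32_t)(d >> 63);   /* 1 when the subtract went negative */
    }
}
```

Interleaving the two walks means each pair of input limbs is loaded from memory once instead of twice, which is where the fused form pays off on a register-starved 32-bit core.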
"adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #112]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], 
r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, 
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #8]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #16]\n\t"
+        "ldrd r5, r6, [r0, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #24]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "ldr r1, [sp, #112]\n\t"
+        /* Double */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r1, #16]\n\t"
+        "ldrd r9, r10, [r1, #24]\n\t"
+        "adds %[rt], %[rt], %[rt]\n\t"
+        "adcs r4, r4, r4\n\t"
+        "adcs r5, r5, r5\n\t"
+        "adcs r6, r6, r6\n\t"
+        "adcs r7, r7, r7\n\t"
+        "adcs r8, r8, r8\n\t"
+        "adcs r9, r9, r9\n\t"
+        "adc r10, r10, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "ldr r1, [sp, #8]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r0]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + 
"mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #192]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #196]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #180]\n\t" + "ldr r1, [sp, #188]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, 
#0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the 
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r0, #16]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0, #16]\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbc r8, r8, lr\n\t"
+        "strd r7, r8, [r0, #24]\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1]\n\t"
+        "ldrd %[rt], r4, [r1, #8]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "adcs %[rt], %[rt], r11\n\t"
+        "adcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r1, #16]\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd r9, r10, [r1, #24]\n\t"
+        "add sp, sp, #0x60\n\t"
+        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+        :
+        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+    );
+    (void)qz;
+    (void)qt2d;
+    (void)qyplusx;
+    (void)qyminusx;
+}
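fe_ge_add works against a cached point stored as (Z2, 2dT2, Y2+X2, Y2-X2); compared with the mixed forms it needs one extra multiply, Z1*Z2, which the code then doubles into the sp+16 scratch slot. The same kind of hedged field-op sketch (temporary t0 illustrative, ordering simplified relative to the assembly):

    /* Sketch only, not part of the patch. */
    void ge_add_sketch(fe rx, fe ry, fe rz, fe rt,
                       const fe px, const fe py, const fe pz, const fe pt,
                       const fe qz, const fe qt2d,
                       const fe qyplusx, const fe qyminusx)
    {
        fe t0;
        fe_add(rx, py, px);        /* A  = Y1 + X1        */
        fe_sub(ry, py, px);        /* B  = Y1 - X1        */
        fe_mul(rz, rx, qyplusx);   /* A' = A * (Y2 + X2)  */
        fe_mul(ry, ry, qyminusx);  /* B' = B * (Y2 - X2)  */
        fe_mul(rt, qt2d, pt);      /* C  = T1 * 2dT2      */
        fe_mul(rx, pz, qz);        /* Z1 * Z2             */
        fe_add(t0, rx, rx);        /* D  = 2 * Z1*Z2      */
        fe_sub(rx, rz, ry);        /* X3 = A' - B'        */
        fe_add(ry, rz, ry);        /* Y3 = A' + B'        */
        fe_add(rz, t0, rt);        /* Z3 = D + C          */
        fe_sub(rt, t0, rt);        /* T3 = D - C          */
    }

fe_ge_sub below is the same routine with qyplusx and qyminusx trading places in the first two multiplies and the final Add-Sub writing T3 = D + C and Z3 = D - C instead.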
+
+void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+    __asm__ __volatile__ (
+        "sub sp, sp, #0x60\n\t"
+        "str %[rx], [sp]\n\t"
+        "str %[ry], [sp, #4]\n\t"
+        "str %[rz], [sp, #8]\n\t"
+        "str %[rt], [sp, #12]\n\t"
+        "ldr r0, [sp]\n\t"
+        "ldr r1, [sp, #172]\n\t"
+        "ldr r2, [sp, #168]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "adds r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "adcs r7, %[rt], r7\n\t"
+        "adcs r8, r4, r8\n\t"
+        "adcs r9, r5, r9\n\t"
+        "adc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp, #172]\n\t"
+        "ldr r2, [sp, #168]\n\t"
+        /* Sub */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r2]\n\t"
+        "ldrd r9, r10, [r2, #8]\n\t"
+        "subs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbcs r10, r6, r10\n\t"
+        "strd r7, r8, [r0]\n\t"
+        "strd r9, r10, [r0, #8]\n\t"
+        "ldrd %[rt], r4, [r1, #16]\n\t"
+        "ldrd r5, r6, [r1, #24]\n\t"
+        "ldrd r7, r8, [r2, #16]\n\t"
+        "ldrd r9, r10, [r2, #24]\n\t"
+        "sbcs r7, %[rt], r7\n\t"
+        "sbcs r8, r4, r8\n\t"
+        "sbcs r9, r5, r9\n\t"
+        "sbc r10, r6, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Add modulus (if underflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds %[rt], %[rt], r12\n\t"
+        "adcs r4, r4, r11\n\t"
+        "adcs r5, r5, r11\n\t"
+        "adcs r6, r6, r11\n\t"
+        "adcs r7, r7, r11\n\t"
+        "adcs r8, r8, r11\n\t"
+        "adcs r9, r9, r11\n\t"
+        "adc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        "ldr r2, [sp, #196]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r0, [sp, #8]\n\t"
+        "bl fe_mul\n\t"
+        "ldr r2, [sp, #192]\n\t"
+        "ldr r1, [sp, #4]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "bl fe_mul\n\t"
+        "ldr r2, [sp, #180]\n\t"
+        "ldr r1, [sp, #188]\n\t"
+        "ldr r0, [sp, #12]\n\t"
+        "bl fe_mul\n\t"
+        "ldr r2, [sp, #184]\n\t"
+        "ldr r1, [sp, #176]\n\t"
+        "ldr r0, [sp]\n\t"
+        "bl fe_mul\n\t"
+        "add r0, sp, #16\n\t"
+        "ldr r1, [sp]\n\t"
+        /* Double */
+        "ldrd %[rt], r4, [r1]\n\t"
+        "ldrd r5, r6, [r1, #8]\n\t"
+        "ldrd r7, r8, [r1, #16]\n\t"
+        "ldrd r9, r10, [r1, #24]\n\t"
+        "adds %[rt], %[rt], %[rt]\n\t"
+        "adcs r4, r4, r4\n\t"
+        "adcs r5, r5, r5\n\t"
+        "adcs r6, r6, r6\n\t"
+        "adcs r7, r7, r7\n\t"
+        "adcs r8, r8, r8\n\t"
+        "adcs r9, r9, r9\n\t"
+        "adc r10, r10, r10\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r10, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "sbcs r5, r5, r11\n\t"
+        "sbcs r6, r6, r11\n\t"
+        "sbcs r7, r7, r11\n\t"
+        "sbcs r8, r8, r11\n\t"
+        "sbcs r9, r9, r11\n\t"
+        "sbc r10, r10, lr\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "strd r5, r6, [r0, #8]\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        "strd r9, r10, [r0, #24]\n\t"
+        "ldr r0, [sp, #4]\n\t"
+        "ldr r1, [sp]\n\t"
+        "ldr r2, [sp, #8]\n\t"
+        /* Add-Sub */
+        /* Add */
+        "ldrd %[rt], r4, [r2]\n\t"
+        "ldrd r5, r6, [r0]\n\t"
+        "adds r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0]\n\t"
+        /* Sub */
+        "subs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #8]\n\t"
+        "ldrd r5, r6, [r0, #8]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #8]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #8]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #16]\n\t"
+        "ldrd r5, r6, [r0, #16]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "mov r12, #0\n\t"
+        "adcs r8, r4, r6\n\t"
+        "adc r12, r12, #0\n\t"
+        "strd r7, r8, [r0, #16]\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "mov lr, #0\n\t"
+        "sbcs r10, r4, r6\n\t"
+        "adc lr, lr, #0\n\t"
+        "strd r9, r10, [r1, #16]\n\t"
+        /* Add */
+        "ldrd %[rt], r4, [r2, #24]\n\t"
+        "ldrd r5, r6, [r0, #24]\n\t"
+        "adds r12, r12, #-1\n\t"
+        "adcs r7, %[rt], r5\n\t"
+        "adc r8, r4, r6\n\t"
+        /* Sub */
+        "adds lr, lr, #-1\n\t"
+        "sbcs r9, %[rt], r5\n\t"
+        "sbc r10, r4, r6\n\t"
+        "mov r12, #-19\n\t"
+        "asr r11, r8, #31\n\t"
+        /* Mask the modulus */
+        "and r12, r11, r12\n\t"
+        "and lr, r11, #0x7fffffff\n\t"
+        /* Sub modulus (if overflow) */
+        "ldrd %[rt], r4, [r0]\n\t"
+        "subs %[rt], %[rt], r12\n\t"
+        "sbcs r4, r4, r11\n\t"
+        "strd %[rt], r4, [r0]\n\t"
+        "ldrd %[rt], r4, [r0, #8]\n\t"
+        "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : 
[rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +#endif /* WOLFSSL_ARMASM */ +#endif /* !__aarch64__ */ |