| author | auth12 <[email protected]> | 2020-07-19 11:57:04 -0700 |
|---|---|---|
| committer | GitHub <[email protected]> | 2020-07-19 11:57:04 -0700 |
| commit | 1bae439a35a3aadca6772716aaeea8c8a0991114 (patch) | |
| tree | f8eab7a7bae237ad697feecfae26b17bab91b16e | /client/wolfssl/wolfcrypt/src/port/arm |
| parent | More placeholders and general plan. (diff) | |
| parent | Merge branch 'master' into windows (diff) | |
| download | loader-1bae439a35a3aadca6772716aaeea8c8a0991114.tar.xz | loader-1bae439a35a3aadca6772716aaeea8c8a0991114.zip |
Merge pull request #1 from auth12/windows
Windows
Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm')
15 files changed, 48580 insertions, 0 deletions
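The new files under `client/wolfssl/wolfcrypt/src/port/arm` are wolfSSL's generated 32-bit ARM assembly for Curve25519: the `fe_*` field routines and the `curve25519` ladder shown in the diff below. A field element is kept as eight little-endian 32-bit words holding a value modulo p = 2^255 - 19. As a reading aid, here is a minimal C sketch of that layout and of the canonical reduction performed by `fe_tobytes` (trial-add 19, use the carry into bit 255 to decide whether the value is >= p, then fold it back). The names, the `_sketch` suffix, and the little-endian byte order are illustrative assumptions, not the wolfSSL API.

```c
#include <stdint.h>
#include <string.h>

/* Eight little-endian 32-bit words holding a value < 2^255,
 * interpreted modulo p = 2^255 - 19 (illustrative layout). */
typedef uint32_t fe_t[8];

static void fe_to_bytes_sketch(uint8_t out[32], const fe_t x)
{
    uint32_t t[8];
    uint64_t c = 19;

    /* Trial addition x + 19: the carry into bit 255 is set iff x >= p. */
    for (int i = 0; i < 7; i++)
        c = (c + x[i]) >> 32;              /* keep only the running carry   */
    uint32_t top = (uint32_t)(c + x[7]);   /* top word of x + 19            */
    uint32_t add = (top >> 31) * 19u;      /* 19 if x >= p, else 0          */

    /* Add 19 for real when needed, then drop bit 255 (i.e. subtract p). */
    c = add;
    for (int i = 0; i < 8; i++) {
        c += x[i];
        t[i] = (uint32_t)c;
        c >>= 32;
    }
    t[7] &= 0x7fffffffu;                   /* clear bit 255                  */

    memcpy(out, t, 32);                    /* little-endian host assumed     */
}
```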
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S new file mode 100644 index 0000000..6fd1ed3 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -0,0 +1,6012 @@ +/* armv8-32-curve25519 + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S + */ + +#ifdef WOLFSSL_ARMASM +#ifndef __aarch64__ + .text + .align 2 + .globl fe_init + .type fe_init, %function +fe_init: + bx lr + .size fe_init,.-fe_init + .text + .align 2 + .globl fe_frombytes + .type fe_frombytes, %function +fe_frombytes: + push {r4, r5, r6, r7, lr} + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + ldrd r4, r5, [r1, #16] + ldrd r6, r7, [r1, #24] + and r7, r7, #0x7fffffff + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + strd r4, r5, [r0, #16] + strd r6, r7, [r0, #24] + pop {r4, r5, r6, r7, pc} + .size fe_frombytes,.-fe_frombytes + .text + .align 2 + .globl fe_tobytes + .type fe_tobytes, %function +fe_tobytes: + push {r4, r5, r6, r7, r8, lr} + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + ldrd r4, r5, [r1, #16] + ldrd r6, r7, [r1, #24] + adds r8, r2, #19 + adcs r8, r3, #0 + adcs r8, r12, #0 + adcs r8, lr, #0 + adcs r8, r4, #0 + adcs r8, r5, #0 + adcs r8, r6, #0 + adc r8, r7, #0 + asr r8, r8, #31 + and r8, r8, #19 + adds r2, r2, r8 + adcs r3, r3, #0 + adcs r12, r12, #0 + adcs lr, lr, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + and r7, r7, #0x7fffffff + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + strd r4, r5, [r0, #16] + strd r6, r7, [r0, #24] + pop {r4, r5, r6, r7, r8, pc} + .size fe_tobytes,.-fe_tobytes + .text + .align 2 + .globl fe_1 + .type fe_1, %function +fe_1: + # Set one + mov r2, #1 + mov r1, #0 + str r2, [r0] + str r1, [r0, #4] + str r1, [r0, #8] + str r1, [r0, #12] + str r1, [r0, #16] + str r1, [r0, #20] + str r1, [r0, #24] + str r1, [r0, #28] + bx lr + .size fe_1,.-fe_1 + .text + .align 2 + .globl fe_0 + .type fe_0, %function +fe_0: + # Set zero + mov r1, #0 + str r1, [r0] + str r1, [r0, #4] + str r1, [r0, #8] + str r1, [r0, #12] + str r1, [r0, #16] + str r1, [r0, #20] + str r1, [r0, #24] + str r1, [r0, #28] + bx lr + .size fe_0,.-fe_0 + .text + .align 2 + .globl fe_copy + .type fe_copy, %function +fe_copy: + push {lr} + # Copy + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + ldrd r2, r3, [r1, #16] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + strd r2, r3, [r0, #16] + str r12, [r0, #24] + str lr, [r0, #28] + pop {pc} + .size fe_copy,.-fe_copy + .text + .align 2 + 
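Immediately below, `fe_sub` and `fe_add` do a limb-wise subtract/add followed by a branch-free correction: the sign (or carry) of the top word selects a masked copy of the modulus to add back or subtract. A minimal C sketch of the addition case, with hypothetical names; the subtraction case mirrors it by adding p back when the difference underflows:

```c
/* Modular addition in the style of fe_add: limb-wise add with carry, then one
 * branch-free conditional subtraction of p = 2^255 - 19 when the sum reaches
 * bit 255. Hypothetical helper, not the wolfSSL function itself. */
static void fe_add_sketch(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
{
    static const uint32_t p[8] = {
        0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
        0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
    };
    uint64_t c = 0;
    for (int i = 0; i < 8; i++) {
        c += (uint64_t)a[i] + b[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    uint32_t mask = 0u - (r[7] >> 31);     /* all-ones iff bit 255 is set    */
    uint64_t borrow = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t d = (uint64_t)r[i] - (p[i] & mask) - borrow;
        r[i] = (uint32_t)d;
        borrow = d >> 63;                  /* 1 on underflow of this limb    */
    }
}
```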
.globl fe_sub + .type fe_sub, %function +fe_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Sub + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r2] + ldrd r8, r9, [r2, #8] + subs r6, r12, r6 + sbcs r7, lr, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + strd r6, r7, [r0] + strd r8, r9, [r0, #8] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + ldrd r4, r5, [r1, #24] + ldrd r6, r7, [r2, #16] + ldrd r8, r9, [r2, #24] + sbcs r6, r12, r6 + sbcs r7, lr, r7 + sbcs r8, r4, r8 + sbc r9, r5, r9 + mov r10, #-19 + asr r3, r9, #31 + # Mask the modulus + and r10, r3, r10 + and r11, r3, #0x7fffffff + # Add modulus (if underflow) + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + adds r12, r12, r10 + adcs lr, lr, r3 + adcs r4, r4, r3 + adcs r5, r5, r3 + adcs r6, r6, r3 + adcs r7, r7, r3 + adcs r8, r8, r3 + adc r9, r9, r11 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_sub,.-fe_sub + .text + .align 2 + .globl fe_add + .type fe_add, %function +fe_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Add + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r2] + ldrd r8, r9, [r2, #8] + adds r6, r12, r6 + adcs r7, lr, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + strd r6, r7, [r0] + strd r8, r9, [r0, #8] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + ldrd r4, r5, [r1, #24] + ldrd r6, r7, [r2, #16] + ldrd r8, r9, [r2, #24] + adcs r6, r12, r6 + adcs r7, lr, r7 + adcs r8, r4, r8 + adc r9, r5, r9 + mov r10, #-19 + asr r3, r9, #31 + # Mask the modulus + and r10, r3, r10 + and r11, r3, #0x7fffffff + # Sub modulus (if overflow) + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + subs r12, r12, r10 + sbcs lr, lr, r3 + sbcs r4, r4, r3 + sbcs r5, r5, r3 + sbcs r6, r6, r3 + sbcs r7, r7, r3 + sbcs r8, r8, r3 + sbc r9, r9, r11 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function +fe_neg: + push {r4, r5, lr} + mov r5, #-1 + mov r4, #-19 + ldrd r2, r3, [r1] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + subs r2, r4, r2 + sbcs r3, r5, r3 + sbcs r12, r5, r12 + sbcs lr, r5, lr + strd r2, r3, [r0] + str r12, [r0, #8] + str lr, [r0, #12] + mov r4, #0x7fffffff + ldrd r2, r3, [r1, #16] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + sbcs r2, r5, r2 + sbcs r3, r5, r3 + sbcs r12, r5, r12 + sbc lr, r4, lr + strd r2, r3, [r0, #16] + str r12, [r0, #24] + str lr, [r0, #28] + pop {r4, r5, pc} + .size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function +fe_isnonzero: + push {r4, r5, r6, r7, r8, lr} + ldrd r2, r3, [r0] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + adds r1, r2, #19 + adcs r1, r3, #0 + adcs r1, r12, #0 + adcs r1, lr, #0 + adcs r1, r4, #0 + adcs r1, r5, #0 + adcs r1, r6, #0 + adc r1, r7, #0 + asr r1, r1, #31 + and r1, r1, #19 + adds r2, r2, r1 + adcs r3, r3, #0 + adcs r12, r12, #0 + adcs lr, lr, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + and r7, r7, #0x7fffffff + orr r2, r2, r3 + orr r12, r12, lr + orr r4, r4, r5 + orr r6, r6, r7 + orr r12, r12, r4 + orr r2, r2, r6 + orr r0, r2, r12 + pop {r4, r5, r6, r7, r8, pc} + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function +fe_isnegative: + push {lr} + ldrd r2, r3, [r0] + 
ldr r12, [r0, #8] + ldr lr, [r0, #12] + adds r1, r2, #19 + adcs r1, r3, #0 + adcs r1, r12, #0 + adcs r1, lr, #0 + ldrd r2, r3, [r0, #16] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + adcs r1, r2, #0 + adcs r1, r3, #0 + adcs r1, r12, #0 + ldr r2, [r0] + adc r1, lr, #0 + and r0, r2, #1 + lsr r1, r1, #31 + eor r0, r0, r1 + pop {pc} + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sxtb r2, r2 + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #1 + mov r12, #0 + mov lr, #1 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, 
r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #32] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #64] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-19 + mov r9, #-1 + subs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0] + str r12, [r0, #4] + str lr, [r0, #32] + str r4, [r0, #36] + str r5, [r0, #64] + str r6, [r0, #68] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 
+ and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #8] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #40] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #72] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #-1 + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0, #8] + str r12, [r0, #12] + str lr, [r0, #40] + str r4, [r0, #44] + str r5, [r0, #72] + str r6, [r0, #76] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 
+ eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #16] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #48] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #80] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #-1 + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbcs r9, r9, r6 + sbc r11, r11, r11 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0, #16] + str r12, [r0, #20] + str lr, [r0, #48] + str r4, [r0, #52] + str r5, [r0, #80] + str r6, [r0, #84] + sbfx r7, r2, #7, #1 + eor r10, r2, r7 + sub r10, r10, r7 + mov r3, #0 + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0x80000000 + ror r7, r7, #31 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + 
eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #30 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #29 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #28 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #27 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #26 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #25 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + eor r5, r5, r8 + eor r6, r6, r9 + add r1, r1, #0x60 + mov r7, #0x80000000 + ror r7, r7, #24 + ror r7, r7, r10 + asr r7, r7, #31 + ldrd r8, r9, [r1, #24] + eor r8, r8, r3 + eor r9, r9, r12 + and r8, r8, r7 + and r9, r9, r7 + eor r3, r3, r8 + eor r12, r12, r9 + ldrd r8, r9, [r1, #56] + eor r8, r8, lr + eor r9, r9, r4 + and r8, r8, r7 + and r9, r9, r7 + eor lr, lr, r8 + eor r4, r4, r9 + ldrd r8, r9, [r1, #88] + eor r8, r8, r5 + eor r9, r9, r6 + and r8, r8, r7 + and r9, r9, r7 + 
eor r5, r5, r8 + eor r6, r6, r9 + sub r1, r1, #0x2a0 + mov r8, #-1 + mov r9, #0x7fffffff + rsbs r11, r11, #0 + sbcs r8, r8, r5 + sbc r9, r9, r6 + asr r10, r2, #31 + eor r7, r3, lr + and r7, r7, r10 + eor r3, r3, r7 + eor lr, lr, r7 + eor r7, r12, r4 + and r7, r7, r10 + eor r12, r12, r7 + eor r4, r4, r7 + eor r8, r8, r5 + and r8, r8, r10 + eor r5, r5, r8 + eor r9, r9, r6 + and r9, r9, r10 + eor r6, r6, r9 + str r3, [r0, #24] + str r12, [r0, #28] + str lr, [r0, #56] + str r4, [r0, #60] + str r5, [r0, #88] + str r6, [r0, #92] + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function +fe_mul: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Multiply + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r2] + ldr lr, [r2, #4] + # A[0] * B[0] = 0 + umull r4, r5, r7, r9 + str r4, [sp] + # A[0] * B[1] = 1 + umull r3, r6, r7, lr + adds r5, r5, r3 + adc r6, r6, #0 + # A[1] * B[0] = 1 + umull r3, r12, r8, r9 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #4] + # A[2] * B[0] = 2 + ldr r10, [r1, #8] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adc r4, r4, r12 + # A[1] * B[1] = 2 + umull r3, r12, r8, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[0] * B[2] = 2 + ldr r11, [r2, #8] + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * B[3] = 3 + ldr r11, [r2, #12] + umull r3, r12, r7, r11 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[1] * B[2] = 3 + ldr r11, [r2, #8] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[2] * B[1] = 3 + umull r3, r12, r10, lr + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[0] = 3 + ldr r10, [r1, #12] + umull r3, r12, r10, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #12] + # A[4] * B[0] = 4 + ldr r10, [r1, #16] + umull r3, r12, r10, r9 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[1] = 4 + ldr r10, [r1, #12] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[2] * B[2] = 4 + ldr r10, [r1, #8] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[1] * B[3] = 4 + ldr r11, [r2, #12] + umull r3, r12, r8, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[0] * B[4] = 4 + ldr r11, [r2, #16] + umull r3, r12, r7, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * B[5] = 5 + ldr r11, [r2, #20] + umull r3, r12, r7, r11 + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[1] * B[4] = 5 + ldr r11, [r2, #16] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[2] * B[3] = 5 + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[3] * B[2] = 5 + ldr r10, [r1, #12] + ldr r11, [r2, #8] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[4] * B[1] = 5 + ldr r10, [r1, #16] + umull r3, r12, r10, lr + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[0] = 5 + ldr r10, [r1, #20] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #20] + # A[6] * B[0] = 6 + ldr r10, [r1, #24] + umull r3, r12, r10, r9 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[1] = 6 + ldr r10, [r1, #20] + umull r3, r12, r10, lr + adds r4, 
r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[4] * B[2] = 6 + ldr r10, [r1, #16] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[3] = 6 + ldr r10, [r1, #12] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[2] * B[4] = 6 + ldr r10, [r1, #8] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[1] * B[5] = 6 + ldr r11, [r2, #20] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[0] * B[6] = 6 + ldr r11, [r2, #24] + umull r3, r12, r7, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * B[7] = 7 + ldr r11, [r2, #28] + umull r3, r12, r7, r11 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[1] * B[6] = 7 + ldr r11, [r2, #24] + umull r3, r12, r8, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[2] * B[5] = 7 + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[4] = 7 + ldr r10, [r1, #12] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[4] * B[3] = 7 + ldr r10, [r1, #16] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[5] * B[2] = 7 + ldr r10, [r1, #20] + ldr r11, [r2, #8] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[6] * B[1] = 7 + ldr r10, [r1, #24] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[7] * B[0] = 7 + ldr r10, [r1, #28] + umull r3, r12, r10, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #28] + ldr r7, [r1, #24] + ldr r9, [r2, #24] + # A[7] * B[1] = 8 + umull r3, r12, r10, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[6] * B[2] = 8 + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[3] = 8 + ldr r10, [r1, #20] + ldr r11, [r2, #12] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[4] * B[4] = 8 + ldr r10, [r1, #16] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[3] * B[5] = 8 + ldr r10, [r1, #12] + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[2] * B[6] = 8 + ldr r10, [r1, #8] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[1] * B[7] = 8 + ldr r11, [r2, #28] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r8, [r1, #28] + mov lr, r11 + # A[2] * B[7] = 9 + umull r3, r12, r10, lr + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[3] * B[6] = 9 + ldr r10, [r1, #12] + umull r3, r12, r10, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[4] * B[5] = 9 + ldr r10, [r1, #16] + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[4] = 9 + ldr r10, [r1, #20] + ldr r11, [r2, #16] + umull r3, r12, r10, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[6] * B[3] = 9 + ldr r11, [r2, #12] + umull r3, r12, r7, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[7] * B[2] = 9 + ldr r11, [r2, #8] + umull r3, r12, r8, r11 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #36] + # A[7] * B[3] = 10 + ldr r11, [r2, #12] + 
umull r3, r12, r8, r11 + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[6] * B[4] = 10 + ldr r11, [r2, #16] + umull r3, r12, r7, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[5] * B[5] = 10 + ldr r11, [r2, #20] + umull r3, r12, r10, r11 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[4] * B[6] = 10 + ldr r10, [r1, #16] + umull r3, r12, r10, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[3] * B[7] = 10 + ldr r10, [r1, #12] + umull r3, r12, r10, lr + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #40] + # A[4] * B[7] = 11 + ldr r10, [r1, #16] + umull r3, r12, r10, lr + adds r6, r6, r3 + mov r5, #0 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[5] * B[6] = 11 + ldr r10, [r1, #20] + umull r3, r12, r10, r9 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[6] * B[5] = 11 + umull r3, r12, r7, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + # A[7] * B[4] = 11 + ldr r11, [r2, #16] + umull r3, r12, r8, r11 + adds r6, r6, r3 + adcs r4, r4, r12 + adc r5, r5, #0 + str r6, [sp, #44] + # A[7] * B[5] = 12 + ldr r11, [r2, #20] + umull r3, r12, r8, r11 + adds r4, r4, r3 + mov r6, #0 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[6] * B[6] = 12 + umull r3, r12, r7, r9 + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + # A[5] * B[7] = 12 + umull r3, r12, r10, lr + adds r4, r4, r3 + adcs r5, r5, r12 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * B[7] = 13 + umull r3, r12, r7, lr + adds r5, r5, r3 + mov r4, #0 + adcs r6, r6, r12 + adc r4, r4, #0 + # A[7] * B[6] = 13 + umull r3, r12, r8, r9 + adds r5, r5, r3 + adcs r6, r6, r12 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * B[7] = 14 + umull r3, r12, r8, lr + adds r6, r6, r3 + adc r4, r4, r12 + str r6, [sp, #56] + str r4, [sp, #60] + # Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r3, r11, #31 + and r11, r11, #0x7fffffff + mov lr, #19 + ldr r1, [sp, #32] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + adds r4, r4, r3 + mov r2, #0 + adcs r5, r5, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #36] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r5, r5, r3 + mov r2, #0 + adcs r6, r6, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #40] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r6, r6, r3 + mov r2, #0 + adcs r7, r7, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #44] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r7, r7, r3 + mov r2, #0 + adcs r8, r8, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #48] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r8, r8, r3 + mov r2, #0 + adcs r9, r9, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #52] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r9, r9, r3 + mov r2, #0 + adcs r10, r10, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #56] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + add r12, r12, r2 + adds r10, r10, r3 + mov r2, #0 + adcs r11, r11, r12 + adc r2, r2, #0 + lsr r3, r1, #31 + ldr r1, [sp, #60] + orr r3, r3, r1, lsl #1 + umull r3, r12, lr, r3 + adds r11, r11, r3 + adc r3, r12, r2 + # Overflow + lsl r3, r3, #1 + orr r3, r3, r11, lsr #31 + mul r3, r3, lr + and r11, r11, #0x7fffffff + adds r4, r4, r3 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if 
top bit set + asr r3, r11, #31 + and r3, r3, lr + and r11, r11, #0x7fffffff + adds r4, r4, r3 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function +fe_sq: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Square + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r12, [r1, #16] + # A[0] * A[0] = 0 + umull r4, r5, r7, r7 + str r4, [sp] + # A[0] * A[1] = 1 + umull r2, r3, r7, r8 + mov r6, #0 + adds r5, r5, r2 + adc r6, r6, r3 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #4] + # A[1] * A[1] = 2 + umull r2, r3, r8, r8 + adds r6, r6, r2 + adc r4, r4, r3 + # A[0] * A[2] = 2 + umull r2, r3, r7, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * A[3] = 3 + umull r2, r3, r7, r10 + adds r4, r4, r2 + adc r5, r5, r3 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[2] = 3 + umull r2, r3, r8, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #12] + # A[2] * A[2] = 4 + umull r2, r3, r9, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[3] = 4 + umull r2, r3, r8, r10 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[0] * A[4] = 4 + umull r2, r3, r7, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * A[5] = 5 + ldr r11, [r1, #20] + umull r2, r3, r7, r11 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[4] = 5 + umull r2, r3, r8, r12 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[3] = 5 + umull r2, r3, r9, r10 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #20] + # A[3] * A[3] = 6 + umull r2, r3, r10, r10 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[2] * A[4] = 6 + umull r2, r3, r9, r12 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[5] = 6 + umull r2, r3, r8, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[0] * A[6] = 6 + ldr r11, [r1, #24] + umull r2, r3, r7, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * A[7] = 7 + ldr r11, [r1, #28] + umull r2, r3, r7, r11 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[6] = 7 + ldr r11, [r1, #24] + umull r2, r3, r8, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[2] * A[5] = 7 + ldr r11, [r1, #20] + umull r2, r3, r9, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[4] = 7 + umull 
r2, r3, r10, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #28] + # A[4] * A[4] = 8 + umull r2, r3, r12, r12 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[3] * A[5] = 8 + umull r2, r3, r10, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[6] = 8 + ldr r11, [r1, #24] + umull r2, r3, r9, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[7] = 8 + ldr r11, [r1, #28] + umull r2, r3, r8, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r7, [r1, #20] + # A[2] * A[7] = 9 + umull r2, r3, r9, r11 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[3] * A[6] = 9 + ldr r11, [r1, #24] + umull r2, r3, r10, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[4] * A[5] = 9 + umull r2, r3, r12, r7 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #36] + mov r8, r11 + # A[5] * A[5] = 10 + umull r2, r3, r7, r7 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[4] * A[6] = 10 + umull r2, r3, r12, r8 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[7] = 10 + ldr r11, [r1, #28] + umull r2, r3, r10, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #40] + mov r9, r11 + # A[4] * A[7] = 11 + umull r2, r3, r12, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[5] * A[6] = 11 + umull r2, r3, r7, r8 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #44] + # A[6] * A[6] = 12 + umull r2, r3, r8, r8 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[5] * A[7] = 12 + umull r2, r3, r7, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * A[7] = 13 + umull r2, r3, r8, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * A[7] = 14 + umull r2, r3, r9, r9 + adds r6, r6, r2 + adc r4, r4, r3 + str r6, [sp, #56] + str r4, [sp, #60] + # Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r2, r11, #31 + and r11, r11, #0x7fffffff + mov r12, #19 + ldr r1, [sp, #32] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + adds r4, r4, r2 + mov lr, #0 + adcs r5, r5, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #36] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r5, r5, r2 + mov lr, #0 + adcs r6, r6, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #40] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r6, r6, r2 + mov lr, #0 + adcs r7, r7, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #44] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r7, r7, r2 + mov lr, #0 + adcs r8, r8, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, 
[sp, #48] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r8, r8, r2 + mov lr, #0 + adcs r9, r9, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #52] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r9, r9, r2 + mov lr, #0 + adcs r10, r10, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #56] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r10, r10, r2 + mov lr, #0 + adcs r11, r11, r3 + adc lr, lr, #0 + lsr r2, r1, #31 + ldr r1, [sp, #60] + orr r2, r2, r1, lsl #1 + umull r2, r3, r12, r2 + adds r11, r11, r2 + adc r2, r3, lr + # Overflow + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if top bit set + asr r2, r11, #31 + and r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_mul121666 + .type fe_mul121666, %function +fe_mul121666: + push {r4, r5, r6, r7, r8, r9, r10, lr} + # Multiply by 121666 + ldrd r2, r3, [r1] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r1, #16] + ldrd r8, r9, [r1, #24] + movw lr, #0xdb42 + movt lr, #1 + umull r2, r10, r2, lr + umull r3, r12, r3, lr + adds r3, r3, r10 + adc r10, r12, #0 + umull r4, r12, r4, lr + adds r4, r4, r10 + adc r10, r12, #0 + umull r5, r12, r5, lr + adds r5, r5, r10 + adc r10, r12, #0 + umull r6, r12, r6, lr + adds r6, r6, r10 + adc r10, r12, #0 + umull r7, r12, r7, lr + adds r7, r7, r10 + adc r10, r12, #0 + umull r8, r12, r8, lr + adds r8, r8, r10 + adc r10, r12, #0 + umull r9, r12, r9, lr + adds r9, r9, r10 + adc r10, r12, #0 + mov lr, #19 + lsl r10, r10, #1 + orr r10, r10, r9, lsr #31 + mul r10, r10, lr + and r9, r9, #0x7fffffff + adds r2, r2, r10 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + strd r2, r3, [r0] + strd r4, r5, [r0, #8] + strd r6, r7, [r0, #16] + strd r8, r9, [r0, #24] + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size fe_mul121666,.-fe_mul121666 + .text + .align 2 + .globl fe_sq2 + .type fe_sq2, %function +fe_sq2: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x40 + # Square * 2 + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r12, [r1, #16] + # A[0] * A[0] = 0 + umull r4, r5, r7, r7 + str r4, [sp] + # A[0] * A[1] = 1 + umull r2, r3, r7, r8 + mov r6, #0 + adds r5, r5, r2 + adc r6, r6, r3 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #4] + # A[1] * A[1] = 2 + umull r2, r3, r8, r8 + adds r6, r6, r2 + adc r4, r4, r3 + # A[0] * A[2] = 2 + umull r2, r3, r7, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #8] + # A[0] * A[3] = 3 + umull r2, r3, r7, r10 + adds r4, r4, r2 + adc r5, r5, r3 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[2] = 3 + umull r2, r3, r8, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #12] + # A[2] * A[2] = 4 + umull r2, r3, r9, r9 + adds 
r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[3] = 4 + umull r2, r3, r8, r10 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[0] * A[4] = 4 + umull r2, r3, r7, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #16] + # A[0] * A[5] = 5 + ldr r11, [r1, #20] + umull r2, r3, r7, r11 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[4] = 5 + umull r2, r3, r8, r12 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[3] = 5 + umull r2, r3, r9, r10 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #20] + # A[3] * A[3] = 6 + umull r2, r3, r10, r10 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[2] * A[4] = 6 + umull r2, r3, r9, r12 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[1] * A[5] = 6 + umull r2, r3, r8, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[0] * A[6] = 6 + ldr r11, [r1, #24] + umull r2, r3, r7, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #24] + # A[0] * A[7] = 7 + ldr r11, [r1, #28] + umull r2, r3, r7, r11 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[1] * A[6] = 7 + ldr r11, [r1, #24] + umull r2, r3, r8, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[2] * A[5] = 7 + ldr r11, [r1, #20] + umull r2, r3, r9, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[4] = 7 + umull r2, r3, r10, r12 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #28] + # A[4] * A[4] = 8 + umull r2, r3, r12, r12 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[3] * A[5] = 8 + umull r2, r3, r10, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[2] * A[6] = 8 + ldr r11, [r1, #24] + umull r2, r3, r9, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[1] * A[7] = 8 + ldr r11, [r1, #28] + umull r2, r3, r8, r11 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #32] + ldr r7, [r1, #20] + # A[2] * A[7] = 9 + umull r2, r3, r9, r11 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[3] * A[6] = 9 + ldr r11, [r1, #24] + umull r2, r3, r10, r11 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[4] * A[5] = 9 + umull r2, r3, r12, r7 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #36] + mov r8, r11 + # A[5] * A[5] = 10 + umull r2, r3, r7, r7 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[4] * A[6] = 10 + umull r2, r3, r12, r8 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + 
adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + # A[3] * A[7] = 10 + ldr r11, [r1, #28] + umull r2, r3, r10, r11 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #40] + mov r9, r11 + # A[4] * A[7] = 11 + umull r2, r3, r12, r9 + adds r6, r6, r2 + mov r5, #0 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + # A[5] * A[6] = 11 + umull r2, r3, r7, r8 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + adds r6, r6, r2 + adcs r4, r4, r3 + adc r5, r5, #0 + str r6, [sp, #44] + # A[6] * A[6] = 12 + umull r2, r3, r8, r8 + adds r4, r4, r2 + mov r6, #0 + adcs r5, r5, r3 + adc r6, r6, #0 + # A[5] * A[7] = 12 + umull r2, r3, r7, r9 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + adds r4, r4, r2 + adcs r5, r5, r3 + adc r6, r6, #0 + str r4, [sp, #48] + # A[6] * A[7] = 13 + umull r2, r3, r8, r9 + adds r5, r5, r2 + mov r4, #0 + adcs r6, r6, r3 + adc r4, r4, #0 + adds r5, r5, r2 + adcs r6, r6, r3 + adc r4, r4, #0 + str r5, [sp, #52] + # A[7] * A[7] = 14 + umull r2, r3, r9, r9 + adds r6, r6, r2 + adc r4, r4, r3 + str r6, [sp, #56] + str r4, [sp, #60] + # Double and Reduce + # Load bottom half + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #8] + ldrd r8, r9, [sp, #16] + ldrd r10, r11, [sp, #24] + lsr r2, r11, #30 + lsl r11, r11, #1 + orr r11, r11, r10, lsr #31 + lsl r10, r10, #1 + orr r10, r10, r9, lsr #31 + lsl r9, r9, #1 + orr r9, r9, r8, lsr #31 + lsl r8, r8, #1 + orr r8, r8, r7, lsr #31 + lsl r7, r7, #1 + orr r7, r7, r6, lsr #31 + lsl r6, r6, #1 + orr r6, r6, r5, lsr #31 + lsl r5, r5, #1 + orr r5, r5, r4, lsr #31 + lsl r4, r4, #1 + and r11, r11, #0x7fffffff + mov r12, #19 + ldr r1, [sp, #32] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + adds r4, r4, r2 + mov lr, #0 + adcs r5, r5, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #36] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r5, r5, r2 + mov lr, #0 + adcs r6, r6, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #40] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r6, r6, r2 + mov lr, #0 + adcs r7, r7, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #44] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r7, r7, r2 + mov lr, #0 + adcs r8, r8, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #48] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r8, r8, r2 + mov lr, #0 + adcs r9, r9, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #52] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r9, r9, r2 + mov lr, #0 + adcs r10, r10, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #56] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + add r3, r3, lr + adds r10, r10, r2 + mov lr, #0 + adcs r11, r11, r3 + adc lr, lr, #0 + lsr r2, r1, #30 + ldr r1, [sp, #60] + orr r2, r2, r1, lsl #2 + umull r2, r3, r12, r2 + adds r11, r11, r2 + adc r2, r3, lr + # Overflow + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Reduce if top bit set + asr r2, r11, #31 + and r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + # Store + strd r4, r5, [r0] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] 
+ strd r10, r11, [r0, #24] + add sp, sp, #0x40 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_sq2,.-fe_sq2 + .text + .align 2 + .globl fe_invert + .type fe_invert, %function +fe_invert: + push {r4, lr} + sub sp, sp, #0x88 + # Invert + str r0, [sp, #128] + str r1, [sp, #132] + mov r0, sp + ldr r1, [sp, #132] + bl fe_sq + add r0, sp, #32 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + add r0, sp, #32 + ldr r1, [sp, #132] + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + add r2, sp, #32 + bl fe_mul + add r0, sp, #0x40 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, #32 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + mov r4, #4 +L_fe_invert1: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert1 + add r0, sp, #32 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + mov r4, #9 +L_fe_invert2: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert2 + add r0, sp, #0x40 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #0x40 + bl fe_sq + mov r4, #19 +L_fe_invert3: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert3 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + mov r4, #10 +L_fe_invert4: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert4 + add r0, sp, #32 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + mov r4, #49 +L_fe_invert5: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert5 + add r0, sp, #0x40 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #0x40 + bl fe_sq + mov r4, #0x63 +L_fe_invert6: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert6 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + mov r4, #50 +L_fe_invert7: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert7 + add r0, sp, #32 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + mov r4, #5 +L_fe_invert8: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_invert8 + ldr r0, [sp, #128] + add r1, sp, #32 + mov r2, sp + bl fe_mul + ldr r1, [sp, #132] + ldr r0, [sp, #128] + add sp, sp, #0x88 + pop {r4, pc} + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function +curve25519: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0xbc + str r0, [sp, #160] + str r1, [sp, #164] + str r2, [sp, #168] + mov r1, #0 + str r1, [sp, #172] + # Set one + mov r11, #1 + mov r10, #0 + str r11, [r0] + str r10, [r0, #4] + str r10, [r0, #8] + str r10, [r0, #12] + str r10, [r0, #16] + str r10, [r0, #20] + str r10, [r0, #24] + str r10, [r0, #28] + # Set zero + mov r10, #0 + str r10, [sp] + str r10, [sp, #4] + str r10, [sp, #8] + str r10, [sp, #12] + str r10, [sp, #16] + str r10, [sp, #20] + str r10, [sp, #24] + str r10, [sp, #28] + # Set one + mov r11, #1 + mov r10, #0 + str r11, [sp, #32] + str r10, [sp, #36] + str r10, [sp, #40] + str r10, [sp, #44] + str r10, [sp, #48] + str r10, [sp, #52] + str r10, [sp, #56] + str r10, [sp, #60] + # Copy + ldrd r4, r5, [r2] + ldrd r6, r7, [r2, #8] + strd r4, r5, [sp, #64] + strd r6, r7, [sp, #72] + ldrd r4, r5, [r2, 
#16] + ldrd r6, r7, [r2, #24] + strd r4, r5, [sp, #80] + strd r6, r7, [sp, #88] + mov r1, #30 + str r1, [sp, #180] + mov r2, #28 + str r2, [sp, #176] +L_curve25519_words: +L_curve25519_bits: + ldr r1, [sp, #164] + ldr r2, [r1, r2] + ldr r1, [sp, #180] + lsr r2, r2, r1 + and r2, r2, #1 + str r2, [sp, #184] + ldr r1, [sp, #172] + eor r1, r1, r2 + str r1, [sp, #172] + ldr r0, [sp, #160] + # Conditional Swap + neg r1, r1 + ldrd r4, r5, [r0] + ldrd r6, r7, [sp, #64] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0] + strd r6, r7, [sp, #64] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #72] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #8] + strd r6, r7, [sp, #72] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [sp, #80] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #16] + strd r6, r7, [sp, #80] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #88] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [r0, #24] + strd r6, r7, [sp, #88] + ldr r1, [sp, #172] + # Conditional Swap + neg r1, r1 + ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #32] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp] + strd r6, r7, [sp, #32] + ldrd r4, r5, [sp, #8] + ldrd r6, r7, [sp, #40] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #8] + strd r6, r7, [sp, #40] + ldrd r4, r5, [sp, #16] + ldrd r6, r7, [sp, #48] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #16] + strd r6, r7, [sp, #48] + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #56] + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 + strd r4, r5, [sp, #24] + strd r6, r7, [sp, #56] + ldr r1, [sp, #184] + str r1, [sp, #172] + # Add-Sub + # Add + ldrd r4, r5, [r0] + ldrd r6, r7, [sp] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #128] + # Add + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #8] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0, #8] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #136] + # Add + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [sp, #16] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [r0, #16] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #144] + # Add + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #24] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [r0] + 
subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [r0] + ldrd r4, r5, [r0, #8] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [r0, #8] + ldrd r4, r5, [r0, #16] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [r0, #16] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [r0, #24] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #128] + adds r4, r4, r3 + adcs r5, r5, r2 + strd r4, r5, [sp, #128] + ldrd r4, r5, [sp, #136] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #136] + ldrd r4, r5, [sp, #144] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #144] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #152] + # Add-Sub + # Add + ldrd r4, r5, [sp, #64] + ldrd r6, r7, [sp, #32] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #96] + # Add + ldrd r4, r5, [sp, #72] + ldrd r6, r7, [sp, #40] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #8] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #104] + # Add + ldrd r4, r5, [sp, #80] + ldrd r6, r7, [sp, #48] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #16] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #112] + # Add + ldrd r4, r5, [sp, #88] + ldrd r6, r7, [sp, #56] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp] + subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [sp] + ldrd r4, r5, [sp, #8] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #8] + ldrd r4, r5, [sp, #16] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #16] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [sp, #24] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #96] + adds r4, r4, r3 + adcs r5, r5, r2 + strd r4, r5, [sp, #96] + ldrd r4, r5, [sp, #104] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #104] + ldrd r4, r5, [sp, #112] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #112] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #120] + ldr r2, [sp, #160] + add r1, sp, #0x60 + add r0, sp, #32 + bl fe_mul + add r2, sp, #0x80 + add r1, sp, #0 + add r0, sp, #0 + bl fe_mul + add r1, sp, #0x80 + add r0, sp, #0x60 + bl fe_sq + ldr r1, [sp, #160] + add r0, sp, #0x80 + bl fe_sq + # Add-Sub + # Add + ldrd r4, r5, [sp, #32] + ldrd r6, r7, [sp] + adds r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #64] + # Sub + subs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp] + # Add + ldrd r4, r5, [sp, #40] + ldrd r6, r7, [sp, #8] + adds r3, r3, #-1 + adcs r8, r4, r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #72] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #8] + # Add + ldrd r4, r5, [sp, #48] + ldrd r6, r7, [sp, #16] + adds r3, r3, #-1 + adcs r8, r4, 
r6 + mov r3, #0 + adcs r9, r5, r7 + adc r3, r3, #0 + strd r8, r9, [sp, #80] + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + mov r12, #0 + sbcs r11, r5, r7 + adc r12, r12, #0 + strd r10, r11, [sp, #16] + # Add + ldrd r4, r5, [sp, #56] + ldrd r6, r7, [sp, #24] + adds r3, r3, #-1 + adcs r8, r4, r6 + adc r9, r5, r7 + # Sub + adds r12, r12, #-1 + sbcs r10, r4, r6 + sbc r11, r5, r7 + mov r3, #-19 + asr r2, r9, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp, #64] + subs r4, r4, r3 + sbcs r5, r5, r2 + strd r4, r5, [sp, #64] + ldrd r4, r5, [sp, #72] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #72] + ldrd r4, r5, [sp, #80] + sbcs r4, r4, r2 + sbcs r5, r5, r2 + strd r4, r5, [sp, #80] + sbcs r8, r8, r2 + sbc r9, r9, r12 + strd r8, r9, [sp, #88] + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp] + adds r4, r4, r3 + adcs r5, r5, r2 + strd r4, r5, [sp] + ldrd r4, r5, [sp, #8] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #8] + ldrd r4, r5, [sp, #16] + adcs r4, r4, r2 + adcs r5, r5, r2 + strd r4, r5, [sp, #16] + adcs r10, r10, r2 + adc r11, r11, r12 + strd r10, r11, [sp, #24] + add r2, sp, #0x60 + add r1, sp, #0x80 + ldr r0, [sp, #160] + bl fe_mul + # Sub + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + ldrd r8, r9, [sp, #96] + ldrd r10, r11, [sp, #104] + subs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + sbcs r11, r7, r11 + strd r8, r9, [sp, #128] + strd r10, r11, [sp, #136] + ldrd r4, r5, [sp, #144] + ldrd r6, r7, [sp, #152] + ldrd r8, r9, [sp, #112] + ldrd r10, r11, [sp, #120] + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + sbc r11, r7, r11 + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Add modulus (if underflow) + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + adds r4, r4, r3 + adcs r5, r5, r2 + adcs r6, r6, r2 + adcs r7, r7, r2 + adcs r8, r8, r2 + adcs r9, r9, r2 + adcs r10, r10, r2 + adc r11, r11, r12 + strd r4, r5, [sp, #128] + strd r6, r7, [sp, #136] + strd r8, r9, [sp, #144] + strd r10, r11, [sp, #152] + add r1, sp, #0 + add r0, sp, #0 + bl fe_sq + # Multiply by 121666 + ldrd r4, r5, [sp, #128] + ldrd r6, r7, [sp, #136] + ldrd r8, r9, [sp, #144] + ldrd r10, r11, [sp, #152] + movw r12, #0xdb42 + movt r12, #1 + umull r4, r2, r4, r12 + umull r5, r3, r5, r12 + adds r5, r5, r2 + adc r2, r3, #0 + umull r6, r3, r6, r12 + adds r6, r6, r2 + adc r2, r3, #0 + umull r7, r3, r7, r12 + adds r7, r7, r2 + adc r2, r3, #0 + umull r8, r3, r8, r12 + adds r8, r8, r2 + adc r2, r3, #0 + umull r9, r3, r9, r12 + adds r9, r9, r2 + adc r2, r3, #0 + umull r10, r3, r10, r12 + adds r10, r10, r2 + adc r2, r3, #0 + umull r11, r3, r11, r12 + adds r11, r11, r2 + adc r2, r3, #0 + mov r12, #19 + lsl r2, r2, #1 + orr r2, r2, r11, lsr #31 + mul r2, r2, r12 + and r11, r11, #0x7fffffff + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adc r11, r11, #0 + strd r4, r5, [sp, #32] + strd r6, r7, [sp, #40] + strd r8, r9, [sp, #48] + strd r10, r11, [sp, #56] + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_sq + # Add + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + ldrd r8, r9, [sp, #32] + ldrd r10, r11, [sp, #40] + adds r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + adcs r11, r7, r11 + strd r8, r9, [sp, #96] + strd r10, r11, [sp, #104] + ldrd r4, r5, [sp, #112] + ldrd r6, r7, [sp, #120] 
+ ldrd r8, r9, [sp, #48] + ldrd r10, r11, [sp, #56] + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + adc r11, r7, r11 + mov r3, #-19 + asr r2, r11, #31 + # Mask the modulus + and r3, r2, r3 + and r12, r2, #0x7fffffff + # Sub modulus (if overflow) + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + subs r4, r4, r3 + sbcs r5, r5, r2 + sbcs r6, r6, r2 + sbcs r7, r7, r2 + sbcs r8, r8, r2 + sbcs r9, r9, r2 + sbcs r10, r10, r2 + sbc r11, r11, r12 + strd r4, r5, [sp, #96] + strd r6, r7, [sp, #104] + strd r8, r9, [sp, #112] + strd r10, r11, [sp, #120] + add r2, sp, #0 + ldr r1, [sp, #168] + add r0, sp, #32 + bl fe_mul + add r2, sp, #0x60 + add r1, sp, #0x80 + add r0, sp, #0 + bl fe_mul + ldr r2, [sp, #176] + ldr r1, [sp, #180] + subs r1, r1, #1 + str r1, [sp, #180] + bge L_curve25519_bits + mov r1, #31 + str r1, [sp, #180] + subs r2, r2, #4 + str r2, [sp, #176] + bge L_curve25519_words + # Invert + add r0, sp, #32 + add r1, sp, #0 + bl fe_sq + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + add r0, sp, #0x40 + add r1, sp, #0 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #32 + add r1, sp, #32 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #32 + bl fe_sq + add r0, sp, #0x40 + add r1, sp, #0x40 + add r2, sp, #0x60 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #0x40 + bl fe_sq + mov r4, #4 +L_curve25519_inv_1: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_1 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #0x40 + bl fe_sq + mov r4, #9 +L_curve25519_inv_2: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_2 + add r0, sp, #0x60 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x80 + add r1, sp, #0x60 + bl fe_sq + mov r4, #19 +L_curve25519_inv_3: + add r0, sp, #0x80 + add r1, sp, #0x80 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_3 + add r0, sp, #0x60 + add r1, sp, #0x80 + add r2, sp, #0x60 + bl fe_mul + mov r4, #10 +L_curve25519_inv_4: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_4 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x60 + add r1, sp, #0x40 + bl fe_sq + mov r4, #49 +L_curve25519_inv_5: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_5 + add r0, sp, #0x60 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + add r0, sp, #0x80 + add r1, sp, #0x60 + bl fe_sq + mov r4, #0x63 +L_curve25519_inv_6: + add r0, sp, #0x80 + add r1, sp, #0x80 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_6 + add r0, sp, #0x60 + add r1, sp, #0x80 + add r2, sp, #0x60 + bl fe_mul + mov r4, #50 +L_curve25519_inv_7: + add r0, sp, #0x60 + add r1, sp, #0x60 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_7 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 + bl fe_mul + mov r4, #5 +L_curve25519_inv_8: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_curve25519_inv_8 + add r0, sp, #0 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + add r2, sp, #0 + ldr r1, [sp, #160] + ldr r0, [sp, #160] + bl fe_mul + mov r0, #0 + add sp, sp, #0xbc + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function +fe_pow22523: + push {r4, lr} + 
sub sp, sp, #0x68 + # pow22523 + str r0, [sp, #96] + str r1, [sp, #100] + mov r0, sp + ldr r1, [sp, #100] + bl fe_sq + add r0, sp, #32 + mov r1, sp + bl fe_sq + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + add r0, sp, #32 + ldr r1, [sp, #100] + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + add r2, sp, #32 + bl fe_mul + mov r0, sp + mov r1, sp + bl fe_sq + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #4 +L_fe_pow22523_1: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_1 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #9 +L_fe_pow22523_2: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_2 + add r0, sp, #32 + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + mov r4, #19 +L_fe_pow22523_3: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_3 + add r0, sp, #32 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + mov r4, #10 +L_fe_pow22523_4: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_4 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #32 + mov r1, sp + bl fe_sq + mov r4, #49 +L_fe_pow22523_5: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_5 + add r0, sp, #32 + add r1, sp, #32 + mov r2, sp + bl fe_mul + add r0, sp, #0x40 + add r1, sp, #32 + bl fe_sq + mov r4, #0x63 +L_fe_pow22523_6: + add r0, sp, #0x40 + add r1, sp, #0x40 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_6 + add r0, sp, #32 + add r1, sp, #0x40 + add r2, sp, #32 + bl fe_mul + mov r4, #50 +L_fe_pow22523_7: + add r0, sp, #32 + add r1, sp, #32 + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_7 + mov r0, sp + add r1, sp, #32 + mov r2, sp + bl fe_mul + mov r4, #2 +L_fe_pow22523_8: + mov r0, sp + mov r1, sp + bl fe_sq + sub r4, r4, #1 + cmp r4, #0 + bne L_fe_pow22523_8 + ldr r0, [sp, #96] + mov r1, sp + ldr r2, [sp, #100] + bl fe_mul + ldr r1, [sp, #100] + ldr r0, [sp, #96] + add sp, sp, #0x68 + pop {r4, pc} + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function +fe_ge_to_p2: + push {lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r2, [sp, #28] + ldr r1, [sp, #12] + ldr r0, [sp] + bl fe_mul + ldr r2, [sp, #24] + ldr r1, [sp, #20] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #28] + ldr r1, [sp, #24] + ldr r0, [sp, #8] + bl fe_mul + add sp, sp, #16 + pop {pc} + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function +fe_ge_to_p3: + push {lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r2, [sp, #32] + ldr r1, [sp, #20] + ldr r0, [sp] + bl fe_mul + ldr r2, [sp, #28] + ldr r1, [sp, #24] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #32] + ldr r1, [sp, #28] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #24] + ldr r1, [sp, #20] + ldr r0, [sp, #12] + bl fe_mul + add sp, sp, #16 + pop {pc} + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function +fe_ge_dbl: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #16 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r1, [sp, #52] + ldr r0, [sp] + bl fe_sq + ldr 
r1, [sp, #56] + ldr r0, [sp, #8] + bl fe_sq + ldr r0, [sp, #4] + ldr r1, [sp, #52] + ldr r2, [sp, #56] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r1, [sp, #4] + ldr r0, [sp, #12] + bl fe_sq + ldr r0, [sp, #4] + ldr r1, [sp, #8] + ldr r2, [sp] + # Add-Sub + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r2] + ldr r6, [r2, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r1, #8] + ldr r4, [r1, #12] + ldr r5, [r2, #8] + ldr r6, [r2, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r2, #16] + ldr r6, [r2, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r1, #24] + ldr r4, [r1, #28] + ldr r5, [r2, #24] + ldr r6, [r2, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str 
r10, [r1, #28] + ldr r0, [sp] + ldr r1, [sp, #12] + ldr r2, [sp, #4] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r1, [sp, #60] + ldr r0, [sp, #12] + bl fe_sq2 + ldr r0, [sp, #12] + ldr r1, [sp, #8] + # Sub + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + ldr r7, [r1] + ldr r8, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + add sp, sp, #16 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function +fe_ge_madd: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #32 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, 
#4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #88] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #92] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #80] + ldr r1, [sp, #84] + ldr r0, [sp, #12] + bl fe_mul + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + 
adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #76] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #12] + # Add-Sub + # Add + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r1] + ldr r6, [r1, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r1, #16] + ldr r6, [r1, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #32 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, 
%function +fe_ge_msub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #32 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #72] + ldr r2, [sp, #68] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #92] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #88] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #80] + ldr r1, [sp, #84] + ldr r0, [sp, #12] + bl fe_mul + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov 
lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #76] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #12] + ldr r1, [sp, #8] + # Add-Sub + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r1, #8] + ldr r4, [r1, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r1, #24] + ldr r4, [r1, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus 
(if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #32 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function +fe_ge_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x60 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #156] + ldr r1, [sp] + 
ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #160] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #144] + ldr r1, [sp, #152] + ldr r0, [sp, #12] + bl fe_mul + ldr r2, [sp, #148] + ldr r1, [sp, #140] + ldr r0, [sp] + bl fe_mul + add r0, sp, #16 + ldr r1, [sp] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #8] + ldr r1, [sp, #12] + add r2, sp, #16 + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r1] + ldr r6, [r1, #4] + adds r7, r3, 
r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r1, #16] + ldr r6, [r1, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x60 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function +fe_ge_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x60 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + str r3, [sp, #12] + ldr r0, [sp] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Add + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + adds r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + adc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str 
r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp, #136] + ldr r2, [sp, #132] + # Sub + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r2] + ldr r8, [r2, #4] + ldr r9, [r2, #8] + ldr r10, [r2, #12] + subs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbcs r10, r6, r10 + str r7, [r0] + str r8, [r0, #4] + str r9, [r0, #8] + str r10, [r0, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + ldr r5, [r1, #24] + ldr r6, [r1, #28] + ldr r7, [r2, #16] + ldr r8, [r2, #20] + ldr r9, [r2, #24] + ldr r10, [r2, #28] + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + sbc r10, r6, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r3, r3, r12 + adcs r4, r4, r11 + adcs r5, r5, r11 + adcs r6, r6, r11 + adcs r7, r7, r11 + adcs r8, r8, r11 + adcs r9, r9, r11 + adc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r2, [sp, #160] + ldr r1, [sp] + ldr r0, [sp, #8] + bl fe_mul + ldr r2, [sp, #156] + ldr r1, [sp, #4] + ldr r0, [sp, #4] + bl fe_mul + ldr r2, [sp, #144] + ldr r1, [sp, #152] + ldr r0, [sp, #12] + bl fe_mul + ldr r2, [sp, #148] + ldr r1, [sp, #140] + ldr r0, [sp] + bl fe_mul + add r0, sp, #16 + ldr r1, [sp] + # Double + ldr r3, [r1] + ldr r4, [r1, #4] + ldr r5, [r1, #8] + ldr r6, [r1, #12] + ldr r7, [r1, #16] + ldr r8, [r1, #20] + ldr r9, [r1, #24] + ldr r10, [r1, #28] + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r10, r10 + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + subs r3, r3, r12 + sbcs r4, r4, r11 + sbcs r5, r5, r11 + sbcs r6, r6, r11 + sbcs r7, r7, r11 + sbcs r8, r8, r11 + sbcs r9, r9, r11 + sbc r10, r10, lr + str r3, [r0] + str r4, [r0, #4] + str r5, [r0, #8] + str r6, [r0, #12] + str r7, [r0, #16] + str r8, [r0, #20] + str r9, [r0, #24] + str r10, [r0, #28] + ldr r0, [sp, #4] + ldr r1, [sp] + ldr r2, [sp, #8] + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # 
Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + ldr r0, [sp, #12] + ldr r1, [sp, #8] + add r2, sp, #16 + # Add-Sub + # Add + ldr r3, [r2] + ldr r4, [r2, #4] + ldr r5, [r0] + ldr r6, [r0, #4] + adds r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0] + str r8, [r0, #4] + # Sub + subs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1] + str r10, [r1, #4] + # Add + ldr r3, [r2, #8] + ldr r4, [r2, #12] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #8] + str r8, [r0, #12] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #8] + str r10, [r1, #12] + # Add + ldr r3, [r2, #16] + ldr r4, [r2, #20] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + adds r12, r12, #-1 + adcs r7, r3, r5 + mov r12, #0 + adcs r8, r4, r6 + adc r12, r12, #0 + str r7, [r0, #16] + str r8, [r0, #20] + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + mov lr, #0 + sbcs r10, r4, r6 + adc lr, lr, #0 + str r9, [r1, #16] + str r10, [r1, #20] + # Add + ldr r3, [r2, #24] + ldr r4, [r2, #28] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + adds r12, r12, #-1 + adcs r7, r3, r5 + adc r8, r4, r6 + # Sub + adds lr, lr, #-1 + sbcs r9, r3, r5 + sbc r10, r4, r6 + mov r12, #-19 + asr r11, r8, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Sub modulus (if overflow) + ldr r3, [r0] + ldr r4, [r0, #4] + subs r3, r3, r12 + sbcs r4, r4, r11 + str r3, [r0] + str r4, [r0, #4] + ldr r3, [r0, #8] + ldr r4, [r0, #12] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #8] + str r4, [r0, #12] + ldr r3, [r0, #16] + ldr r4, [r0, #20] + sbcs r3, r3, r11 + sbcs r4, r4, r11 + str r3, [r0, #16] + str r4, [r0, #20] + sbcs r7, r7, r11 + sbc r8, r8, lr + str r7, [r0, #24] + str r8, [r0, #28] + mov r12, #-19 + asr r11, r10, #31 + # Mask the modulus + and r12, r11, r12 + and lr, r11, #0x7fffffff + # Add modulus (if underflow) + ldr r3, [r1] + ldr r4, [r1, #4] + adds r3, r3, r12 + adcs r4, r4, r11 + str r3, [r1] + str r4, [r1, #4] + ldr r3, [r1, #8] + ldr r4, [r1, #12] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #8] + str r4, [r1, #12] + ldr r3, [r1, #16] + ldr r4, [r1, #20] + adcs r3, r3, r11 + adcs r4, r4, r11 + str r3, [r1, #16] + str r4, [r1, #20] + adcs r9, r9, r11 + adc r10, r10, lr + str r9, [r1, #24] + str r10, [r1, #28] + add sp, sp, #0x60 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_ge_sub,.-fe_ge_sub +#endif /* !__aarch64__ 
*/ +#endif /* WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c new file mode 100644 index 0000000..f7ef379 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c @@ -0,0 +1,5581 @@ +/* armv8-32-curve25519 + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c + */ + +#ifndef __aarch64__ + +#include <stdint.h> +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#include <wolfssl/wolfcrypt/fe_operations.h> +#include <stdint.h> + +void fe_init() +{ + __asm__ __volatile__ ( + "\n\t" + : + : + : "memory" + ); +} + +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[in]]\n\t" + "ldrd r12, lr, [%[in], #8]\n\t" + "ldrd r4, r5, [%[in], #16]\n\t" + "ldrd r6, r7, [%[in], #24]\n\t" + "and r7, r7, #0x7fffffff\n\t" + "strd r2, r3, [%[out]]\n\t" + "strd r12, lr, [%[out], #8]\n\t" + "strd r4, r5, [%[out], #16]\n\t" + "strd r6, r7, [%[out], #24]\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[in]]\n\t" + "ldrd r12, lr, [%[in], #8]\n\t" + "ldrd r4, r5, [%[in], #16]\n\t" + "ldrd r6, r7, [%[in], #24]\n\t" + "adds r8, r2, #19\n\t" + "adcs r8, r3, #0\n\t" + "adcs r8, r12, #0\n\t" + "adcs r8, lr, #0\n\t" + "adcs r8, r4, #0\n\t" + "adcs r8, r5, #0\n\t" + "adcs r8, r6, #0\n\t" + "adc r8, r7, #0\n\t" + "asr r8, r8, #31\n\t" + "and r8, r8, #19\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs lr, lr, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + "and r7, r7, #0x7fffffff\n\t" + "strd r2, r3, [%[out]]\n\t" + "strd r12, lr, [%[out], #8]\n\t" + "strd r4, r5, [%[out], #16]\n\t" + "strd r6, r7, [%[out], #24]\n\t" + : [out] "+r" (out), [n] "+r" (n) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + /* Set one */ + "mov r2, #1\n\t" + "mov r1, #0\n\t" + "strd r2, r1, [%[n]]\n\t" + "strd r1, r1, [%[n], #8]\n\t" + "strd r1, r1, [%[n], #16]\n\t" + "strd r1, r1, [%[n], #24]\n\t" + : [n] "+r" (n) + : + : "memory", "r1", "r2" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + /* Set zero */ + "mov r1, #0\n\t" + "strd r1, r1, [%[n]]\n\t" + "strd r1, r1, [%[n], #8]\n\t" + "strd r1, r1, [%[n], #16]\n\t" + "strd r1, r1, [%[n], #24]\n\t" + : [n] "+r" (n) + : + : "memory", 
"r1" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + /* Copy */ + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r12, lr, [%[r], #8]\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "strd r2, r3, [%[r], #16]\n\t" + "strd r12, lr, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + /* Sub */ + "ldrd r12, lr, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b]]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "subs r6, r12, r6\n\t" + "sbcs r7, lr, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "strd r6, r7, [%[r]]\n\t" + "strd r8, r9, [%[r], #8]\n\t" + "ldrd r12, lr, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "sbcs r6, r12, r6\n\t" + "sbcs r7, lr, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbc r9, r5, r9\n\t" + "mov r10, #-19\n\t" + "asr r3, r9, #31\n\t" + /* Mask the modulus */ + "and r10, r3, r10\n\t" + "and r11, r3, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r12, lr, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "adds r12, r12, r10\n\t" + "adcs lr, lr, r3\n\t" + "adcs r4, r4, r3\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r3\n\t" + "adcs r7, r7, r3\n\t" + "adcs r8, r8, r3\n\t" + "adc r9, r9, r11\n\t" + "strd r12, lr, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + /* Add */ + "ldrd r12, lr, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b]]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "adds r6, r12, r6\n\t" + "adcs r7, lr, r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "strd r6, r7, [%[r]]\n\t" + "strd r8, r9, [%[r], #8]\n\t" + "ldrd r12, lr, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "adcs r6, r12, r6\n\t" + "adcs r7, lr, r7\n\t" + "adcs r8, r4, r8\n\t" + "adc r9, r5, r9\n\t" + "mov r10, #-19\n\t" + "asr r3, r9, #31\n\t" + /* Mask the modulus */ + "and r10, r3, r10\n\t" + "and r11, r3, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r12, lr, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "subs r12, r12, r10\n\t" + "sbcs lr, lr, r3\n\t" + "sbcs r4, r4, r3\n\t" + "sbcs r5, r5, r3\n\t" + "sbcs r6, r6, r3\n\t" + "sbcs r7, r7, r3\n\t" + "sbcs r8, r8, r3\n\t" + "sbc r9, r9, r11\n\t" + "strd r12, lr, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "mov r5, #-1\n\t" + "mov r4, #-19\n\t" + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "subs r2, r4, r2\n\t" + "sbcs r3, r5, r3\n\t" + "sbcs r12, r5, r12\n\t" + "sbcs lr, r5, lr\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r12, lr, [%[r], #8]\n\t" + "mov r4, #0x7fffffff\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "sbcs r2, r5, r2\n\t" + "sbcs r3, r5, r3\n\t" + "sbcs r12, r5, r12\n\t" + "sbc lr, r4, lr\n\t" + "strd r2, r3, [%[r], #16]\n\t" + "strd r12, lr, [%[r], 
#24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "ldrd r4, r5, [%[a], #16]\n\t" + "ldrd r6, r7, [%[a], #24]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, lr, #0\n\t" + "adcs r1, r4, #0\n\t" + "adcs r1, r5, #0\n\t" + "adcs r1, r6, #0\n\t" + "adc r1, r7, #0\n\t" + "asr r1, r1, #31\n\t" + "and r1, r1, #19\n\t" + "adds r2, r2, r1\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs lr, lr, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + "and r7, r7, #0x7fffffff\n\t" + "orr r2, r2, r3\n\t" + "orr r12, r12, lr\n\t" + "orr r4, r4, r5\n\t" + "orr r6, r6, r7\n\t" + "orr r12, r12, r4\n\t" + "orr r2, r2, r6\n\t" + "orr %[a], r2, r12\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" + ); + return (uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, lr, #0\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "adcs r1, r2, #0\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "ldr r2, [%[a]]\n\t" + "adc r1, lr, #0\n\t" + "and %[a], r2, #1\n\t" + "lsr r1, r1, #31\n\t" + "eor %[a], %[a], r1\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "lr" + ); + return (uint32_t)(size_t)a; +} + +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "sxtb %[b], %[b]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #1\n\t" + "mov r12, #0\n\t" + "mov lr, #1\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], 
#32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, 
r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-19\n\t" + "mov r9, #-1\n\t" + "subs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r]]\n\t" + "strd lr, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #64]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror 
r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #8]\n\t" + "strd lr, r4, [%[r], #40]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, 
r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, 
r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #16]\n\t" + "strd lr, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #80]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + 
"and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, 
[%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #0x7fffffff\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbc r9, r9, r6\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #24]\n\t" + "strd lr, r4, [%[r], #56]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Multiply */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b]]\n\t" + "ldr lr, [%[b], #4]\n\t" + /* A[0] * B[0] = 0 */ + "umull r4, r5, r7, r9\n\t" + "str r4, [sp]\n\t" + /* A[0] * B[1] = 1 */ + "umull r3, r6, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[0] = 1 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[2] * B[0] = 2 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + /* A[1] * B[1] = 2 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * B[2] = 2 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * B[3] = 3 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[2] = 3 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[1] = 3 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[0] = 3 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[4] * B[0] = 4 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[1] = 4 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[2] = 4 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[3] = 4 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, 
r4, #0\n\t" + /* A[0] * B[4] = 4 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * B[5] = 5 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[4] = 5 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[3] = 5 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[2] = 5 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[1] = 5 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[0] = 5 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[6] * B[0] = 6 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[1] = 6 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[2] = 6 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[3] = 6 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[4] = 6 */ + "ldr r10, [%[a], #8]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[5] = 6 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * B[6] = 6 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * B[7] = 7 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[6] = 7 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[5] = 7 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[4] = 7 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[3] = 7 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[2] = 7 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[1] = 7 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" 
+ "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[0] = 7 */ + "ldr r10, [%[a], #28]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + "ldr r7, [%[a], #24]\n\t" + "ldr r9, [%[b], #24]\n\t" + /* A[7] * B[1] = 8 */ + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[2] = 8 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[3] = 8 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[4] = 8 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[5] = 8 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[6] = 8 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[7] = 8 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r8, [%[a], #28]\n\t" + "mov lr, r11\n\t" + /* A[2] * B[7] = 9 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[6] = 9 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[5] = 9 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[4] = 9 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[3] = 9 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[7] * B[2] = 9 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + /* A[7] * B[3] = 10 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[4] = 10 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[5] = 10 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[6] = 10 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[7] = 10 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + /* A[4] * B[7] = 11 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[6] = 11 */ + "ldr 
r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[5] = 11 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[4] = 11 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[7] * B[5] = 12 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[6] = 12 */ + "umull r3, r12, r7, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[7] = 12 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * B[7] = 13 */ + "umull r3, r12, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[6] = 13 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * B[7] = 14 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r3, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov lr, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r4, r4, r3\n\t" + "mov %[b], #0\n\t" + "adcs r5, r5, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r5, r5, r3\n\t" + "mov %[b], #0\n\t" + "adcs r6, r6, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r6, r6, r3\n\t" + "mov %[b], #0\n\t" + "adcs r7, r7, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r7, r7, r3\n\t" + "mov %[b], #0\n\t" + "adcs r8, r8, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r8, r8, r3\n\t" + "mov %[b], #0\n\t" + "adcs r9, r9, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r9, r9, r3\n\t" + "mov %[b], #0\n\t" + "adcs r10, r10, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r10, r10, r3\n\t" + "mov %[b], #0\n\t" + "adcs r11, r11, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r11, r11, r3\n\t" + "adc r3, r12, %[b]\n\t" + /* Overflow */ + "lsl r3, r3, #1\n\t" + "orr r3, r3, r11, lsr #31\n\t" + "mul r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + 
"adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r3, r11, #31\n\t" + "and r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc 
r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r9, r11\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str 
r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top 
bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[a], #16]\n\t" + "ldrd r8, r9, [%[a], #24]\n\t" + "movw lr, #0xdb42\n\t" + "movt lr, #1\n\t" + "umull r2, r10, r2, lr\n\t" + "umull r3, r12, r3, lr\n\t" + "adds r3, r3, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r4, r12, r4, lr\n\t" + "adds r4, r4, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r5, r12, r5, lr\n\t" + "adds r5, r5, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r6, r12, r6, lr\n\t" + "adds r6, r6, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r7, r12, r7, lr\n\t" + "adds r7, r7, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r8, r12, r8, lr\n\t" + "adds r8, r8, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r9, r12, r9, lr\n\t" + "adds r9, r9, r10\n\t" + "adc r10, r12, #0\n\t" + "mov lr, #19\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "mul r10, r10, lr\n\t" + "and r9, r9, #0x7fffffff\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square * 2 */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, 
r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] 
= 9 */ + "umull r2, r3, r9, r11\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Double and Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #30\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, lsr #31\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "lsl r9, r9, #1\n\t" + "orr r9, r9, r8, lsr #31\n\t" + "lsl r8, r8, #1\n\t" + "orr r8, r8, r7, lsr #31\n\t" + "lsl r7, r7, #1\n\t" + "orr r7, r7, r6, lsr #31\n\t" + "lsl r6, r6, #1\n\t" + "orr r6, r6, r5, lsr #31\n\t" + "lsl r5, r5, #1\n\t" + "orr r5, r5, r4, lsr #31\n\t" + "lsl r4, r4, #1\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, 
#0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x88\n\t" + /* Invert */ + "str %[r], [sp, #128]\n\t" + "str %[a], [sp, #132]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #132]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #132]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_invert1_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert1_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_invert2_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert2_%=\n\t" + 
"add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_invert3_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert3_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_invert4_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert4_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_invert5_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert5_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_invert6_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert6_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert7_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert8_%=\n\t" + "ldr r0, [sp, #128]\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #132]\n\t" + "ldr %[r], [sp, #128]\n\t" + "add sp, sp, #0x88\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xbc\n\t" + "str %[r], [sp, #160]\n\t" + "str %[n], [sp, #164]\n\t" + "str %[a], [sp, #168]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #172]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [%[r]]\n\t" + "strd r10, r10, [%[r], #8]\n\t" + "strd r10, r10, [%[r], #16]\n\t" + "strd r10, r10, [%[r], #24]\n\t" + /* Set zero */ + "mov r10, #0\n\t" + "strd r10, r10, [sp]\n\t" + "strd r10, r10, [sp, #8]\n\t" + "strd r10, r10, [sp, #16]\n\t" + "strd r10, r10, [sp, #24]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [sp, #32]\n\t" + "strd r10, r10, [sp, #40]\n\t" + "strd r10, r10, [sp, #48]\n\t" + "strd r10, r10, [sp, #56]\n\t" + /* Copy */ + "ldrd r4, r5, [%[a]]\n\t" + "ldrd r6, r7, [%[a], #8]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[a], #16]\n\t" + "ldrd r6, r7, [%[a], #24]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "mov %[n], #30\n\t" + "str %[n], [sp, #180]\n\t" + "mov %[a], #28\n\t" + "str %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words_%=: \n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "ldr %[n], [sp, #164]\n\t" + "ldr %[a], [%[n], r2]\n\t" + "ldr %[n], [sp, #180]\n\t" + "lsr %[a], %[a], %[n]\n\t" + "and %[a], %[a], #1\n\t" + "str %[a], [sp, #184]\n\t" + "ldr 
%[n], [sp, #172]\n\t" + "eor %[n], %[n], %[a]\n\t" + "str %[n], [sp, #172]\n\t" + "ldr %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [sp, #64]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #16]\n\t" + "strd r6, r7, [sp, #80]\n\t" + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #24]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "ldr %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp]\n\t" + "strd r6, r7, [sp, #32]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #8]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #16]\n\t" + "strd r6, r7, [sp, #48]\n\t" + "ldrd r4, r5, [sp, #24]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #24]\n\t" + "strd r6, r7, [sp, #56]\n\t" + "ldr %[n], [sp, #184]\n\t" + "str %[n], [sp, #172]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r]]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #128]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #136]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #16]\n\t" 
+ "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #144]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [%[r]]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r], #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [%[r], #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #128]\n\t" + "ldrd r4, r5, [sp, #136]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #144]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #152]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #64]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #96]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #72]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #104]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #80]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #112]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #88]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "sbcs r4, 
r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #96]\n\t" + "ldrd r4, r5, [sp, #104]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #112]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #120]\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #160]\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #32]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #64]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #40]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #72]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #8]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #48]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #80]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #16]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #56]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #64]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "ldrd r4, r5, [sp, #72]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #72]\n\t" + "ldrd r4, r5, [sp, #80]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #88]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #16]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, 
r12\n\t" + "strd r10, r11, [sp, #24]\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + /* Sub */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #96]\n\t" + "ldrd r10, r11, [sp, #104]\n\t" + "subs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #128]\n\t" + "strd r10, r11, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "ldrd r6, r7, [sp, #152]\n\t" + "ldrd r8, r9, [sp, #112]\n\t" + "ldrd r10, r11, [sp, #120]\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, %[a]\n\t" + "adcs r8, r8, %[a]\n\t" + "adcs r9, r9, %[a]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r4, r5, [sp, #128]\n\t" + "strd r6, r7, [sp, #136]\n\t" + "strd r8, r9, [sp, #144]\n\t" + "strd r10, r11, [sp, #152]\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_sq\n\t" + /* Multiply by 121666 */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #144]\n\t" + "ldrd r10, r11, [sp, #152]\n\t" + "movw r12, #0xdb42\n\t" + "movt r12, #1\n\t" + "umull r4, %[a], r4, r12\n\t" + "umull r5, r3, r5, r12\n\t" + "adds r5, r5, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r6, r3, r6, r12\n\t" + "adds r6, r6, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r7, r3, r7, r12\n\t" + "adds r7, r7, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r8, r3, r8, r12\n\t" + "adds r8, r8, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r9, r3, r9, r12\n\t" + "adds r9, r9, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r10, r3, r10, r12\n\t" + "adds r10, r10, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r11, r3, r11, r12\n\t" + "adds r11, r11, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "mov r12, #19\n\t" + "lsl %[a], %[a], #1\n\t" + "orr %[a], %[a], r11, lsr #31\n\t" + "mul %[a], %[a], r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, %[a]\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "strd r4, r5, [sp, #32]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "strd r8, r9, [sp, #48]\n\t" + "strd r10, r11, [sp, #56]\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq\n\t" + /* Add */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "ldrd r8, r9, [sp, #32]\n\t" + "ldrd r10, r11, [sp, #40]\n\t" + "adds r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #96]\n\t" + "strd r10, r11, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "ldrd r8, r9, [sp, #48]\n\t" + "ldrd r10, r11, [sp, #56]\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbcs r7, r7, %[a]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbcs 
r9, r9, %[a]\n\t" + "sbcs r10, r10, %[a]\n\t" + "sbc r11, r11, r12\n\t" + "strd r4, r5, [sp, #96]\n\t" + "strd r6, r7, [sp, #104]\n\t" + "strd r8, r9, [sp, #112]\n\t" + "strd r10, r11, [sp, #120]\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #168]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #176]\n\t" + "ldr %[n], [sp, #180]\n\t" + "subs %[n], %[n], #1\n\t" + "str %[n], [sp, #180]\n\t" + "bge L_curve25519_bits_%=\n\t" + "mov %[n], #31\n\t" + "str %[n], [sp, #180]\n\t" + "subs %[a], %[a], #4\n\t" + "str %[a], [sp, #176]\n\t" + "bge L_curve25519_words_%=\n\t" + /* Invert */ + "add r0, sp, #32\n\t" + "add r1, sp, #0\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" 
+ "bne L_curve25519_inv_7_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add r0, sp, #0\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #160]\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xbc\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x68\n\t" + /* pow22523 */ + "str %[r], [sp, #96]\n\t" + "str %[a], [sp, #100]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #100]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #100]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_pow22523_1_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_1_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_pow22523_2_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_2_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_3_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_pow22523_4_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_4_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_pow22523_5_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_5_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_pow22523_6_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_6_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, 
#0\n\t" + "bne L_fe_pow22523_7_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "mov r4, #2\n\t" + "\n" + "L_fe_pow22523_8_%=: \n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_8_%=\n\t" + "ldr r0, [sp, #96]\n\t" + "mov r1, sp\n\t" + "ldr r2, [sp, #100]\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #100]\n\t" + "ldr %[r], [sp, #96]\n\t" + "add sp, sp, #0x68\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[px], [sp, #12]\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #32]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #92]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r2, [sp, #92]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, 
#16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r2]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r2, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r2, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r2, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r2, [sp, #4]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, 
r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, [sp, #96]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq2\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "ldrd r7, r8, [r1]\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + 
"strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], 
r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #112]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], 
r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" 
+ "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #112]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, 
[r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, 
r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #192]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #196]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #180]\n\t" + "ldr r1, [sp, #188]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + 
"adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + 
"asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, 
r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #196]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #192]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #180]\n\t" + "ldr r1, [sp, #188]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, 
[r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, 
[r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +#endif /* WOLFSSL_ARMASM */ +#endif /* !__aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S new file mode 100644 index 0000000..d2b899c --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -0,0 +1,5335 @@ +/* armv8-32-sha512-asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S + */ + +#ifdef WOLFSSL_ARMASM +#ifndef __aarch64__ +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_len_k, %object + .size L_SHA512_transform_len_k, 640 + .align 3 +L_SHA512_transform_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + 
.word 0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + .word 0x28db77f5 + .word 0x40c72493 + .word 0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + push {r4, r5, r6, r7, r8, r9, r10, lr} + sub sp, sp, #0xc0 + adr r3, L_SHA512_transform_len_k + # Copy digest to add in at end + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + ldrd r8, r9, [r0, #24] + str r12, [sp, #128] + str lr, [sp, #132] + strd r4, r5, [sp, #136] + strd r6, r7, [sp, #144] + strd r8, r9, [sp, #152] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + ldrd r8, r9, [r0, #56] + str r12, [sp, #160] + str lr, [sp, #164] + strd r4, r5, [sp, #168] + strd r6, r7, [sp, #176] + strd r8, r9, [sp, #184] + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load, Reverse and Store W + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r1, #16] + ldrd r8, r9, [r1, #24] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp] + str r12, [sp, #4] + str r5, [sp, #8] + str r4, [sp, #12] + str r7, [sp, #16] + str r6, [sp, #20] + str r9, [sp, #24] + str r8, [sp, #28] + ldr r12, [r1, #32] + ldr lr, [r1, #36] + ldrd r4, r5, [r1, #40] + ldrd r6, r7, [r1, #48] + ldrd r8, r9, [r1, #56] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #32] + str r12, [sp, #36] + str r5, [sp, #40] + str r4, [sp, #44] + str r7, [sp, #48] + str r6, [sp, #52] + str r9, [sp, #56] + str r8, [sp, #60] + ldr r12, [r1, #64] + ldr lr, [r1, #68] + ldrd r4, r5, [r1, #72] + ldrd r6, r7, [r1, #80] + ldrd r8, r9, [r1, #88] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #64] + str r12, [sp, #68] + str r5, [sp, #72] + str r4, [sp, #76] + str r7, [sp, #80] + str r6, [sp, #84] + str r9, [sp, #88] + str r8, [sp, #92] + ldr r12, [r1, #96] + ldr lr, [r1, #100] + ldrd r4, r5, [r1, #104] + ldrd r6, r7, [r1, #112] + ldrd r8, r9, [r1, #120] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #96] + str r12, [sp, #100] + str r5, [sp, #104] + 
str r4, [sp, #108] + str r7, [sp, #112] + str r6, [sp, #116] + str r9, [sp, #120] + str r8, [sp, #124] + # Pre-calc: b ^ c + ldrd r8, r9, [r0, #8] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r8, r8, r12 + eor r9, r9, lr + mov r10, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Calc new W[0] + ldr r12, [sp, #112] + ldr lr, [sp, #116] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp] + ldr lr, [sp, #4] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp] + str lr, [sp, #4] + ldr r12, [sp, #8] + ldr lr, [sp, #12] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp] + ldr lr, [sp, #4] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp] + str lr, [sp, #4] + # Round 1 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, 
#48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #8] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Calc new W[1] + ldr r12, [sp, #120] + ldr lr, [sp, #124] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #8] + ldr lr, [sp, #12] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #8] + str lr, [sp, #12] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #8] + ldr lr, [sp, #12] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #8] + str lr, [sp, #12] + # Round 2 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #16] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 
+ orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Calc new W[2] + ldr r12, [sp] + ldr lr, [sp, #4] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #16] + ldr lr, [sp, #20] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #16] + str lr, [sp, #20] + ldr r12, [sp, #24] + ldr lr, [sp, #28] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #16] + ldr lr, [sp, #20] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #16] + str lr, [sp, #20] + # Round 3 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #24] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Calc new W[3] + ldr r12, [sp, #8] + ldr lr, [sp, #12] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, 
#3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #24] + str lr, [sp, #28] + ldr r12, [sp, #32] + ldr lr, [sp, #36] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #24] + str lr, [sp, #28] + # Round 4 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #32] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Calc new W[4] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #32] + ldr lr, [sp, #36] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #32] + str lr, [sp, #36] + ldr r12, [sp, #40] + ldr lr, [sp, #44] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #32] + 
ldr lr, [sp, #36] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #32] + str lr, [sp, #36] + # Round 5 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #40] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Calc new W[5] + ldr r12, [sp, #24] + ldr lr, [sp, #28] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #40] + ldr lr, [sp, #44] + ldrd r6, r7, [sp, #112] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #40] + str lr, [sp, #44] + ldr r12, [sp, #48] + ldr lr, [sp, #52] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #40] + ldr lr, [sp, #44] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #40] + str lr, [sp, #44] + # Round 6 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, 
r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #48] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Calc new W[6] + ldr r12, [sp, #32] + ldr lr, [sp, #36] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #48] + ldr lr, [sp, #52] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #48] + str lr, [sp, #52] + ldr r12, [sp, #56] + ldr lr, [sp, #60] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #48] + ldr lr, [sp, #52] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #48] + str lr, [sp, #52] + # Round 7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #56] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, 
[r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Calc new W[7] + ldr r12, [sp, #40] + ldr lr, [sp, #44] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #56] + ldr lr, [sp, #60] + ldrd r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #56] + str lr, [sp, #60] + ldr r12, [sp, #64] + ldr lr, [sp, #68] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #56] + ldr lr, [sp, #60] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #56] + str lr, [sp, #60] + # Round 8 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #64] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Calc new W[8] + ldr r12, [sp, #48] + ldr lr, [sp, #52] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, 
[sp, #64] + ldr lr, [sp, #68] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #64] + str lr, [sp, #68] + ldr r12, [sp, #72] + ldr lr, [sp, #76] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #64] + ldr lr, [sp, #68] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #64] + str lr, [sp, #68] + # Round 9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #72] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Calc new W[9] + ldr r12, [sp, #56] + ldr lr, [sp, #60] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #72] + ldr lr, [sp, #76] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #72] + str lr, [sp, #76] + ldr r12, [sp, #80] + ldr lr, [sp, #84] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #72] + ldr lr, [sp, #76] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #72] + str lr, [sp, #76] + # Round 10 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + 
orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #80] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Calc new W[10] + ldr r12, [sp, #64] + ldr lr, [sp, #68] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #80] + ldr lr, [sp, #84] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #80] + str lr, [sp, #84] + ldr r12, [sp, #88] + ldr lr, [sp, #92] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #80] + ldr lr, [sp, #84] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #80] + str lr, [sp, #84] + # Round 11 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, 
[r3, #88] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Calc new W[11] + ldr r12, [sp, #72] + ldr lr, [sp, #76] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #88] + ldr lr, [sp, #92] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #88] + str lr, [sp, #92] + ldr r12, [sp, #96] + ldr lr, [sp, #100] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #88] + ldr lr, [sp, #92] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #88] + str lr, [sp, #92] + # Round 12 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #96] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, 
[r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Calc new W[12] + ldr r12, [sp, #80] + ldr lr, [sp, #84] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #96] + ldr lr, [sp, #100] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #96] + str lr, [sp, #100] + ldr r12, [sp, #104] + ldr lr, [sp, #108] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #96] + ldr lr, [sp, #100] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #96] + str lr, [sp, #100] + # Round 13 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #104] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Calc new W[13] + ldr r12, [sp, #88] + ldr lr, [sp, #92] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #104] + ldr lr, [sp, #108] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc 
lr, lr, r7 + str r12, [sp, #104] + str lr, [sp, #108] + ldr r12, [sp, #112] + ldr lr, [sp, #116] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #104] + ldr lr, [sp, #108] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #104] + str lr, [sp, #108] + # Round 14 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #112] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #112] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Calc new W[14] + ldr r12, [sp, #96] + ldr lr, [sp, #100] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #112] + ldr lr, [sp, #116] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #112] + str lr, [sp, #116] + ldr r12, [sp, #120] + ldr lr, [sp, #124] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #112] + ldr lr, [sp, #116] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #112] + str lr, [sp, #116] + # Round 15 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, 
r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #120] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Calc new W[15] + ldr r12, [sp, #104] + ldr lr, [sp, #108] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #120] + ldr lr, [sp, #124] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #120] + str lr, [sp, #124] + ldr r12, [sp] + ldr lr, [sp, #4] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #120] + ldr lr, [sp, #124] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #120] + str lr, [sp, #124] + add r3, r3, #0x80 + subs r10, r10, #1 + bne L_sha512_len_neon_start + # Round 0 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, 
r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Round 1 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #8] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Round 2 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #16] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, 
[r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Round 3 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #24] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Round 4 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #32] + adds r12, r12, r6 + adc lr, lr, r7 
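(Orientation note, not part of the diff.) The scalar path above is a fully unrolled SHA-512 compression: each "# Round n" block computes one round, holding every 64-bit quantity as a pair of 32-bit registers and carrying across halves with adds/adc; instead of rotating the eight working variables, the unrolled code shifts which offsets of [r0] play each role. A minimal sketch of the same round in portable C, assuming illustrative names (S, Wt, Kt) that do not come from wolfSSL:

#include <stdint.h>

/* rotate right by n bits, n in 1..63 */
static uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

/* One SHA-512 compression round on the eight working variables S[0..7]. */
static void sha512_round(uint64_t S[8], uint64_t Wt, uint64_t Kt)
{
    uint64_t a = S[0], b = S[1], c = S[2], d = S[3];
    uint64_t e = S[4], f = S[5], g = S[6], h = S[7];
    uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);  /* Sigma1 */
    uint64_t ch  = (e & f) ^ (~e & g);                             /* Ch     */
    uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);  /* Sigma0 */
    uint64_t maj = (a & b) ^ (a & c) ^ (b & c);                    /* Maj    */
    uint64_t t1  = h + S1 + ch + Kt + Wt;
    uint64_t t2  = S0 + maj;
    S[7] = g; S[6] = f; S[5] = e; S[4] = d + t1;
    S[3] = c; S[2] = b; S[1] = a; S[0] = t1 + t2;
}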
+ ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Round 5 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #40] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Round 6 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #48] + adds 
r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Round 7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #56] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Round 8 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #64] + adds r12, r12, 
r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Round 9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #72] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Round 10 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, 
[r3, #80] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Round 11 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #88] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Round 12 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, 
r5 + ldrd r4, r5, [r3, #96] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Round 13 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #104] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Round 14 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #112] + 
adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #112] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Round 15 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #120] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Add in digest from start + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #128] + ldrd r8, r9, [sp, #136] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + str r12, [sp, #128] + str lr, [sp, #132] + strd r4, r5, [sp, #136] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #144] + ldrd r8, r9, [sp, #152] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #16] + str lr, [r0, #20] + strd r4, r5, [r0, #24] + str r12, [sp, #144] + str lr, [sp, #148] + strd r4, r5, [sp, #152] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [sp, #160] + ldrd r8, r9, [sp, #168] 
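(Orientation note, not part of the diff.) The "# Calc new W[n]" blocks interleaved with the rounds above maintain the 16-word message-schedule ring buffer kept at [sp]: each one folds sigma1 of W[(n+14) mod 16], W[(n+9) mod 16], and sigma0 of W[(n+1) mod 16] into W[n]. The surrounding "# Add in digest from start" section then adds the working variables back into the running digest saved at [sp, #128] (the Davies-Meyer feed-forward) before the outer loop advances to the next 128-byte block. A minimal C sketch of the in-place schedule update, with illustrative names:

#include <stdint.h>

static uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

/* Update ring-buffer slot i (0..15) of the SHA-512 message schedule:
 * W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]). */
static void sha512_next_w(uint64_t W[16], unsigned i)
{
    uint64_t w15 = W[(i + 1)  & 15];   /* W[t-15] */
    uint64_t w2  = W[(i + 14) & 15];   /* W[t-2]  */
    uint64_t s0  = rotr64(w15, 1)  ^ rotr64(w15, 8)  ^ (w15 >> 7);
    uint64_t s1  = rotr64(w2,  19) ^ rotr64(w2,  61) ^ (w2  >> 6);
    W[i] += s1 + W[(i + 9) & 15] + s0;  /* W[t-16] is already in W[i] */
}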
+ adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #32] + str lr, [r0, #36] + strd r4, r5, [r0, #40] + str r12, [sp, #160] + str lr, [sp, #164] + strd r4, r5, [sp, #168] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [sp, #176] + ldrd r8, r9, [sp, #184] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #48] + str lr, [r0, #52] + strd r4, r5, [r0, #56] + str r12, [sp, #176] + str lr, [sp, #180] + strd r4, r5, [sp, #184] + subs r2, r2, #0x80 + sub r3, r3, #0x200 + add r1, r1, #0x80 + bne L_sha512_len_neon_begin + eor r0, r0, r0 + add sp, sp, #0xc0 + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifndef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_neon_len_k, %object + .size L_SHA512_transform_neon_len_k, 640 + .align 3 +L_SHA512_transform_neon_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + .word 0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 
0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + .word 0x28db77f5 + .word 0x40c72493 + .word 0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + vpush {d8-d15} + # Load digest into working vars + vldm.64 r0, {d0-d7} + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load W + vldm.64 r1!, {d16-d31} + vrev64.8 q8, q8 + vrev64.8 q9, q9 + vrev64.8 q10, q10 + vrev64.8 q11, q11 + vrev64.8 q12, q12 + vrev64.8 q13, q13 + vrev64.8 q14, q14 + vrev64.8 q15, q15 + adr r3, L_SHA512_transform_neon_len_k + mov r12, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d16 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 1 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d17 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Calc new W[0]-W[1] + vext.8 q6, q8, q9, #8 + vshl.u64 q4, q15, #45 + vsri.u64 q4, q15, #19 + vshl.u64 q5, q15, #3 + vsri.u64 q5, q15, #61 + veor q5, q4 + vshr.u64 q4, q15, #6 + veor q5, q4 + vadd.i64 q8, q5 + vext.8 q7, q12, q13, #8 + vadd.i64 q8, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q8, q5 + # Round 2 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d18 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 3 + vld1.64 {d12}, [r3:64]! 
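(Orientation note, not part of the diff.) L_SHA512_transform_neon_len_k above stores the 80 SHA-512 round constants (80 x 8 = 640 bytes, matching the .size directive) with each 64-bit value split into two .word entries, low half first, so the data reads as little-endian doublewords; the NEON loop consumes one constant per round with vld1.64 {d12}, [r3:64]!, and the scalar path reads the same layout with ldrd. For reference, the first pairs reassemble to the standard constants:

#include <stdint.h>

/* First four of the 80 SHA-512 K constants, as 64-bit values;
 * e.g. the pair 0xd728ae22 / 0x428a2f98 above is K[0]. */
static const uint64_t K512_head[4] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
};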
+ vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d19 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Calc new W[2]-W[3] + vext.8 q6, q9, q10, #8 + vshl.u64 q4, q8, #45 + vsri.u64 q4, q8, #19 + vshl.u64 q5, q8, #3 + vsri.u64 q5, q8, #61 + veor q5, q4 + vshr.u64 q4, q8, #6 + veor q5, q4 + vadd.i64 q9, q5 + vext.8 q7, q13, q14, #8 + vadd.i64 q9, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q9, q5 + # Round 4 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d20 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 5 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d21 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Calc new W[4]-W[5] + vext.8 q6, q10, q11, #8 + vshl.u64 q4, q9, #45 + vsri.u64 q4, q9, #19 + vshl.u64 q5, q9, #3 + vsri.u64 q5, q9, #61 + veor q5, q4 + vshr.u64 q4, q9, #6 + veor q5, q4 + vadd.i64 q10, q5 + vext.8 q7, q14, q15, #8 + vadd.i64 q10, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q10, q5 + # Round 6 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d22 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 7 + vld1.64 {d12}, [r3:64]! 
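(Orientation note, not part of the diff.) At the top of Transform_Sha512_Len the 128-byte block is loaded with vldm.64 r1!, {d16-d31} and each 64-bit lane is byte-reversed with vrev64.8, because SHA-512 message words are big-endian while the loads are little-endian. The scalar equivalent of that conversion, sketched in C:

#include <stdint.h>

/* Load one big-endian 64-bit message word; vrev64.8 performs the same
 * byte reversal on eight lanes at a time. */
static uint64_t load_be64(const uint8_t *p)
{
    uint64_t w = 0;
    for (int i = 0; i < 8; i++)
        w = (w << 8) | p[i];
    return w;
}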
+ vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d23 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Calc new W[6]-W[7] + vext.8 q6, q11, q12, #8 + vshl.u64 q4, q10, #45 + vsri.u64 q4, q10, #19 + vshl.u64 q5, q10, #3 + vsri.u64 q5, q10, #61 + veor q5, q4 + vshr.u64 q4, q10, #6 + veor q5, q4 + vadd.i64 q11, q5 + vext.8 q7, q15, q8, #8 + vadd.i64 q11, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q11, q5 + # Round 8 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d24 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 9 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d25 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Calc new W[8]-W[9] + vext.8 q6, q12, q13, #8 + vshl.u64 q4, q11, #45 + vsri.u64 q4, q11, #19 + vshl.u64 q5, q11, #3 + vsri.u64 q5, q11, #61 + veor q5, q4 + vshr.u64 q4, q11, #6 + veor q5, q4 + vadd.i64 q12, q5 + vext.8 q7, q8, q9, #8 + vadd.i64 q12, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q12, q5 + # Round 10 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d26 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 11 + vld1.64 {d12}, [r3:64]! 
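(Orientation note, not part of the diff.) Within each NEON round, a 64-bit rotate right by n is synthesized as vshl by 64-n followed by vsri (shift right and insert) by n into the same destination, and vbsl (bitwise select) computes both Ch and Maj: Ch selects between f and g under mask e, and Maj selects between a and c under mask b^c. The identities, sketched in C with illustrative names:

#include <stdint.h>

/* vshl #(64-n) then vsri #n == rotate right by n (n in 1..63) */
static uint64_t rotr64(uint64_t x, unsigned n) { return (x << (64 - n)) | (x >> n); }

/* vbsl: take bits of a where mask is 1, bits of b where mask is 0 */
static uint64_t bsl64(uint64_t mask, uint64_t a, uint64_t b)
{ return (mask & a) | (~mask & b); }

/* Ch(e,f,g)  == bsl64(e,     f, g)
 * Maj(a,b,c) == bsl64(b ^ c, a, c)  */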
+ vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d27 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Calc new W[10]-W[11] + vext.8 q6, q13, q14, #8 + vshl.u64 q4, q12, #45 + vsri.u64 q4, q12, #19 + vshl.u64 q5, q12, #3 + vsri.u64 q5, q12, #61 + veor q5, q4 + vshr.u64 q4, q12, #6 + veor q5, q4 + vadd.i64 q13, q5 + vext.8 q7, q9, q10, #8 + vadd.i64 q13, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q13, q5 + # Round 12 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d28 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 13 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d29 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Calc new W[12]-W[13] + vext.8 q6, q14, q15, #8 + vshl.u64 q4, q13, #45 + vsri.u64 q4, q13, #19 + vshl.u64 q5, q13, #3 + vsri.u64 q5, q13, #61 + veor q5, q4 + vshr.u64 q4, q13, #6 + veor q5, q4 + vadd.i64 q14, q5 + vext.8 q7, q10, q11, #8 + vadd.i64 q14, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q14, q5 + # Round 14 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d30 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 15 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d31 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Calc new W[14]-W[15] + vext.8 q6, q15, q8, #8 + vshl.u64 q4, q14, #45 + vsri.u64 q4, q14, #19 + vshl.u64 q5, q14, #3 + vsri.u64 q5, q14, #61 + veor q5, q4 + vshr.u64 q4, q14, #6 + veor q5, q4 + vadd.i64 q15, q5 + vext.8 q7, q11, q12, #8 + vadd.i64 q15, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q15, q5 + subs r12, r12, #1 + bne L_sha512_len_neon_start + # Round 0 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d16 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 1 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d17 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Round 2 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d18 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 3 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d19 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Round 4 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d20 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 5 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d21 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Round 6 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d22 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 7 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d23 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Round 8 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d24 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 9 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d25 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Round 10 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d26 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 11 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d27 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Round 12 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d28 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 13 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d29 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Round 14 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d30 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 15 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d31 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Add in digest from start + vldm.64 r0, {d8-d15} + vadd.i64 q0, q0, q4 + vadd.i64 q1, q1, q5 + vadd.i64 q2, q2, q6 + vadd.i64 q3, q3, q7 + vstm.64 r0, {d0-d7} + subs r2, r2, #0x80 + bne L_sha512_len_neon_begin + vpop {d8-d15} + bx lr + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#endif /* !__aarch64__ */ +#endif /* WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c new file mode 100644 index 0000000..c502a39 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c @@ -0,0 +1,4783 @@ +/* armv8-32-sha512-asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c + */ + +#ifndef __aarch64__ +#include <stdint.h> + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#include <wolfssl/wolfcrypt/sha512.h> + +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_SHA512_transform_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xc0\n\t" + "mov r3, %[L_SHA512_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r8, r9, [%[sha512], #24]\n\t" + "strd r12, lr, [sp, #128]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "strd r6, r7, [sp, #144]\n\t" + "strd r8, r9, [sp, #152]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r8, r9, [%[sha512], #56]\n\t" + "strd r12, lr, [sp, #160]\n\t" + "strd r4, r5, [sp, #168]\n\t" + "strd r6, r7, [sp, #176]\n\t" + "strd r8, r9, [sp, #184]\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load, Reverse and Store W */ + "ldrd r12, lr, [%[data]]\n\t" + "ldrd r4, r5, [%[data], #8]\n\t" + "ldrd r6, 
r7, [%[data], #16]\n\t" + "ldrd r8, r9, [%[data], #24]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp]\n\t" + "str r12, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + "str r4, [sp, #12]\n\t" + "str r7, [sp, #16]\n\t" + "str r6, [sp, #20]\n\t" + "str r9, [sp, #24]\n\t" + "str r8, [sp, #28]\n\t" + "ldrd r12, lr, [%[data], #32]\n\t" + "ldrd r4, r5, [%[data], #40]\n\t" + "ldrd r6, r7, [%[data], #48]\n\t" + "ldrd r8, r9, [%[data], #56]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #32]\n\t" + "str r12, [sp, #36]\n\t" + "str r5, [sp, #40]\n\t" + "str r4, [sp, #44]\n\t" + "str r7, [sp, #48]\n\t" + "str r6, [sp, #52]\n\t" + "str r9, [sp, #56]\n\t" + "str r8, [sp, #60]\n\t" + "ldrd r12, lr, [%[data], #64]\n\t" + "ldrd r4, r5, [%[data], #72]\n\t" + "ldrd r6, r7, [%[data], #80]\n\t" + "ldrd r8, r9, [%[data], #88]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #64]\n\t" + "str r12, [sp, #68]\n\t" + "str r5, [sp, #72]\n\t" + "str r4, [sp, #76]\n\t" + "str r7, [sp, #80]\n\t" + "str r6, [sp, #84]\n\t" + "str r9, [sp, #88]\n\t" + "str r8, [sp, #92]\n\t" + "ldrd r12, lr, [%[data], #96]\n\t" + "ldrd r4, r5, [%[data], #104]\n\t" + "ldrd r6, r7, [%[data], #112]\n\t" + "ldrd r8, r9, [%[data], #120]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #96]\n\t" + "str r12, [sp, #100]\n\t" + "str r5, [sp, #104]\n\t" + "str r4, [sp, #108]\n\t" + "str r7, [sp, #112]\n\t" + "str r6, [sp, #116]\n\t" + "str r9, [sp, #120]\n\t" + "str r8, [sp, #124]\n\t" + /* Pre-calc: b ^ c */ + "ldrd r8, r9, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r8, r8, r12\n\t" + "eor r9, r9, lr\n\t" + "mov r10, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, 
r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[0] */ + "ldrd r12, lr, [sp, #112]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp]\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp]\n\t" + /* Round 1 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #8]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, 
lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[1] */ + "ldrd r12, lr, [sp, #120]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #8]\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #8]\n\t" + /* Round 2 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #16]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, 
r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[2] */ + "ldrd r12, lr, [sp]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #16]\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #16]\n\t" + /* Round 3 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #24]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + 
"adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[3] */ + "ldrd r12, lr, [sp, #8]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #24]\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #24]\n\t" + /* Round 4 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #32]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd 
r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[4] */ + "ldrd r12, lr, [sp, #16]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #32]\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #32]\n\t" + /* Round 5 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #40]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + 
"eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[5] */ + "ldrd r12, lr, [sp, #24]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #40]\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #40]\n\t" + /* Round 6 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #48]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + 
"strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[6] */ + "ldrd r12, lr, [sp, #32]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #48]\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #48]\n\t" + /* Round 7 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #56]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[7] */ + "ldrd r12, lr, [sp, #40]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs 
r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #56]\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #56]\n\t" + /* Round 8 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #64]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[8] */ + "ldrd r12, lr, [sp, #48]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr 
r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #64]\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #64]\n\t" + /* Round 9 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #72]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[9] */ + "ldrd r12, lr, [sp, #56]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" 
+ "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #72]\n\t" + "ldrd r12, lr, [sp, #80]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #72]\n\t" + /* Round 10 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #80]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[10] */ + "ldrd r12, lr, [sp, #64]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, 
[sp, #80]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #80]\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #80]\n\t" + /* Round 11 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #88]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[11] */ + "ldrd r12, lr, [sp, #72]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc 
lr, lr, r7\n\t" + "strd r12, lr, [sp, #88]\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #88]\n\t" + /* Round 12 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #96]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[12] */ + "ldrd r12, lr, [sp, #80]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #96]\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" 
+ "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #96]\n\t" + /* Round 13 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #104]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[13] */ + "ldrd r12, lr, [sp, #88]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #104]\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr 
r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #104]\n\t" + /* Round 14 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #112]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[14] */ + "ldrd r12, lr, [sp, #96]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #112]\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, 
#7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #112]\n\t" + /* Round 15 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #120]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[15] */ + "ldrd r12, lr, [sp, #104]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #120]\n\t" + "ldrd r12, lr, [sp]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "adds r12, 
r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #120]\n\t" + "add r3, r3, #0x80\n\t" + "subs r10, r10, #1\n\t" + "bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 1 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #8]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, 
lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 2 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #16]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 3 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, 
r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #24]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 4 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #32]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, 
r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 5 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #40]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 6 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd 
r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #48]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 7 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #56]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" 
+ "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 8 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #64]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 9 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #72]\n\t" + "adds r12, r12, r6\n\t" + "adc 
lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 10 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #80]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 11 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr 
r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #88]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 12 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #96]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls 
r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 13 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #104]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 14 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, 
r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #112]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 15 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #120]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" 
+ "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Add in digest from start */ + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #128]\n\t" + "ldrd r8, r9, [sp, #136]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [sp, #128]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #144]\n\t" + "ldrd r8, r9, [sp, #152]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [sp, #144]\n\t" + "strd r4, r5, [sp, #152]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #160]\n\t" + "ldrd r8, r9, [sp, #168]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [sp, #160]\n\t" + "strd r4, r5, [sp, #168]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #176]\n\t" + "ldrd r8, r9, [sp, #184]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [sp, #176]\n\t" + "strd r4, r5, [sp, #184]\n\t" + "subs %[len], %[len], #0x80\n\t" + "sub r3, r3, #0x200\n\t" + "add %[data], %[data], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + "eor r0, r0, r0\n\t" + "add sp, sp, #0xc0\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#include <wolfssl/wolfcrypt/sha512.h> + +#ifndef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_SHA512_transform_neon_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 
0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + /* Load digest into working vars */ + "vldm.64 %[sha512], {d0-d7}\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load W */ + "vldm.64 %[data]!, {d16-d31}\n\t" + "vrev64.8 q8, q8\n\t" + "vrev64.8 q9, q9\n\t" + "vrev64.8 q10, q10\n\t" + "vrev64.8 q11, q11\n\t" + "vrev64.8 q12, q12\n\t" + "vrev64.8 q13, q13\n\t" + "vrev64.8 q14, q14\n\t" + "vrev64.8 q15, q15\n\t" + "mov r3, %[L_SHA512_transform_neon_len_k]\n\t" + "mov r12, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d16\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 1 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d17\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Calc new W[0]-W[1] */ + "vext.8 q6, q8, q9, #8\n\t" + "vshl.u64 q4, q15, #45\n\t" + "vsri.u64 q4, q15, #19\n\t" + "vshl.u64 q5, q15, #3\n\t" + "vsri.u64 q5, q15, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q15, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q8, q5\n\t" + "vext.8 q7, q12, q13, #8\n\t" + "vadd.i64 q8, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" 
+ "vadd.i64 q8, q5\n\t" + /* Round 2 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d18\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 3 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d19\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Calc new W[2]-W[3] */ + "vext.8 q6, q9, q10, #8\n\t" + "vshl.u64 q4, q8, #45\n\t" + "vsri.u64 q4, q8, #19\n\t" + "vshl.u64 q5, q8, #3\n\t" + "vsri.u64 q5, q8, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q8, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q9, q5\n\t" + "vext.8 q7, q13, q14, #8\n\t" + "vadd.i64 q9, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q9, q5\n\t" + /* Round 4 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d20\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 5 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d21\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" 
+ /* Calc new W[4]-W[5] */ + "vext.8 q6, q10, q11, #8\n\t" + "vshl.u64 q4, q9, #45\n\t" + "vsri.u64 q4, q9, #19\n\t" + "vshl.u64 q5, q9, #3\n\t" + "vsri.u64 q5, q9, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q9, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q10, q5\n\t" + "vext.8 q7, q14, q15, #8\n\t" + "vadd.i64 q10, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q10, q5\n\t" + /* Round 6 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d22\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 7 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d23\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Calc new W[6]-W[7] */ + "vext.8 q6, q11, q12, #8\n\t" + "vshl.u64 q4, q10, #45\n\t" + "vsri.u64 q4, q10, #19\n\t" + "vshl.u64 q5, q10, #3\n\t" + "vsri.u64 q5, q10, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q10, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q11, q5\n\t" + "vext.8 q7, q15, q8, #8\n\t" + "vadd.i64 q11, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q11, q5\n\t" + /* Round 8 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d24\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 9 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, 
d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d25\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Calc new W[8]-W[9] */ + "vext.8 q6, q12, q13, #8\n\t" + "vshl.u64 q4, q11, #45\n\t" + "vsri.u64 q4, q11, #19\n\t" + "vshl.u64 q5, q11, #3\n\t" + "vsri.u64 q5, q11, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q11, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q12, q5\n\t" + "vext.8 q7, q8, q9, #8\n\t" + "vadd.i64 q12, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q12, q5\n\t" + /* Round 10 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d26\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 11 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d27\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Calc new W[10]-W[11] */ + "vext.8 q6, q13, q14, #8\n\t" + "vshl.u64 q4, q12, #45\n\t" + "vsri.u64 q4, q12, #19\n\t" + "vshl.u64 q5, q12, #3\n\t" + "vsri.u64 q5, q12, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q12, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q13, q5\n\t" + "vext.8 q7, q9, q10, #8\n\t" + "vadd.i64 q13, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q13, q5\n\t" + /* Round 12 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + 
"veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d28\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 13 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d29\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Calc new W[12]-W[13] */ + "vext.8 q6, q14, q15, #8\n\t" + "vshl.u64 q4, q13, #45\n\t" + "vsri.u64 q4, q13, #19\n\t" + "vshl.u64 q5, q13, #3\n\t" + "vsri.u64 q5, q13, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q13, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q14, q5\n\t" + "vext.8 q7, q10, q11, #8\n\t" + "vadd.i64 q14, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q14, q5\n\t" + /* Round 14 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d30\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 15 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d31\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Calc new W[14]-W[15] */ + "vext.8 q6, q15, q8, #8\n\t" + "vshl.u64 q4, q14, #45\n\t" + "vsri.u64 q4, q14, #19\n\t" + "vshl.u64 q5, q14, #3\n\t" + "vsri.u64 q5, q14, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q14, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q15, q5\n\t" + "vext.8 q7, q11, q12, #8\n\t" + "vadd.i64 q15, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 
q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q15, q5\n\t" + "subs r12, r12, #1\n\t" + "bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d16\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 1 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d17\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Round 2 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d18\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 3 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d19\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Round 4 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 
d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d20\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 5 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d21\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Round 6 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d22\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 7 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d23\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Round 8 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d24\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 9 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, 
#18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d25\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Round 10 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d26\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 11 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d27\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Round 12 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d28\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 13 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d29\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + 
/* Round 14 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d30\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 15 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d31\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Add in digest from start */ + "vldm.64 %[sha512], {d8-d15}\n\t" + "vadd.i64 q0, q0, q4\n\t" + "vadd.i64 q1, q1, q5\n\t" + "vadd.i64 q2, q2, q6\n\t" + "vadd.i64 q3, q3, q7\n\t" + "vstm.64 %[sha512], {d0-d7}\n\t" + "subs %[len], %[len], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k), [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k) + : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_ARMASM */ +#endif /* !__aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c new file mode 100644 index 0000000..d0f8a9c --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c @@ -0,0 +1,4653 @@ +/* armv8-aes.c + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +/* + * There are two versions one for 64 (Aarch64) and one for 32 bit (Aarch32). + * If changing one check the other. 
+ */ + + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#if !defined(NO_AES) && defined(WOLFSSL_ARMASM) + +#include <wolfssl/wolfcrypt/aes.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + +#ifdef _MSC_VER + /* 4127 warning constant while(1) */ + #pragma warning(disable: 4127) +#endif + + +static const byte rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,0x1B, 0x36 + /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ +}; + +/* get table value from hardware */ +#ifdef __aarch64__ + #define SBOX(x) \ + do { \ + __asm__ volatile ( \ + "DUP v1.4s, %w[in] \n" \ + "MOVI v0.16b, #0 \n" \ + "AESE v0.16b, v1.16b \n" \ + "UMOV %w[out], v0.s[0] \n" \ + : [out] "=r"((x)) \ + : [in] "r" ((x)) \ + : "cc", "memory", "v0", "v1"\ + ); \ + } while(0) + + #define IMIX(x) \ + do { \ + __asm__ volatile ( \ + "LD1 {v0.16b}, [%[in]] \n" \ + "AESIMC v0.16b, v0.16b \n" \ + "ST1 {v0.16b}, [%[out]]\n" \ + : [out] "=r" ((x)) \ + : [in] "0" ((x)) \ + : "cc", "memory", "v0" \ + ); \ + } while(0) +#else /* if not defined __aarch64__ then use 32 bit version */ + #define SBOX(x) \ + do { \ + __asm__ volatile ( \ + "VDUP.32 q1, %[in] \n" \ + "VMOV.i32 q0, #0 \n" \ + "AESE.8 q0, q1 \n" \ + "VMOV.32 %[out], d0[0] \n" \ + : [out] "=r"((x)) \ + : [in] "r" ((x)) \ + : "cc", "memory", "q0", "q1"\ + ); \ + } while(0) + + #define IMIX(x) \ + do { \ + __asm__ volatile ( \ + "VLD1.32 {q0}, [%[in]] \n" \ + "AESIMC.8 q0, q0 \n" \ + "VST1.32 {q0}, [%[out]] \n" \ + : [out] "=r" ((x)) \ + : [in] "0" ((x)) \ + : "cc", "memory", "q0" \ + ); \ + } while(0) +#endif /* aarch64 */ + + +#ifdef HAVE_AESGCM + +static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) +{ + int i; + + /* in network byte order so start at end and work back */ + for (i = AES_BLOCK_SIZE - 1; i >= AES_BLOCK_SIZE - CTR_SZ; i--) { + if (++inOutCtr[i]) /* we're done unless we overflow */ + return; + } +} + + +static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) +{ + /* Multiply the sz by 8 */ + word32 szHi = (sz >> (8*sizeof(sz) - 3)); + sz <<= 3; + + /* copy over the words of the sz into the destination buffer */ + buf[0] = (szHi >> 24) & 0xff; + buf[1] = (szHi >> 16) & 0xff; + buf[2] = (szHi >> 8) & 0xff; + buf[3] = szHi & 0xff; + buf[4] = (sz >> 24) & 0xff; + buf[5] = (sz >> 16) & 0xff; + buf[6] = (sz >> 8) & 0xff; + buf[7] = sz & 0xff; +} + +#endif /* HAVE_AESGCM */ + +/* Similar to wolfSSL software implementation of expanding the AES key. + * Changed out the locations of where table look ups where made to + * use hardware instruction. Also altered decryption key to match. 
*/ +int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, + const byte* iv, int dir) +{ + word32 temp; + word32 *rk; + unsigned int i = 0; + +#if defined(AES_MAX_KEY_SIZE) + const word32 max_key_len = (AES_MAX_KEY_SIZE / 8); +#endif + + if (!((keylen == 16) || (keylen == 24) || (keylen == 32)) || + aes == NULL || userKey == NULL) + return BAD_FUNC_ARG; + + rk = aes->key; +#if defined(AES_MAX_KEY_SIZE) + /* Check key length */ + if (keylen > max_key_len) { + return BAD_FUNC_ARG; + } +#endif + + #ifdef WOLFSSL_AES_COUNTER + aes->left = 0; + #endif /* WOLFSSL_AES_COUNTER */ + + aes->rounds = keylen/4 + 6; + XMEMCPY(rk, userKey, keylen); + + switch(keylen) + { +#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \ + defined(WOLFSSL_AES_128) + case 16: + while (1) + { + temp = rk[3]; + SBOX(temp); + temp = rotrFixed(temp, 8); + rk[4] = rk[0] ^ temp ^ rcon[i]; + rk[5] = rk[4] ^ rk[1]; + rk[6] = rk[5] ^ rk[2]; + rk[7] = rk[6] ^ rk[3]; + if (++i == 10) + break; + rk += 4; + } + break; +#endif /* 128 */ + +#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192 && \ + defined(WOLFSSL_AES_192) + case 24: + /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */ + while (1) + { + temp = rk[5]; + SBOX(temp); + temp = rotrFixed(temp, 8); + rk[ 6] = rk[ 0] ^ temp ^ rcon[i]; + rk[ 7] = rk[ 1] ^ rk[ 6]; + rk[ 8] = rk[ 2] ^ rk[ 7]; + rk[ 9] = rk[ 3] ^ rk[ 8]; + if (++i == 8) + break; + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + rk += 6; + } + break; +#endif /* 192 */ + +#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \ + defined(WOLFSSL_AES_256) + case 32: + while (1) + { + temp = rk[7]; + SBOX(temp); + temp = rotrFixed(temp, 8); + rk[8] = rk[0] ^ temp ^ rcon[i]; + rk[ 9] = rk[ 1] ^ rk[ 8]; + rk[10] = rk[ 2] ^ rk[ 9]; + rk[11] = rk[ 3] ^ rk[10]; + if (++i == 7) + break; + temp = rk[11]; + SBOX(temp); + rk[12] = rk[ 4] ^ temp; + rk[13] = rk[ 5] ^ rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + + rk += 8; + } + break; +#endif /* 256 */ + + default: + return BAD_FUNC_ARG; + } + + if (dir == AES_DECRYPTION) + { +#ifdef HAVE_AES_DECRYPT + unsigned int j; + rk = aes->key; + + /* invert the order of the round keys: */ + for (i = 0, j = 4* aes->rounds; i < j; i += 4, j -= 4) { + temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; + temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; + temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; + temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; + } + /* apply the inverse MixColumn transform to all round keys but the + first and the last: */ + for (i = 1; i < aes->rounds; i++) { + rk += 4; + IMIX(rk); + } +#else + WOLFSSL_MSG("AES Decryption not compiled in"); + return BAD_FUNC_ARG; +#endif /* HAVE_AES_DECRYPT */ + } + + return wc_AesSetIV(aes, iv); +} + +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen, + const byte* iv, int dir) + { + return wc_AesSetKey(aes, userKey, keylen, iv, dir); + } +#endif + +/* wc_AesSetIV is shared between software and hardware */ +int wc_AesSetIV(Aes* aes, const byte* iv) +{ + if (aes == NULL) + return BAD_FUNC_ARG; + + if (iv) + XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE); + else + XMEMSET(aes->reg, 0, AES_BLOCK_SIZE); + + return 0; +} + + +#ifdef __aarch64__ +/* AES CCM/GCM use encrypt direct but not decrypt */ +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + static int wc_AesEncrypt(Aes* aes, const byte* inBlock, 
byte* outBlock) + { + word32* keyPt = aes->key; + + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ + + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[CtrIn]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[R] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[Key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.16b}, [%[CtrOut]] \n" + + :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), + "=r" (inBlock) + :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds), + [CtrIn] "3" (inBlock) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); + + return 0; + } +#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + #ifdef HAVE_AES_DECRYPT + static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) + { + word32* keyPt = aes->key; + + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ + + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[CtrIn]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[R] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[Key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.4s}, [%[CtrOut]] \n" + + :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), + "=r" (inBlock) + :[Key] "1" (aes->key), 
"0" (outBlock), [R] "2" (aes->rounds), + [CtrIn] "3" (inBlock) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); + + return 0; +} + #endif /* HAVE_AES_DECRYPT */ +#endif /* DIRECT or COUNTER */ + +/* AES-CBC */ +#ifdef HAVE_AES_CBC + int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + word32 numBlocks = sz / AES_BLOCK_SIZE; + + if (aes == NULL || out == NULL || (in == NULL && sz > 0)) { + return BAD_FUNC_ARG; + } + + /* do as many block size ops as possible */ + if (numBlocks > 0) { + word32* key = aes->key; + word32* reg = aes->reg; + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + + note: grouping AESE & AESMC together as pairs reduces latency + */ + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" + + "LD1 {v12.2d}, [%[input]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current aes->reg \n" + "EOR v0.16b, v0.16b, v12.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "SUB w11, w11, #1 \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" + "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" + "LD1 {v9.2d-v12.2d},%[Key], #64 \n" + "LD1 {v13.2d}, %[Key], #16 \n" + "LD1 {v0.2d}, %[reg] \n" + + "LD1 {v14.2d}, [%[input]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current aes->reg \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v14.2d}, [%[input]], #16\n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, %[regOut] \n" + 
+ + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + break; +#endif /* WOLFSSL_AES_192*/ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" + + "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" + "LD1 {v9.2d-v12.2d}, %[Key], #64 \n" + "LD1 {v13.2d-v15.2d}, %[Key], #48 \n" + "LD1 {v0.2d}, %[reg] \n" + + "LD1 {v16.2d}, [%[input]], #16 \n" + "1: \n" + "#CBC operations, xorbuf in with current aes->reg \n" + "EOR v0.16b, v0.16b, v16.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v16.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2: \n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, %[regOut] \n" + + + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CBC round value"); + return BAD_FUNC_ARG; + } + } + + return 0; + } + + #ifdef HAVE_AES_DECRYPT + int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + word32 numBlocks = sz / AES_BLOCK_SIZE; + + if (aes == NULL || out == NULL || (in == NULL && sz > 0) + || sz % AES_BLOCK_SIZE != 0) { + return BAD_FUNC_ARG; + } + + /* do as many block size ops as possible */ + if (numBlocks > 0) { + word32* key = aes->key; + word32* reg = aes->reg; + + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" + "LD1 {v13.2d}, [%[reg]] \n" + + "1:\n" + "LD1 {v0.2d}, [%[input]], #16 \n" + "MOV v12.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + + "EOR v0.16b, v0.16b, v13.16b 
\n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v13.16b, v12.16b \n" + + "CBZ w11, 2f \n" + "B 1b \n" + + "2: \n" + "#store current counter value at the end \n" + "ST1 {v13.2d}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[Key]], #64 \n" + "LD1 {v13.16b}, [%[Key]], #16 \n" + "LD1 {v15.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[input]], #16 \n" + "1: \n" + "MOV v14.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v14.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v15.2d}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" + "LD1 {v17.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[input]], #16 \n" + "1: \n" + "MOV v16.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v13.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v17.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v17.16b, v16.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2:\n" + 
"#store current counter value at the end \n" + "ST1 {v17.2d}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16", "v17" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CBC round value"); + return BAD_FUNC_ARG; + } + } + + return 0; + } + #endif + +#endif /* HAVE_AES_CBC */ + +/* AES-CTR */ +#ifdef WOLFSSL_AES_COUNTER + + /* Increment AES counter */ + static WC_INLINE void IncrementAesCounter(byte* inOutCtr) + { + int i; + + /* in network byte order so start at end and work back */ + for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) { + if (++inOutCtr[i]) /* we're done unless we overflow */ + return; + } + } + + int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + byte* tmp; + word32 numBlocks; + + if (aes == NULL || out == NULL || in == NULL) { + return BAD_FUNC_ARG; + } + + tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; + + /* consume any unused bytes left in aes->tmp */ + while (aes->left && sz) { + *(out++) = *(in++) ^ *(tmp++); + aes->left--; + sz--; + } + + /* do as many block size ops as possible */ + numBlocks = sz/AES_BLOCK_SIZE; + if (numBlocks > 0) { + /* pointer needed because it is incremented when read, causing + * an issue with call to encrypt/decrypt leftovers */ + byte* keyPt = (byte*)aes->key; + sz -= numBlocks * AES_BLOCK_SIZE; + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v15.16b, #1 \n" + "USHR v15.2d, v15.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "EXT v14.16b, v15.16b, v14.16b, #8\n" + + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v13.2d}, %[reg] \n" + + /* double block */ + "1: \n" + "CMP w11, #1 \n" + "BEQ 2f \n" + "CMP w11, #0 \n" + "BEQ 3f \n" + + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "SUB w11, w11, #2 \n" + "ADD v15.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "ADD v13.2d, v15.2d, v14.2d \n" /* add 1 to counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* revert from network order */ + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v1.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v2.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v3.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v4.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v5.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v10.16b \n" + "AESE v15.16b, v6.16b \n" + "AESMC v15.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v11.16b \n" + "AESE v15.16b, v7.16b \n" + "AESMC v15.16b, 
v15.16b \n" + + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v15.16b, v8.16b \n" + "AESMC v15.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "AESE v15.16b, v9.16b \n" + "AESMC v15.16b, v15.16b \n" + + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v15.16b, v10.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v15.16b, v15.16b, v11.16b \n" + "EOR v15.16b, v15.16b, v12.16b \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "B 1b \n" + + /* single block */ + "2: \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "#CTR operations, increment counter and xorbuf \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "3: \n" + "#store current counter value at the end \n" + "ST1 {v13.2d}, %[regOut] \n" + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v16.16b, #1 \n" + "USHR v16.2d, v16.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "EXT v16.16b, v16.16b, v14.16b, #8\n" + + "LD1 {v9.2d-v12.2d}, [%[Key]], #64\n" + "LD1 {v15.2d}, %[reg] \n" + "LD1 {v13.16b}, [%[Key]], #16 \n" + + /* double block */ + "1: \n" + "CMP w11, #1 \n" + "BEQ 2f \n" + "CMP w11, #0 \n" + "BEQ 3f \n" + + "MOV v0.16b, v15.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "SUB w11, w11, #2 \n" + "ADD v17.2d, v15.2d, v16.2d \n" /* add 1 to counter */ + "ADD v15.2d, v17.2d, v16.2d \n" /* add 1 to counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + "REV64 v15.16b, v15.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v1.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v2.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v3.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC 
v0.16b, v0.16b \n" + "AESE v17.16b, v4.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v5.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v6.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v7.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v12.16b \n" + "AESE v17.16b, v8.16b \n" + "AESMC v17.16b, v17.16b \n" + + "EOR v0.16b, v0.16b, v13.16b \n" + "AESE v17.16b, v9.16b \n" + "AESMC v17.16b, v17.16b \n" + + "LD1 {v14.2d}, [%[input]], #16 \n" + "AESE v17.16b, v10.16b \n" + "AESMC v17.16b, v17.16b \n" + + "EOR v0.16b, v0.16b, v14.16b \n" + "AESE v17.16b, v11.16b \n" + "AESMC v17.16b, v17.16b \n" + + "LD1 {v14.2d}, [%[input]], #16 \n" + "AESE v17.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v17.16b, v17.16b, v13.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "ST1 {v17.2d}, [%[out]], #16 \n" + + "B 1b \n" + + "2: \n" + "LD1 {v14.2d}, [%[input]], #16 \n" + "MOV v0.16b, v15.16b \n" + + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "ADD v15.2d, v15.2d, v16.2d \n" /* add 1 to counter */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* revert from network order */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + "#CTR operations, increment counter and xorbuf \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "3: \n" + "#store current counter value at the end \n" + "ST1 {v15.2d}, %[regOut] \n" + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16", "v17" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v18.16b, #1 \n" + "USHR v18.2d, v18.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v19.16b, v19.16b, v19.16b \n" + "EXT v18.16b, v18.16b, v19.16b, #8\n" + + "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" + "LD1 {v17.2d}, %[reg] \n" + + /* double block */ + "1: \n" + "CMP w11, #1 \n" + "BEQ 2f \n" + "CMP w11, #0 \n" + "BEQ 3f \n" + + "MOV v0.16b, v17.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "SUB w11, w11, #2 \n" + "ADD v19.2d, v17.2d, v18.2d \n" /* add 1 to counter */ + "ADD v17.2d, v19.2d, v18.2d \n" /* add 1 to 
counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v19.16b, v19.16b \n" /* revert from network order */ + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v1.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v2.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v3.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v4.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v5.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v6.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v7.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v8.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v9.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v14.16b \n" + "AESE v19.16b, v10.16b \n" + "AESMC v19.16b, v19.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v19.16b, v11.16b \n" + "AESMC v19.16b, v19.16b \n" + + "LD1 {v16.2d}, [%[input]], #16 \n" + "AESE v19.16b, v12.16b \n" + "AESMC v19.16b, v19.16b \n" + + "EOR v0.16b, v0.16b, v16.16b \n" + "AESE v19.16b, v13.16b \n" + "AESMC v19.16b, v19.16b \n" + + "LD1 {v16.2d}, [%[input]], #16 \n" + "AESE v19.16b, v14.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v19.16b, v19.16b, v16.16b \n" + "ST1 {v19.2d}, [%[out]], #16 \n" + + "B 1b \n" + + "2: \n" + "LD1 {v16.2d}, [%[input]], #16 \n" + "MOV v0.16b, v17.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "#CTR operations, increment counter and xorbuf \n" + "EOR v0.16b, v0.16b, v16.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "3: \n" + "#store current counter value at the end \n" + "ST1 {v17.2d}, %[regOut] \n" + + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", 
"v9", "v10","v11","v12","v13","v14","v15", + "v16", "v17", "v18", "v19" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CTR round value"); + return BAD_FUNC_ARG; + } + + aes->left = 0; + } + + /* handle non block size remaining */ + if (sz) { + wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp); + IncrementAesCounter((byte*)aes->reg); + + aes->left = AES_BLOCK_SIZE; + tmp = (byte*)aes->tmp; + + while (sz--) { + *(out++) = *(in++) ^ *(tmp++); + aes->left--; + } + } + return 0; + } + +#endif /* WOLFSSL_AES_COUNTER */ + +#ifdef HAVE_AESGCM + +/* + * Based from GCM implementation in wolfcrypt/src/aes.c + */ + +/* PMULL and RBIT only with AArch64 */ +/* Use ARM hardware for polynomial multiply */ +static void GMULT(byte* X, byte* Y) +{ + __asm__ volatile ( + "LD1 {v0.16b}, [%[inX]] \n" + "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */ + "RBIT v0.16b, v0.16b \n" + + + /* Algorithm 1 from Intel GCM white paper. + "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" + */ + "PMULL v3.1q, v0.1d, v1.1d \n" /* a0 * b0 = C */ + "PMULL2 v4.1q, v0.2d, v1.2d \n" /* a1 * b1 = D */ + "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v6.1q, v0.1d, v5.1d \n" /* a0 * b1 = E */ + "PMULL2 v5.1q, v0.2d, v5.2d \n" /* a1 * b0 = F */ + + "#Set a register to all 0s using EOR \n" + "EOR v7.16b, v7.16b, v7.16b \n" + "EOR v5.16b, v5.16b, v6.16b \n" /* F ^ E */ + "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */ + "EOR v3.16b, v3.16b, v6.16b \n" /* low 128 bits in v3 */ + "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */ + "EOR v4.16b, v4.16b, v6.16b \n" /* high 128 bits in v4 */ + + + /* Based from White Paper "Implementing GCM on ARMv8" + by Conrado P.L. Gouvea and Julio Lopez + reduction on 256bit value using Algorithm 5 */ + "MOVI v8.16b, #0x87 \n" + "USHR v8.2d, v8.2d, #56 \n" + /* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/ + "PMULL2 v5.1q, v4.2d, v8.2d \n" + "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */ + "EOR v4.16b, v4.16b, v6.16b \n" + "EXT v6.16b, v7.16b, v5.16b, #8 \n" + "EOR v3.16b, v3.16b, v6.16b \n" + "PMULL v5.1q, v4.1d, v8.1d \n" + "EOR v4.16b, v3.16b, v5.16b \n" + + "RBIT v4.16b, v4.16b \n" + "STR q4, [%[out]] \n" + : [out] "=r" (X), "=r" (Y) + : [inX] "0" (X), [inY] "1" (Y) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" + ); +} + + +void GHASH(Aes* aes, const byte* a, word32 aSz, + const byte* c, word32 cSz, byte* s, word32 sSz) +{ + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + word32 blocks, partial; + byte* h = aes->H; + + XMEMSET(x, 0, AES_BLOCK_SIZE); + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + blocks = aSz / AES_BLOCK_SIZE; + partial = aSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, a, AES_BLOCK_SIZE); + GMULT(x, h); + a += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, a, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, h); + } + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + blocks = cSz / AES_BLOCK_SIZE; + partial = cSz % AES_BLOCK_SIZE; + while (blocks--) { + xorbuf(x, c, AES_BLOCK_SIZE); + GMULT(x, h); + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, c, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, h); + } + } + + /* Hash in the lengths of A and C in bits */ + FlattenSzInBits(&scratch[0], 
aSz); + FlattenSzInBits(&scratch[8], cSz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + + /* Copy the result (minus last GMULT) into s. */ + XMEMCPY(s, x, sSz); +} + + +#ifdef WOLFSSL_AES_128 +/* internal function : see wc_AesGcmEncrypt */ +static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks; + word32 partial; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + + /* Noticed different optimization levels treated head of array different. + Some cases was stack pointer plus offset others was a regester containing + address. To make uniform for passing in to inline assembly code am using + pointers to the head of each local array. + */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advencment */ + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + } + + /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; + if (blocks > 0) { + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v13.2d}, [%[ctr]] \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" + + + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ + "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, 
v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, 1f \n" /* only one block jump to final GHASH */ + + "LD1 {v12.2d}, [%[input]], #16 \n" + + /*************************************************** + Interweave GHASH and encrypt if more then 1 block + ***************************************************/ + "2: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v10.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" + + "CBZ w11, 1f \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B 2b \n" + + /*************************************************** + GHASH on last block + ***************************************************/ + "1: \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" + "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + 
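/* fold the shifted reduction term into the low 128 bits of the product */ + 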
"EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" + ); + } + + /* take care of partial block sizes leftover */ + if (partial != 0) { + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "EOR v19.16b, v19.16b, v0.16b \n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" (aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + ); + + + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + 
/* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } + return 0; +} +#endif /* WOLFSSL_AES_128 */ + +#ifdef WOLFSSL_AES_192 +/* internal function : see wc_AesGcmEncrypt */ +static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks; + word32 partial; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + + /* Noticed different optimization levels treated head of array different. + Some cases was stack pointer plus offset others was a regester containing + address. To make uniform for passing in to inline assembly code am using + pointers to the head of each local array. + */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advencment */ + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + } + + /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; + if (blocks > 0) { + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v13.2d}, [%[ctr]] \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" + + + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ + "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b 
\n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v30.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, 1f \n" /* only one block jump to final GHASH */ + "LD1 {v12.2d}, [%[input]], #16 \n" + + /*************************************************** + Interweave GHASH and encrypt if more then 1 block + ***************************************************/ + "2: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v30.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" + + "CBZ w11, 1f \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B 2b \n" + + /*************************************************** + GHASH on last block + ***************************************************/ + "1: \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" + "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 
128 bits in v4 */ + + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + } + + /* take care of partial block sizes leftover */ + if (partial != 0) { + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v30.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v19.16b, v19.16b, v0.16b 
\n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" (aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + ); + + + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } + + return 0; +} +#endif /* WOLFSSL_AES_192 */ + +#ifdef WOLFSSL_AES_256 +/* internal function : see wc_AesGcmEncrypt */ +static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks; + word32 partial; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + + /* Noticed different optimization levels treated head of array different. + Some cases was stack pointer plus offset others was a regester containing + address. To make uniform for passing in to inline assembly code am using + pointers to the head of each local array. + */ + byte* ctr = counter; + byte* iCtr = initialCounter; + byte* xPt = x; + byte* sPt = scratch; + byte* keyPt; /* pointer to handle pointer advencment */ + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); + + + /* Hash in the Additional Authentication Data */ + XMEMSET(x, 0, AES_BLOCK_SIZE); + if (authInSz != 0 && authIn != NULL) { + blocks = authInSz / AES_BLOCK_SIZE; + partial = authInSz % AES_BLOCK_SIZE; + /* do as many blocks as possible */ + while (blocks--) { + xorbuf(x, authIn, AES_BLOCK_SIZE); + GMULT(x, aes->H); + authIn += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, authIn, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + } + + /* do as many blocks as possible */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz % AES_BLOCK_SIZE; + if (blocks > 0) { + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v13.2d}, [%[ctr]] \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EXT v14.16b, v14.16b, v22.16b, #8\n" + + + /*************************************************** + Get first out block for GHASH using AES encrypt + ***************************************************/ + "REV64 v13.16b, v13.16b \n" /* network order */ + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v16.2d}, %[inY] \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v28.2d-v31.2d}, 
[%[Key]], #64\n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v30.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v0.16b \n" + + "CBZ w11, 1f \n" /* only one block jump to final GHASH */ + "LD1 {v12.2d}, [%[input]], #16 \n" + + /*************************************************** + Interweave GHASH and encrypt if more then 1 block + ***************************************************/ + "2: \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "EOR v15.16b, v17.16b, v15.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */ + "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "MOV v0.16b, v13.16b \n" + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v30.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "MOV v15.16b, v0.16b \n" + "RBIT v17.16b, v19.16b \n" + + "CBZ w11, 1f \n" + "LD1 {v12.2d}, [%[input]], #16 \n" + "B 2b \n" + + /*************************************************** + GHASH on last block + ***************************************************/ + "1: \n" + "EOR v15.16b, v17.16b, v15.16b \n" + "RBIT v15.16b, 
v15.16b \n" /* v15 is encrypted out block */ + + "#store current AES counter value \n" + "ST1 {v13.2d}, [%[ctrOut]] \n" + "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ + "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ + "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + + "#Reduce product from multiplication \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ + "EOR v19.16b, v19.16b, v21.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "RBIT v17.16b, v19.16b \n" + "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ + + :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) + ,[xOut] "=r" (xPt),"=m" (aes->H) + :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (in) + ,[inX] "4" (xPt), [inY] "m" (aes->H) + : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" + ); + } + + /* take care of partial block sizes leftover */ + if (partial != 0) { + IncrementGcmCounter(counter); + wc_AesEncrypt(aes, counter, scratch); + xorbuf(scratch, in, partial); + XMEMCPY(out, scratch, partial); + + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, out, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, aes->H); + } + + /* Hash in the lengths of A and C in bits */ + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + FlattenSzInBits(&scratch[0], authInSz); + FlattenSzInBits(&scratch[8], sz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + XMEMCPY(scratch, x, AES_BLOCK_SIZE); + + keyPt = (byte*)aes->key; + __asm__ __volatile__ ( + + "LD1 {v16.16b}, [%[tag]] \n" + "LD1 {v17.16b}, %[h] \n" + "RBIT v16.16b, v16.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ + "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ + "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" + "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n" + "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ + "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ + "LD1 {v0.2d}, [%[ctr]] \n" + + "#Set a register to all 0s using EOR \n" + "EOR v22.16b, v22.16b, v22.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ + "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "MOVI v23.16b, #0x87 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v20.16b, v22.16b, #8 \n" + "AESE v0.16b, 
v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v19.16b, v19.16b, v21.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v22.16b, v20.16b, #8 \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v18.16b, v18.16b, v21.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v28.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v29.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v20.1q, v19.1d, v23.1d \n" + "EOR v19.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v30.16b \n" + "RBIT v19.16b, v19.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n" + "EOR v19.16b, v19.16b, v0.16b \n" + "STR q19, [%[out]] \n" + + :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) + :[tag] "0" (sPt), [Key] "1" (keyPt), + [ctr] "2" (iCtr) , [h] "m" (aes->H) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + + + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } + + return 0; +} +#endif /* WOLFSSL_AES_256 */ + + +/* aarch64 with PMULL and PMULL2 + * Encrypt and tag data using AES with GCM mode. + * aes: Aes structure having already been set with set key function + * out: encrypted data output buffer + * in: plain text input buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer to hold tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + * + * Notes: + * GHASH multiplication based from Algorithm 1 from Intel GCM white paper. + * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" + * + * GHASH reduction Based from White Paper "Implementing GCM on ARMv8" + * by Conrado P.L. Gouvea and Julio Lopez reduction on 256bit value using + * Algorithm 5 + */ +int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + /* sanity checks */ + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && sz > 0)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + + if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) { + WOLFSSL_MSG("GcmEncrypt authTagSz error"); + return BAD_FUNC_ARG; + } + + switch (aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: + return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); +#endif +#ifdef WOLFSSL_AES_192 + case 12: + return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); +#endif +#ifdef WOLFSSL_AES_256 + case 14: + return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); +#endif + default: + WOLFSSL_MSG("AES-GCM invalid round number"); + return BAD_FUNC_ARG; + } +} + + +#ifdef HAVE_AES_DECRYPT +/* + * Check tag and decrypt data using AES with GCM mode. 
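+ * The tag is recomputed over authIn and the ciphertext and checked with
+ * ConstantCompare before any plaintext is written; AES_GCM_AUTH_E is
+ * returned on a mismatch.
+ *
+ * Example use (illustrative sketch only; the key, nonce and tag sizes shown
+ * are assumptions and error handling is omitted):
+ *   wc_AesGcmSetKey(aes, key, 16);
+ *   ret = wc_AesGcmDecrypt(aes, out, in, sz, iv, GCM_NONCE_MID_SZ,
+ *                          tag, 16, aad, aadSz);
+ *   // ret == AES_GCM_AUTH_E means the authentication tag did not verify
+ *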
+ * aes: Aes structure having already been set with set key function + * out: decrypted data output buffer + * in: cipher text buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer holding tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + */ +int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks = sz / AES_BLOCK_SIZE; + word32 partial = sz % AES_BLOCK_SIZE; + const byte* c = in; + byte* p = out; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte *ctr ; + byte scratch[AES_BLOCK_SIZE]; + + ctr = counter ; + + /* sanity checks */ + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && sz > 0)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + GMULT(initialCounter, aes->H); + } + XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); + + /* Calculate the authTag again using the received auth data and the + * cipher text. */ + { + byte Tprime[AES_BLOCK_SIZE]; + byte EKY0[AES_BLOCK_SIZE]; + + GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); + GMULT(Tprime, aes->H); + wc_AesEncrypt(aes, ctr, EKY0); + xorbuf(Tprime, EKY0, sizeof(Tprime)); + + if (ConstantCompare(authTag, Tprime, authTagSz) != 0) { + return AES_GCM_AUTH_E; + } + } + + /* do as many blocks as possible */ + if (blocks > 0) { + /* pointer needed because it is incremented when read, causing + * an issue with call to encrypt/decrypt leftovers */ + byte* keyPt = (byte*)aes->key; + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v14.16b, #1 \n" + "USHR v14.2d, v14.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v13.16b, v13.16b, v13.16b \n" + "EXT v14.16b, v14.16b, v13.16b, #8 \n" + + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d}, [%[ctr]] \n" + "LD1 {v13.2d}, [%[input]], #16 \n" + + "1: \n" + "REV64 v12.16b, v12.16b \n" /* network order */ + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "ADD v12.2d, v12.2d, v14.2d \n" /* add 1 to counter */ + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "REV64 v12.16b, v12.16b \n" /* revert from network order */ + "MOV v0.16b, v12.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + + "EOR v0.16b, v0.16b, v13.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v13.2d}, [%[input]], 
#16 \n" + "B 1b \n" + + "2: \n" + "#store current counter value at the end \n" + "ST1 {v12.16b}, [%[ctrOut]] \n" + + :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) + :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (c) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + break; +#endif +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v16.16b, #1 \n" + "USHR v16.2d, v16.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "EXT v16.16b, v16.16b, v14.16b, #8 \n" + + "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d}, [%[Key]], #16 \n" + "LD1 {v14.2d}, [%[ctr]] \n" + "LD1 {v15.2d}, [%[input]], #16 \n" + + "1: \n" + "REV64 v14.16b, v14.16b \n" /* network order */ + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "ADD v14.2d, v14.2d, v16.2d \n" /* add 1 to counter */ + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "REV64 v14.16b, v14.16b \n" /* revert from network order */ + "MOV v0.16b, v14.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v15.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2: \n" + "#store current counter value at the end \n" + "ST1 {v14.2d}, [%[ctrOut]] \n" + + :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) + :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (c) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + + "#Create vector with the value 1 \n" + "MOVI v18.16b, #1 \n" + "USHR v18.2d, v18.2d, #56 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "EOR v19.16b, v19.16b, v19.16b \n" + "EXT v18.16b, v18.16b, v19.16b, #8 \n" + + "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" + "LD1 {v17.2d}, [%[ctr]] \n" + "LD1 {v16.2d}, [%[input]], #16 \n" + + "1: \n" + "REV64 v17.16b, v17.16b \n" /* network order */ + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */ + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + "MOV v0.16b, v17.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #1 \n" + "AESE v0.16b, v5.16b \n" + 
"AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v16.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v16.2d}, [%[input]], #16 \n" + "B 1b \n" + + "2: \n" + "#store current counter value at the end \n" + "ST1 {v17.2d}, [%[ctrOut]] \n" + + :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) + :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), + [input] "3" (c) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-GCM round value"); + return BAD_FUNC_ARG; + } + } + if (partial != 0) { + IncrementGcmCounter(ctr); + wc_AesEncrypt(aes, ctr, scratch); + + /* check if pointer is null after main AES-GCM blocks + * helps static analysis */ + if (p == NULL || c == NULL) { + return BAD_STATE_E; + } + xorbuf(scratch, c, partial); + XMEMCPY(p, scratch, partial); + } + return 0; +} + +#endif /* HAVE_AES_DECRYPT */ +#endif /* HAVE_AESGCM */ + + +/*************************************** + * not 64 bit so use 32 bit mode +****************************************/ +#else + +/* AES CCM/GCM use encrypt direct but not decrypt */ +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) + { + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ + + word32* keyPt = aes->key; + __asm__ __volatile__ ( + "VLD1.32 {q0}, [%[CtrIn]] \n" + "VLDM %[Key]!, {q1-q4} \n" + + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q3}, [%[Key]]! \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q4}, [%[Key]]! \n" + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2\n" + + "MOV r12, %[R] \n" + "CMP r12, #10 \n" + "BEQ 1f \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2\n" + + "CMP r12, #12 \n" + "BEQ 1f \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2\n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "VLD1.32 {q1}, [%[Key]]! 
\n" + "VEOR.32 q0, q0, q1\n" + "VST1.32 {q0}, [%[CtrOut]] \n" + + :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), + "=r" (inBlock) + :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds), + [CtrIn] "3" (inBlock) + : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4" + ); + + return 0; + } +#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + #ifdef HAVE_AES_DECRYPT + static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) + { + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ + + word32* keyPt = aes->key; + __asm__ __volatile__ ( + "VLD1.32 {q0}, [%[CtrIn]] \n" + "VLDM %[Key]!, {q1-q4} \n" + + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESD.8 q0, q3\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESD.8 q0, q4\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q3}, [%[Key]]! \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q4}, [%[Key]]! \n" + "AESD.8 q0, q2\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q3\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESD.8 q0, q4\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + + "MOV r12, %[R] \n" + "CMP r12, #10 \n" + "BEQ 1f \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + + "CMP r12, #12 \n" + "BEQ 1f \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VEOR.32 q0, q0, q1\n" + "VST1.32 {q0}, [%[CtrOut]] \n" + + :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), + "=r" (inBlock) + :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds), + [CtrIn] "3" (inBlock) + : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4" + ); + + return 0; +} + #endif /* HAVE_AES_DECRYPT */ +#endif /* DIRECT or COUNTER */ + +/* AES-CBC */ +#ifdef HAVE_AES_CBC + int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + word32 numBlocks = sz / AES_BLOCK_SIZE; + + if (aes == NULL || out == NULL || (in == NULL && sz > 0)) { + return BAD_FUNC_ARG; + } + + /* do as many block size ops as possible */ + if (numBlocks > 0) { + word32* keyPt = aes->key; + word32* regPt = aes->reg; + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + + note: grouping AESE & AESMC together as pairs reduces latency + */ + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! 
\n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" + + "1:\n" + "#CBC operations, xorbuf in with current aes->reg \n" + "VEOR.32 q0, q0, q12 \n" + "AESE.8 q0, q1 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q3 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q4 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q5 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q6 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q7 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10\n" + "VEOR.32 q0, q0, q11 \n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q12}, [%[input]]! \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "VST1.32 {q0}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt) + :"0" (out), [Key] "r" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" + "VLD1.32 {q13}, [%[Key]]! \n" + "VLD1.32 {q14}, [%[Key]]! \n" + + "1:\n" + "#CBC operations, xorbuf in with current aes->reg \n" + "VEOR.32 q0, q0, q12 \n" + "AESE.8 q0, q1 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q3 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q4 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q5 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q6 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q7 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q11 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q13\n" + "VEOR.32 q0, q0, q14 \n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q12}, [%[input]]! \n" + "B 1b \n" + + "2:\n" + "#store current counter qalue at the end \n" + "VST1.32 {q0}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt) + :"0" (out), [Key] "r" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" + "VLD1.32 {q13}, [%[Key]]! \n" + "VLD1.32 {q14}, [%[Key]]! 
\n" + + "1:\n" + "#CBC operations, xorbuf in with current aes->reg \n" + "VEOR.32 q0, q0, q12 \n" + "AESE.8 q0, q1 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q2 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q3 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q4 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q5 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q6 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q7 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q11 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q13 \n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q15}, [%[Key]]! \n" + "AESE.8 q0, q14 \n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q15\n" + "VLD1.32 {q15}, [%[Key]] \n" + "VEOR.32 q0, q0, q15 \n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + "SUB %[Key], %[Key], #16 \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q12}, [%[input]]! \n" + "B 1b \n" + + "2:\n" + "#store current counter qalue at the end \n" + "VST1.32 {q0}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt) + :"0" (out), [Key] "2" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CBC round value"); + return BAD_FUNC_ARG; + } + } + + return 0; + } + + #ifdef HAVE_AES_DECRYPT + int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + word32 numBlocks = sz / AES_BLOCK_SIZE; + + if (aes == NULL || out == NULL || (in == NULL && sz > 0) + || sz % AES_BLOCK_SIZE != 0) { + return BAD_FUNC_ARG; + } + + /* do as many block size ops as possible */ + if (numBlocks > 0) { + word32* keyPt = aes->key; + word32* regPt = aes->reg; + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q13}, [%[reg]] \n" + "VLD1.32 {q0}, [%[input]]!\n" + + "1:\n" + "VMOV.32 q12, q0 \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q3\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q4\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q5\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q6\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q7\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q8\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q9\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q10\n" + "VEOR.32 q0, q0, q11\n" + + "VEOR.32 q0, q0, q13\n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + "VMOV.32 q13, q12 \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q0}, [%[input]]! \n" + "B 1b \n" + + "2: \n" + "#store current counter qalue at the end \n" + "VST1.32 {q13}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt) + :"0" (out), [Key] "r" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! 
\n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q12}, [%[Key]]! \n" + "VLD1.32 {q13}, [%[Key]]! \n" + "VLD1.32 {q14}, [%[reg]] \n" + "VLD1.32 {q0}, [%[input]]!\n" + + "1: \n" + "VMOV.32 q15, q0 \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q3\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q4\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q5\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q6\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q7\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q8\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q9\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q10\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q11\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q12\n" + "VEOR.32 q0, q0, q13\n" + + "VEOR.32 q0, q0, q14\n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + "VMOV.32 q14, q15 \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q0}, [%[input]]! \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "VST1.32 {q15}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt) + :"0" (out), [Key] "r" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLD1.32 {q1}, [%[Key]]! \n" + "VLD1.32 {q2}, [%[Key]]! \n" + "VLD1.32 {q3}, [%[Key]]! \n" + "VLD1.32 {q4}, [%[Key]]! \n" + "VLD1.32 {q5}, [%[Key]]! \n" + "VLD1.32 {q6}, [%[Key]]! \n" + "VLD1.32 {q7}, [%[Key]]! \n" + "VLD1.32 {q8}, [%[Key]]! \n" + "VLD1.32 {q9}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q12}, [%[Key]]! \n" + "VLD1.32 {q14}, [%[reg]] \n" + "VLD1.32 {q0}, [%[input]]!\n" + + "1:\n" + "VMOV.32 q15, q0 \n" + "AESD.8 q0, q1\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q2\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q3\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q4\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q5\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q6\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q7\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q8\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q9\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q10\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q11\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q13}, [%[Key]]! \n" + "AESD.8 q0, q12\n" + "AESIMC.8 q0, q0\n" + "AESD.8 q0, q13\n" + "AESIMC.8 q0, q0\n" + "VLD1.32 {q13}, [%[Key]]! \n" + "AESD.8 q0, q13\n" + "VLD1.32 {q13}, [%[Key]] \n" + "VEOR.32 q0, q0, q13\n" + "SUB %[Key], %[Key], #32 \n" + + "VEOR.32 q0, q0, q14\n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" + "VMOV.32 q14, q15 \n" + + "CMP r11, #0 \n" + "BEQ 2f \n" + "VLD1.32 {q0}, [%[input]]! 
\n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "VST1.32 {q15}, [%[regOut]] \n" + + :[out] "=r" (out), [regOut] "=r" (regPt) + :"0" (out), [Key] "r" (keyPt), [input] "r" (in), + [blocks] "r" (numBlocks), [reg] "1" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CBC round value"); + return BAD_FUNC_ARG; + } + } + + return 0; + } + #endif + +#endif /* HAVE_AES_CBC */ + +/* AES-CTR */ +#ifdef WOLFSSL_AES_COUNTER + + /* Increment AES counter */ + static WC_INLINE void IncrementAesCounter(byte* inOutCtr) + { + int i; + + /* in network byte order so start at end and work back */ + for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) { + if (++inOutCtr[i]) /* we're done unless we overflow */ + return; + } + } + + int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) + { + byte* tmp; + word32 numBlocks; + + if (aes == NULL || out == NULL || in == NULL) { + return BAD_FUNC_ARG; + } + + tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; + + /* consume any unused bytes left in aes->tmp */ + while (aes->left && sz) { + *(out++) = *(in++) ^ *(tmp++); + aes->left--; + sz--; + } + + /* do as many block size ops as possible */ + numBlocks = sz/AES_BLOCK_SIZE; + if (numBlocks > 0) { + /* pointer needed because it is incremented when read, causing + * an issue with call to encrypt/decrypt leftovers */ + word32* keyPt = aes->key; + word32* regPt = aes->reg; + sz -= numBlocks * AES_BLOCK_SIZE; + switch(aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLDM %[Key]!, {q1-q4} \n" + + "#Create vector with the value 1 \n" + "VMOV.u32 q15, #1 \n" + "VSHR.u64 q15, q15, #32 \n" + "VLDM %[Key]!, {q5-q8} \n" + "VEOR.32 q14, q14, q14 \n" + "VLDM %[Key]!, {q9-q11} \n" + "VEXT.8 q14, q15, q14, #8\n" + + "VLD1.32 {q13}, [%[reg]]\n" + + /* double block */ + "1: \n" + "CMP r11, #1 \n" + "BEQ 2f \n" + "CMP r11, #0 \n" + "BEQ 3f \n" + + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + "VEOR.32 q0, q0, q11\n" + + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "VEOR.32 q0, q0, q12\n" + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q10\n" + "VST1.32 {q0}, [%[out]]! 
\n" + "VEOR.32 q15, q15, q11\n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! \n" + + "B 1b \n" + + /* single block */ + "2: \n" + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "SUB r11, r11, #1 \n" + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10\n" + "VLD1.32 {q12}, [%[input]]! \n" + "VEOR.32 q0, q0, q11\n" + "#CTR operations, increment counter and xorbuf \n" + "VEOR.32 q0, q0, q12\n" + "VST1.32 {q0}, [%[out]]! \n" + + "3: \n" + "#store current counter qalue at the end \n" + "VST1.32 {q13}, [%[regOut]] \n" + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "2" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15" + ); + break; +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLDM %[Key]!, {q1-q4} \n" + + "#Create vector with the value 1 \n" + "VMOV.u32 q15, #1 \n" + "VSHR.u64 q15, q15, #32 \n" + "VLDM %[Key]!, {q5-q8} \n" + "VEOR.32 q14, q14, q14 \n" + "VEXT.8 q14, q15, q14, #8\n" + + "VLDM %[Key]!, {q9-q10} \n" + "VLD1.32 {q13}, [%[reg]]\n" + + /* double block */ + "1: \n" + "CMP r11, #1 \n" + "BEQ 2f \n" + "CMP r11, #0 \n" + "BEQ 3f \n" + + "VMOV.32 q0, q13\n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + "AESE.8 q15, q10\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q11\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q11}, [%[Key]] \n" + "AESE.8 q0, q12\n" + "AESE.8 q15, q12\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "VEOR.32 q0, q0, q11\n" + "VEOR.32 q15, q15, q11\n" + "VEOR.32 q0, q0, q12\n" + + "VLD1.32 {q12}, [%[input]]! 
\n" + "VST1.32 {q0}, [%[out]]! \n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! \n" + "SUB %[Key], %[Key], #32 \n" + + "B 1b \n" + + + /* single block */ + "2: \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "SUB r11, r11, #1 \n" + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]] \n" + "AESE.8 q0, q12\n" + "VLD1.32 {q12}, [%[input]]! \n" + "VEOR.32 q0, q0, q11\n" + "#CTR operations, increment counter and xorbuf \n" + "VEOR.32 q0, q0, q12\n" + "VST1.32 {q0}, [%[out]]! \n" + + "3: \n" + "#store current counter qalue at the end \n" + "VST1.32 {q13}, [%[regOut]] \n" + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "2" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14" + ); + break; +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV r11, %[blocks] \n" + "VLDM %[Key]!, {q1-q4} \n" + + "#Create vector with the value 1 \n" + "VMOV.u32 q15, #1 \n" + "VSHR.u64 q15, q15, #32 \n" + "VLDM %[Key]!, {q5-q8} \n" + "VEOR.32 q14, q14, q14 \n" + "VEXT.8 q14, q15, q14, #8\n" + + "VLDM %[Key]!, {q9-q10} \n" + "VLD1.32 {q13}, [%[reg]]\n" + + /* double block */ + "1: \n" + "CMP r11, #1 \n" + "BEQ 2f \n" + "CMP r11, #0 \n" + "BEQ 3f \n" + + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q15, q10\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q11\n" + "AESMC.8 q15, q15\n" + + "VLD1.32 {q11}, [%[Key]]! 
\n" + "AESE.8 q0, q12\n" /* rnd 12*/ + "AESMC.8 q0, q0\n" + "AESE.8 q15, q12\n" /* rnd 12 */ + "AESMC.8 q15, q15\n" + + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q0, q11\n" /* rnd 13 */ + "AESMC.8 q0, q0\n" + "AESE.8 q15, q11\n" /* rnd 13 */ + "AESMC.8 q15, q15\n" + + "VLD1.32 {q11}, [%[Key]] \n" + "AESE.8 q0, q12\n" /* rnd 14 */ + "AESE.8 q15, q12\n" /* rnd 14 */ + + "VLD1.32 {q12}, [%[input]]! \n" + "VEOR.32 q0, q0, q11\n" /* rnd 15 */ + "VEOR.32 q15, q15, q11\n" /* rnd 15 */ + "VEOR.32 q0, q0, q12\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "VST1.32 {q0}, [%[out]]! \n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! \n" + "SUB %[Key], %[Key], #64 \n" + + /* single block */ + "B 1b \n" + + "2: \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q0, q12\n" /* rnd 12 */ + "AESMC.8 q0, q0\n" + "VLD1.32 {q12}, [%[Key]]! \n" + "AESE.8 q0, q11\n" /* rnd 13 */ + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]] \n" + "AESE.8 q0, q12\n" /* rnd 14 */ + "VLD1.32 {q12}, [%[input]]! \n" + "VEOR.32 q0, q0, q11\n" /* rnd 15 */ + "#CTR operations, increment counter and xorbuf \n" + "VEOR.32 q0, q0, q12\n" + "VST1.32 {q0}, [%[out]]! \n" + + "3: \n" + "#store current counter qalue at the end \n" + "VST1.32 {q13}, [%[regOut]] \n" + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt), + "=r" (in) + :"0" (out), [Key] "1" (keyPt), [input] "3" (in), + [blocks] "r" (numBlocks), [reg] "2" (regPt) + : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14" + ); + break; +#endif /* WOLFSSL_AES_256 */ + default: + WOLFSSL_MSG("Bad AES-CTR round qalue"); + return BAD_FUNC_ARG; + } + + aes->left = 0; + } + + /* handle non block size remaining */ + if (sz) { + wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp); + IncrementAesCounter((byte*)aes->reg); + + aes->left = AES_BLOCK_SIZE; + tmp = (byte*)aes->tmp; + + while (sz--) { + *(out++) = *(in++) ^ *(tmp++); + aes->left--; + } + } + + return 0; + } + +#endif /* WOLFSSL_AES_COUNTER */ + +#ifdef HAVE_AESGCM +/* + * Uses Karatsuba algorithm. Reduction algorithm is based on "Implementing GCM + * on ARMv8". Shifting left to account for bit reflection is based on + * "Carry-Less Multiplication and Its Usage for Computing the GCM mode" + */ +static void GMULT(byte* X, byte* Y) +{ + __asm__ __volatile__ ( + "VLD1.32 {q0}, [%[x]] \n" + + /* In GCM format bits are big endian, switch location of bytes to + * allow for logical shifts and carries. 
+ */ + "VREV64.8 q0, q0 \n" + "VLD1.32 {q1}, [%[y]] \n" /* converted on set key */ + "VSWP.8 d0, d1 \n" + + "VMULL.p64 q5, d0, d2 \n" + "VMULL.p64 q6, d1, d3 \n" + "VEOR d15, d2, d3 \n" + "VEOR d14, d0, d1 \n" + "VMULL.p64 q7, d15, d14 \n" + "VEOR q7, q5 \n" + "VEOR q7, q6 \n" + "VEOR d11, d14 \n" + "VEOR d12, d15\n" + + /* shift to left by 1 to account for reflection */ + "VMOV q7, q6 \n" + "VSHL.u64 q6, q6, #1 \n" + "VSHR.u64 q7, q7, #63 \n" + "VEOR d13, d14 \n" + "VMOV q8, q5 \n" + "VSHL.u64 q5, q5, #1 \n" + "VSHR.u64 q8, q8, #63 \n" + "VEOR d12, d17 \n" + "VEOR d11, d16 \n" + + /* create constant 0xc200000000000000 */ + "VMOV.i32 d16, 0xc2000000 \n" + "VSHL.u64 d16, d16, #32 \n" + + /* reduce product of multiplication */ + "VMULL.p64 q9, d10, d16 \n" + "VEOR d11, d18 \n" + "VEOR d12, d19 \n" + "VMULL.p64 q9, d11, d16 \n" + "VEOR q6, q9 \n" + "VEOR q10, q5, q6 \n" + + /* convert to GCM format */ + "VREV64.8 q10, q10 \n" + "VSWP.8 d20, d21 \n" + + "VST1.32 {q10}, [%[xOut]] \n" + + : [xOut] "=r" (X), [yOut] "=r" (Y) + : [x] "0" (X), [y] "1" (Y) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6" ,"q7", "q8", + "q9", "q10", "q11" ,"q12", "q13", "q14", "q15" + ); +} + + +void GHASH(Aes* aes, const byte* a, word32 aSz, + const byte* c, word32 cSz, byte* s, word32 sSz) +{ + byte x[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + word32 blocks, partial; + byte* h = aes->H; + + XMEMSET(x, 0, AES_BLOCK_SIZE); + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + blocks = aSz / AES_BLOCK_SIZE; + partial = aSz % AES_BLOCK_SIZE; + while (blocks--) { + xorbuf(x, a, AES_BLOCK_SIZE); + GMULT(x, h); + a += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, a, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, h); + } + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + blocks = cSz / AES_BLOCK_SIZE; + partial = cSz % AES_BLOCK_SIZE; + while (blocks--) { + xorbuf(x, c, AES_BLOCK_SIZE); + GMULT(x, h); + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + XMEMSET(scratch, 0, AES_BLOCK_SIZE); + XMEMCPY(scratch, c, partial); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, h); + } + } + + /* Hash in the lengths of A and C in bits */ + FlattenSzInBits(&scratch[0], aSz); + FlattenSzInBits(&scratch[8], cSz); + xorbuf(x, scratch, AES_BLOCK_SIZE); + GMULT(x, h); + + /* Copy the result into s. */ + XMEMCPY(s, x, sSz); +} + + +/* Aarch32 + * Encrypt and tag data using AES with GCM mode. 
+ * aes: Aes structure having already been set with set key function + * out: encrypted data output buffer + * in: plain text input buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer to hold tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + */ +int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks = sz / AES_BLOCK_SIZE; + word32 partial = sz % AES_BLOCK_SIZE; + const byte* p = in; + byte* c = out; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte *ctr ; + byte scratch[AES_BLOCK_SIZE]; + ctr = counter ; + + /* sanity checks */ + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && sz > 0)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + + if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) { + WOLFSSL_MSG("GcmEncrypt authTagSz error"); + return BAD_FUNC_ARG; + } + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + } + XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); + + while (blocks--) { + IncrementGcmCounter(ctr); + wc_AesEncrypt(aes, ctr, scratch); + xorbuf(scratch, p, AES_BLOCK_SIZE); + XMEMCPY(c, scratch, AES_BLOCK_SIZE); + p += AES_BLOCK_SIZE; + c += AES_BLOCK_SIZE; + } + + if (partial != 0) { + IncrementGcmCounter(ctr); + wc_AesEncrypt(aes, ctr, scratch); + xorbuf(scratch, p, partial); + XMEMCPY(c, scratch, partial); + + } + + GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); + wc_AesEncrypt(aes, initialCounter, scratch); + if (authTagSz > AES_BLOCK_SIZE) { + xorbuf(authTag, scratch, AES_BLOCK_SIZE); + } + else { + xorbuf(authTag, scratch, authTagSz); + } + + return 0; +} + + +#ifdef HAVE_AES_DECRYPT +/* + * Check tag and decrypt data using AES with GCM mode. 
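+ * The expected tag is recomputed with GHASH over authIn and the cipher
+ * text and checked with ConstantCompare before any plaintext is written;
+ * AES_GCM_AUTH_E is returned on a mismatch.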
+ * aes: Aes structure having already been set with set key function + * out: decrypted data output buffer + * in: cipher text buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer holding tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer + */ +int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + word32 blocks = sz / AES_BLOCK_SIZE; + word32 partial = sz % AES_BLOCK_SIZE; + const byte* c = in; + byte* p = out; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte *ctr ; + byte scratch[AES_BLOCK_SIZE]; + ctr = counter ; + + /* sanity checks */ + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && sz > 0)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(initialCounter, iv, ivSz); + initialCounter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); + } + XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); + + /* Calculate the authTag again using the received auth data and the + * cipher text. */ + { + byte Tprime[AES_BLOCK_SIZE]; + byte EKY0[AES_BLOCK_SIZE]; + + GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); + wc_AesEncrypt(aes, ctr, EKY0); + xorbuf(Tprime, EKY0, sizeof(Tprime)); + + if (ConstantCompare(authTag, Tprime, authTagSz) != 0) { + return AES_GCM_AUTH_E; + } + } + + while (blocks--) { + IncrementGcmCounter(ctr); + wc_AesEncrypt(aes, ctr, scratch); + xorbuf(scratch, c, AES_BLOCK_SIZE); + XMEMCPY(p, scratch, AES_BLOCK_SIZE); + p += AES_BLOCK_SIZE; + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + IncrementGcmCounter(ctr); + wc_AesEncrypt(aes, ctr, scratch); + + /* check if pointer is null after main AES-GCM blocks + * helps static analysis */ + if (p == NULL || c == NULL) { + return BAD_STATE_E; + } + xorbuf(scratch, c, partial); + XMEMCPY(p, scratch, partial); + } + return 0; +} +#endif /* HAVE_AES_DECRYPT */ +#endif /* HAVE_AESGCM */ + +#endif /* aarch64 */ + + +#ifdef HAVE_AESCCM +/* Software version of AES-CCM from wolfcrypt/src/aes.c + * Gets some speed up from hardware acceleration of wc_AesEncrypt */ + +static void roll_x(Aes* aes, const byte* in, word32 inSz, byte* out) +{ + /* process the bulk of the data */ + while (inSz >= AES_BLOCK_SIZE) { + xorbuf(out, in, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + inSz -= AES_BLOCK_SIZE; + + wc_AesEncrypt(aes, out, out); + } + + /* process remainder of the data */ + if (inSz > 0) { + xorbuf(out, in, inSz); + wc_AesEncrypt(aes, out, out); + } +} + + +static void roll_auth(Aes* aes, const byte* in, word32 inSz, byte* out) +{ + word32 authLenSz; + word32 remainder; + + /* encode the length in */ + if (inSz <= 0xFEFF) { + authLenSz = 2; + out[0] ^= ((inSz & 0xFF00) >> 8); + out[1] ^= (inSz & 0x00FF); + } + else if (inSz <= 0xFFFFFFFF) { + authLenSz = 6; + out[0] ^= 0xFF; out[1] ^= 0xFE; + out[2] ^= ((inSz & 0xFF000000) >> 24); + out[3] ^= ((inSz & 0x00FF0000) >> 16); + out[4] ^= ((inSz & 0x0000FF00) >> 8); + out[5] ^= (inSz & 0x000000FF); + } + /* Note, the protocol handles auth data up to 2^64, but we are + * using 32-bit sizes right now, so the 
bigger data isn't handled + * else if (inSz <= 0xFFFFFFFFFFFFFFFF) {} */ + else + return; + + /* start fill out the rest of the first block */ + remainder = AES_BLOCK_SIZE - authLenSz; + if (inSz >= remainder) { + /* plenty of bulk data to fill the remainder of this block */ + xorbuf(out + authLenSz, in, remainder); + inSz -= remainder; + in += remainder; + } + else { + /* not enough bulk data, copy what is available, and pad zero */ + xorbuf(out + authLenSz, in, inSz); + inSz = 0; + } + wc_AesEncrypt(aes, out, out); + + if (inSz > 0) + roll_x(aes, in, inSz, out); +} + + +static WC_INLINE void AesCcmCtrInc(byte* B, word32 lenSz) +{ + word32 i; + + for (i = 0; i < lenSz; i++) { + if (++B[AES_BLOCK_SIZE - 1 - i] != 0) return; + } +} + + +/* return 0 on success */ +int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte A[AES_BLOCK_SIZE]; + byte B[AES_BLOCK_SIZE]; + byte lenSz; + word32 i; + byte mask = 0xFF; + word32 wordSz = (word32)sizeof(word32); + + /* sanity check on arguments */ + if (aes == NULL || out == NULL || in == NULL || nonce == NULL + || authTag == NULL || nonceSz < 7 || nonceSz > 13) + return BAD_FUNC_ARG; + + XMEMCPY(B+1, nonce, nonceSz); + lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz; + B[0] = (authInSz > 0 ? 64 : 0) + + (8 * (((byte)authTagSz - 2) / 2)) + + (lenSz - 1); + for (i = 0; i < lenSz; i++) { + if (mask && i >= wordSz) + mask = 0x00; + B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask; + } + + wc_AesEncrypt(aes, B, A); + + if (authInSz > 0) + roll_auth(aes, authIn, authInSz, A); + if (inSz > 0) + roll_x(aes, in, inSz, A); + XMEMCPY(authTag, A, authTagSz); + + B[0] = lenSz - 1; + for (i = 0; i < lenSz; i++) + B[AES_BLOCK_SIZE - 1 - i] = 0; + wc_AesEncrypt(aes, B, A); + xorbuf(authTag, A, authTagSz); + + B[15] = 1; + while (inSz >= AES_BLOCK_SIZE) { + wc_AesEncrypt(aes, B, A); + xorbuf(A, in, AES_BLOCK_SIZE); + XMEMCPY(out, A, AES_BLOCK_SIZE); + + AesCcmCtrInc(B, lenSz); + inSz -= AES_BLOCK_SIZE; + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + } + if (inSz > 0) { + wc_AesEncrypt(aes, B, A); + xorbuf(A, in, inSz); + XMEMCPY(out, A, inSz); + } + + ForceZero(A, AES_BLOCK_SIZE); + ForceZero(B, AES_BLOCK_SIZE); + + return 0; +} + +#ifdef HAVE_AES_DECRYPT +int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, + const byte* nonce, word32 nonceSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte A[AES_BLOCK_SIZE]; + byte B[AES_BLOCK_SIZE]; + byte* o; + byte lenSz; + word32 i, oSz; + int result = 0; + byte mask = 0xFF; + word32 wordSz = (word32)sizeof(word32); + + /* sanity check on arguments */ + if (aes == NULL || out == NULL || in == NULL || nonce == NULL + || authTag == NULL || nonceSz < 7 || nonceSz > 13) + return BAD_FUNC_ARG; + + o = out; + oSz = inSz; + XMEMCPY(B+1, nonce, nonceSz); + lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz; + + B[0] = lenSz - 1; + for (i = 0; i < lenSz; i++) + B[AES_BLOCK_SIZE - 1 - i] = 0; + B[15] = 1; + + while (oSz >= AES_BLOCK_SIZE) { + wc_AesEncrypt(aes, B, A); + xorbuf(A, in, AES_BLOCK_SIZE); + XMEMCPY(o, A, AES_BLOCK_SIZE); + + AesCcmCtrInc(B, lenSz); + oSz -= AES_BLOCK_SIZE; + in += AES_BLOCK_SIZE; + o += AES_BLOCK_SIZE; + } + if (inSz > 0) { + wc_AesEncrypt(aes, B, A); + xorbuf(A, in, oSz); + XMEMCPY(o, A, oSz); + } + + for (i = 0; i < lenSz; i++) + B[AES_BLOCK_SIZE - 1 - i] = 0; + wc_AesEncrypt(aes, B, A); + + o = out; + oSz = inSz; 
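+    /* Recompute the CBC-MAC over the recovered plaintext: rebuild the B0
+     * flags block, roll in the auth data and the plaintext, then encrypt
+     * the zero counter block and XOR it in before comparing to authTag. */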
+ + B[0] = (authInSz > 0 ? 64 : 0) + + (8 * (((byte)authTagSz - 2) / 2)) + + (lenSz - 1); + for (i = 0; i < lenSz; i++) { + if (mask && i >= wordSz) + mask = 0x00; + B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask; + } + + wc_AesEncrypt(aes, B, A); + + if (authInSz > 0) + roll_auth(aes, authIn, authInSz, A); + if (inSz > 0) + roll_x(aes, o, oSz, A); + + B[0] = lenSz - 1; + for (i = 0; i < lenSz; i++) + B[AES_BLOCK_SIZE - 1 - i] = 0; + wc_AesEncrypt(aes, B, B); + xorbuf(A, B, authTagSz); + + if (ConstantCompare(A, authTag, authTagSz) != 0) { + /* If the authTag check fails, don't keep the decrypted data. + * Unfortunately, you need the decrypted data to calculate the + * check value. */ + XMEMSET(out, 0, inSz); + result = AES_CCM_AUTH_E; + } + + ForceZero(A, AES_BLOCK_SIZE); + ForceZero(B, AES_BLOCK_SIZE); + o = NULL; + + return result; +} +#endif /* HAVE_AES_DECRYPT */ +#endif /* HAVE_AESCCM */ + + + +#ifdef HAVE_AESGCM /* common GCM functions 32 and 64 bit */ +int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) +{ + int ret; + byte iv[AES_BLOCK_SIZE]; + + if (!((len == 16) || (len == 24) || (len == 32))) + return BAD_FUNC_ARG; + + XMEMSET(iv, 0, AES_BLOCK_SIZE); + ret = wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION); + + if (ret == 0) { + wc_AesEncrypt(aes, iv, aes->H); + #if defined(__aarch64__) + { + word32* pt = (word32*)aes->H; + __asm__ volatile ( + "LD1 {v0.16b}, [%[h]] \n" + "RBIT v0.16b, v0.16b \n" + "ST1 {v0.16b}, [%[out]] \n" + : [out] "=r" (pt) + : [h] "0" (pt) + : "cc", "memory", "v0" + ); + } + #else + { + word32* pt = (word32*)aes->H; + __asm__ volatile ( + "VLD1.32 {q0}, [%[h]] \n" + "VREV64.8 q0, q0 \n" + "VSWP.8 d0, d1 \n" + "VST1.32 {q0}, [%[out]] \n" + : [out] "=r" (pt) + : [h] "0" (pt) + : "cc", "memory", "q0" + ); + } + #endif + } + + return ret; +} + +#endif /* HAVE_AESGCM */ + +/* AES-DIRECT */ +#if defined(WOLFSSL_AES_DIRECT) + /* Allow direct access to one block encrypt */ + void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in) + { + if (aes == NULL || out == NULL || in == NULL) { + WOLFSSL_MSG("Invalid input to wc_AesEncryptDirect"); + return; + } + wc_AesEncrypt(aes, in, out); + } + #ifdef HAVE_AES_DECRYPT + /* Allow direct access to one block decrypt */ + void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in) + { + if (aes == NULL || out == NULL || in == NULL) { + WOLFSSL_MSG("Invalid input to wc_AesDecryptDirect"); + return; + } + wc_AesDecrypt(aes, in, out); + } + #endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_DIRECT */ +#endif /* !NO_AES && WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-chacha.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-chacha.c new file mode 100644 index 0000000..df76bec --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-chacha.c @@ -0,0 +1,2857 @@ +/* armv8-chacha.c + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + */ + +/* The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM + * https://cryptojedi.org/papers/neoncrypto-20120320.pdf + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#ifdef HAVE_CHACHA + +#include <wolfssl/wolfcrypt/chacha.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/cpuid.h> +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + +#ifdef CHACHA_AEAD_TEST + #include <stdio.h> +#endif + +#ifdef CHACHA_TEST + #include <stdio.h> +#endif + +#ifdef BIG_ENDIAN_ORDER + #define LITTLE32(x) ByteReverseWord32(x) +#else + #define LITTLE32(x) (x) +#endif + +/* Number of rounds */ +#define ROUNDS 20 + +#define U32C(v) (v##U) +#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) +#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0]) + +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define ARM_SIMD_LEN_BYTES 16 + +/** + * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version + * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. + */ +int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +{ + word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("NONCE : "); + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", inIv[i]); + } + printf("\n\n"); +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + + ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ + ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ + ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ + ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ + + return 0; +} + +/* "expand 32-byte k" as unsigned 32 byte */ +static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +/* "expand 16-byte k" as unsigned 16 byte */ +static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; + +/** + * Key setup. 
8 word iv (nonce) + */ +int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) +{ + const word32* constants; + const byte* k; + +#ifdef XSTREAM_ALIGN + word32 alignKey[8]; +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) + return BAD_FUNC_ARG; + +#ifdef XSTREAM_ALIGN + if ((wolfssl_word)key % 4) { + WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); + XMEMCPY(alignKey, key, keySz); + k = (byte*)alignKey; + } + else { + k = key; + } +#else + k = key; +#endif /* XSTREAM_ALIGN */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("ChaCha key used :\n"); + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i + 1) % 8 == 0) + printf("\n"); + } + printf("\n\n"); +#endif + + ctx->X[4] = U8TO32_LITTLE(k + 0); + ctx->X[5] = U8TO32_LITTLE(k + 4); + ctx->X[6] = U8TO32_LITTLE(k + 8); + ctx->X[7] = U8TO32_LITTLE(k + 12); + if (keySz == CHACHA_MAX_KEY_SZ) { + k += 16; + constants = sigma; + } + else { + constants = tau; + } + ctx->X[ 8] = U8TO32_LITTLE(k + 0); + ctx->X[ 9] = U8TO32_LITTLE(k + 4); + ctx->X[10] = U8TO32_LITTLE(k + 8); + ctx->X[11] = U8TO32_LITTLE(k + 12); + ctx->X[ 0] = constants[0]; + ctx->X[ 1] = constants[1]; + ctx->X[ 2] = constants[2]; + ctx->X[ 3] = constants[3]; + + return 0; +} + +static const word32 L_chacha20_neon_inc_first_word[] = { + 0x1, + 0x0, + 0x0, + 0x0, +}; + +#ifdef __aarch64__ + +static const word32 L_chacha20_neon_add_all_counters[] = { + 0x0, + 0x1, + 0x2, + 0x3, +}; + +static const word32 L_chacha20_neon_rol8[] = { + 0x2010003, + 0x6050407, + 0xa09080b, + 0xe0d0c0f, +}; + +static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m, byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_320 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + word64 bytes64 = (word64) bytes; + __asm__ __volatile__ ( + /* + * The layout of used registers is: + * ARM + * w4-w19: these registers hold the fifth Chacha block for calculation in regular ARM + * w20: loop counter for how many even-odd rounds need to be executed + * w21: the counter offset for the block in ARM registers + * NEON + * v0-v15: the vi'th register holds the i'th word of four blocks during the quarter rounds. + * these registers are later transposed make ADDing the input and XORing the message easier. 
+ * v16-v19: these are helper registers that are used as temporary location to store data + * v20-v23: load the next message block + * v24-v27: the 64 byte initial Chacha block + * v28: vector to increment the counter words of each block + * v29: vector of 5's to increment counters between L_chacha20_arm64_outer_%= loops + * v30: table lookup indices to rotate values by 8 + */ + + /* Load counter-add values for each block */ + "LD1 {v28.4s}, [%[L_chacha20_neon_add_all_counters]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v30.16b}, [%[L_chacha20_neon_rol8]] \n\t" + /* For adding 5 to each counter-add for next 320-byte chunk */ + "MOVI v29.4s, #5 \n\t" + /* Counter for 5th block in regular registers */ + "MOV w21, #4 \n\t" + /* Load state to encrypt */ + "LD1 {v24.4s-v27.4s}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_outer_%=: \n\t" + /* Move state into regular registers */ + "MOV x4, v24.d[0] \n\t" + "MOV x6, v24.d[1] \n\t" + "MOV x8, v25.d[0] \n\t" + "MOV x10, v25.d[1] \n\t" + "MOV x12, v26.d[0] \n\t" + "MOV x14, v26.d[1] \n\t" + "MOV x16, v27.d[0] \n\t" + "MOV x22, v27.d[1] \n\t" + /* Move state into vector registers (x4) */ + "DUP v0.4s, v24.s[0] \n\t" + "DUP v1.4s, v24.s[1] \n\t" + "LSR x5, x4, #32 \n\t" + "DUP v2.4s, v24.s[2] \n\t" + "DUP v3.4s, v24.s[3] \n\t" + "LSR x7, x6, #32 \n\t" + "DUP v4.4s, v25.s[0] \n\t" + "DUP v5.4s, v25.s[1] \n\t" + "LSR x9, x8, #32 \n\t" + "DUP v6.4s, v25.s[2] \n\t" + "DUP v7.4s, v25.s[3] \n\t" + "LSR x11, x10, #32 \n\t" + "DUP v8.4s, v26.s[0] \n\t" + "DUP v9.4s, v26.s[1] \n\t" + "LSR x13, x12, #32 \n\t" + "DUP v10.4s, v26.s[2] \n\t" + "DUP v11.4s, v26.s[3] \n\t" + "LSR x15, x14, #32 \n\t" + "DUP v12.4s, v27.s[0] \n\t" + "DUP v13.4s, v27.s[1] \n\t" + "LSR x17, x16, #32 \n\t" + "DUP v14.4s, v27.s[2] \n\t" + "DUP v15.4s, v27.s[3] \n\t" + "LSR x19, x22, #32 \n\t" + /* Add to counter word */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w20, #10 \n\t" + "\n" + "L_chacha20_arm64_inner_%=: \n\t" + "SUBS w20, w20, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w22, w22, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w22, w22, #16 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w22 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SHL v5.4s, v17.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v18.4s, #12 \n\t" + "ROR w10, w10, #20 
\n\t" + "SHL v7.4s, v19.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SRI v4.4s, v16.4s, #20 \n\t" + "SRI v5.4s, v17.4s, #20 \n\t" + "SRI v6.4s, v18.4s, #20 \n\t" + "SRI v7.4s, v19.4s, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w22, w22, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w22, w22, #24 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + "ROR w19, w19, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w22 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SHL v5.4s, v17.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v18.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v19.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SRI v4.4s, v16.4s, #25 \n\t" + "SRI v5.4s, v17.4s, #25 \n\t" + "SRI v6.4s, v18.4s, #25 \n\t" + "SRI v7.4s, v19.4s, #25 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w22, w22, w7 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w22, w22, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w22 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v17.4s, #12 \n\t" + "ROR w10, w10, #20 \n\t" + "SHL v7.4s, v18.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SHL v4.4s, v19.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v5.4s, v16.4s, #20 \n\t" + "SRI v6.4s, v17.4s, #20 \n\t" + "SRI v7.4s, v18.4s, #20 \n\t" + "SRI v4.4s, v19.4s, #20 \n\t" + /* 
a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w22, w22, w7 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + "ROR w19, w19, #24 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w22, w22, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w22 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v17.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v18.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SHL v4.4s, v19.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SRI v5.4s, v16.4s, #25 \n\t" + "SRI v6.4s, v17.4s, #25 \n\t" + "SRI v7.4s, v18.4s, #25 \n\t" + "SRI v4.4s, v19.4s, #25 \n\t" + "BNE L_chacha20_arm64_inner_%= \n\t" + /* Add counter now rather than after transposed */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Load message */ + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + /* Transpose vectors */ + "TRN1 v16.4s, v0.4s, v1.4s \n\t" + "TRN1 v18.4s, v2.4s, v3.4s \n\t" + "TRN2 v17.4s, v0.4s, v1.4s \n\t" + "TRN2 v19.4s, v2.4s, v3.4s \n\t" + "TRN1 v0.2d, v16.2d, v18.2d \n\t" + "TRN1 v1.2d, v17.2d, v19.2d \n\t" + "TRN2 v2.2d, v16.2d, v18.2d \n\t" + "TRN2 v3.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v4.4s, v5.4s \n\t" + "TRN1 v18.4s, v6.4s, v7.4s \n\t" + "TRN2 v17.4s, v4.4s, v5.4s \n\t" + "TRN2 v19.4s, v6.4s, v7.4s \n\t" + "TRN1 v4.2d, v16.2d, v18.2d \n\t" + "TRN1 v5.2d, v17.2d, v19.2d \n\t" + "TRN2 v6.2d, v16.2d, v18.2d \n\t" + "TRN2 v7.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v8.4s, v9.4s \n\t" + "TRN1 v18.4s, v10.4s, v11.4s \n\t" + "TRN2 v17.4s, v8.4s, v9.4s \n\t" + "TRN2 v19.4s, v10.4s, v11.4s \n\t" + "TRN1 v8.2d, v16.2d, v18.2d \n\t" + "TRN1 v9.2d, v17.2d, v19.2d \n\t" + "TRN2 v10.2d, v16.2d, v18.2d \n\t" + "TRN2 v11.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v12.4s, v13.4s \n\t" + "TRN1 v18.4s, v14.4s, v15.4s \n\t" + "TRN2 v17.4s, v12.4s, v13.4s \n\t" + "TRN2 v19.4s, v14.4s, v15.4s \n\t" + "TRN1 v12.2d, v16.2d, v18.2d \n\t" + "TRN1 v13.2d, v17.2d, v19.2d \n\t" + "TRN2 v14.2d, v16.2d, v18.2d \n\t" + "TRN2 v15.2d, v17.2d, v19.2d \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v16.4s, v0.4s, v24.4s \n\t" + "ADD v17.4s, v4.4s, v25.4s \n\t" + "ADD v18.4s, v8.4s, v26.4s \n\t" + "ADD v19.4s, v12.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v1.4s, 
v24.4s \n\t" + "ADD v17.4s, v5.4s, v25.4s \n\t" + "ADD v18.4s, v9.4s, v26.4s \n\t" + "ADD v19.4s, v13.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v2.4s, v24.4s \n\t" + "ADD v17.4s, v6.4s, v25.4s \n\t" + "ADD v18.4s, v10.4s, v26.4s \n\t" + "ADD v19.4s, v14.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v3.4s, v24.4s \n\t" + "ADD v17.4s, v7.4s, v25.4s \n\t" + "ADD v18.4s, v11.4s, v26.4s \n\t" + "ADD v19.4s, v15.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, LSL #32 \n\t" + "ORR x6, x6, x7, LSL #32 \n\t" + "ORR x8, x8, x9, LSL #32 \n\t" + "MOV v16.d[0], x4 \n\t" + "ORR x10, x10, x11, LSL #32 \n\t" + "MOV v16.d[1], x6 \n\t" + "ORR x12, x12, x13, LSL #32 \n\t" + "MOV v17.d[0], x8 \n\t" + "ORR x14, x14, x15, LSL #32 \n\t" + "MOV v17.d[1], x10 \n\t" + "ORR x16, x16, x17, LSL #32 \n\t" + "MOV v18.d[0], x12 \n\t" + "ORR x22, x22, x19, LSL #32 \n\t" + "MOV v18.d[1], x14 \n\t" + "MOV v19.d[0], x16 \n\t" + "MOV v19.d[1], x22 \n\t" + /* Add back state, XOR in message and store */ + "ADD v16.4s, v16.4s, v24.4s \n\t" + "ADD v17.4s, v17.4s, v25.4s \n\t" + "ADD v18.4s, v18.4s, v26.4s \n\t" + "ADD v19.4s, v19.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "ADD w21, w21, #5 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #320 \n\t" + "ADD v28.4s, v28.4s, v29.4s \n\t" + "BNE L_chacha20_arm64_outer_%= \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), + [bytes] "+r" (bytes64) + : [L_chacha20_neon_add_all_counters] "r" (L_chacha20_neon_add_all_counters), + [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "cc", + "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", + "x13", "x14", "x15", "x16", "x17", "x22", "x19", "x20", "x21", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" + ); +} +#endif /* __aarch64__ */ + +/** + * Converts word into bytes with rotations having been done. 
+ */ +static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_256\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + // v0-v3 - first block + // v12 first block helper + // v4-v7 - second block + // v13 second block helper + // v8-v11 - third block + // v14 third block helper + // w4-w19 - fourth block + + // v0 0 1 2 3 + // v1 4 5 6 7 + // v2 8 9 10 11 + // v3 12 13 14 15 + // load CHACHA state with indices placed as shown above + /* Load state to encrypt */ + "LD1 {v20.4S-v23.4S}, [%[input]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v24.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Move state into regular registers */ + "MOV x4, v20.D[0] \n\t" + "MOV x6, v20.D[1] \n\t" + "MOV x8, v21.D[0] \n\t" + "MOV x10, v21.D[1] \n\t" + "MOV x12, v22.D[0] \n\t" + "MOV x14, v22.D[1] \n\t" + "MOV x16, v23.D[0] \n\t" + "MOV x22, v23.D[1] \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v20.16B \n\t" + "MOV v1.16B, v21.16B \n\t" + "LSR x19, x22, #32 \n\t" + "MOV v2.16B, v22.16B \n\t" + "ADD w20, w16, #1 \n\t" + "MOV v3.16B, v23.16B \n\t" + "LSR x17, x16, #32 \n\t" + "MOV v4.16B, v20.16B \n\t" + "MOV v5.16B, v21.16B \n\t" + "LSR x15, x14, #32 \n\t" + "MOV v6.16B, v22.16B \n\t" + "ADD w21, w16, #2 \n\t" + "MOV v7.16B, v23.16B \n\t" + "LSR x13, x12, #32 \n\t" + "MOV v8.16B, v20.16B \n\t" + "MOV v9.16B, v21.16B \n\t" + "LSR x11, x10, #32 \n\t" + "MOV v10.16B, v22.16B \n\t" + "ADD w16, w16, #3 \n\t" + "MOV v11.16B, v23.16B \n\t" + "LSR x9, x8, #32 \n\t" + /* Set counter word */ + "MOV v7.S[0], w20 \n\t" + "LSR x7, x6, #32 \n\t" + "MOV v11.S[0], w21 \n\t" + "LSR x5, x4, #32 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 \n\t" + "\n" + "L_chacha20_arm64_256_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w8 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w22, w22, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w17, w17, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w22, w22, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w12, w12, w16 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w13, w13, w17 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w14, w14, w22 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w15, w15, w19 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w8, w8, w12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w9, w9, w13 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w10, w10, w14 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w11, w11, w15 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w9, w9, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w10, w10, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w9 \n\t" + "EOR v7.16B, 
v7.16B, v4.16B \n\t" + "ADD w6, w6, w10 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w11 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w16, w16, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w17, w17, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w22, w22, w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w19, w19, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w16, w16, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w22, w22, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w19, w19, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w12, w12, w16 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w13, w13, w17 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w14, w14, w22 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w15, w15, w19 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w8, w8, w12 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w9, w9, w13 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w10, w10, w14 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EOR w11, w11, w15 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #4 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w9 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w17, w17, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w22, w22, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w16, w16, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w17, w17, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w22, w22, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w14, w14, w19 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w15, w15, w16 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w12, w12, w17 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w13, w13, w22 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w9, w9, w14 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w10, w10, w15 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w11, w11, w12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w8, w8, w13 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w9, w9, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w10, w10, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w8, w8, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w9 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w10 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "ADD w6, w6, w11 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w8 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w19, w19, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w16, w16, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w17, w17, 
w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w22, w22, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w16, w16, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w22, w22, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w14, w14, w19 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w15, w15, w16 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w12, w12, w17 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w13, w13, w22 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w9, w9, w14 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w10, w10, w15 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w11, w11, w12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EOR w8, w8, w13 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #12 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #4 \n\t" + "BNE L_chacha20_arm64_256_loop_%= \n\t" + /* Load message */ + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + /* Add one (2 added during calculating vector results) */ + "ADD w16, w16, #1 \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v20.4S \n\t" + "ADD v1.4S, v1.4S, v21.4S \n\t" + "ADD v2.4S, v2.4S, v22.4S \n\t" + "ADD v3.4S, v3.4S, v23.4S \n\t" + "EOR v0.16B, v0.16B, v16.16B \n\t" + "EOR v1.16B, v1.16B, v17.16B \n\t" + "EOR v2.16B, v2.16B, v18.16B \n\t" + "EOR v3.16B, v3.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w20 \n\t" + "ADD v4.4S, v4.4S, v20.4S \n\t" + "ADD v5.4S, v5.4S, v21.4S \n\t" + "ADD v6.4S, v6.4S, v22.4S \n\t" + "ADD v7.4S, v7.4S, v23.4S \n\t" + "EOR v4.16B, v4.16B, v16.16B \n\t" + "EOR v5.16B, v5.16B, v17.16B \n\t" + "EOR v6.16B, v6.16B, v18.16B \n\t" + "EOR v7.16B, v7.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w21 \n\t" + "ADD v8.4S, v8.4S, v20.4S \n\t" + "ADD v9.4S, v9.4S, v21.4S \n\t" + "ADD v10.4S, v10.4S, v22.4S \n\t" + "ADD v11.4S, v11.4S, v23.4S \n\t" + "EOR v8.16B, v8.16B, v16.16B \n\t" + "EOR v9.16B, v9.16B, v17.16B \n\t" + "EOR v10.16B, v10.16B, v18.16B \n\t" + "EOR v11.16B, v11.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v8.4S-v11.4S}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, lsl #32 \n\t" + "ORR x6, x6, x7, lsl #32 \n\t" + "ORR x8, x8, x9, lsl #32 \n\t" + "MOV v12.D[0], x4 \n\t" + "ORR x10, x10, x11, lsl #32 \n\t" + "MOV v12.D[1], x6 \n\t" + "ORR x12, x12, x13, lsl #32 \n\t" + "MOV v13.D[0], x8 \n\t" + "ORR x14, x14, x15, lsl #32 \n\t" + "MOV v13.D[1], x10 \n\t" + "ORR x16, x16, x17, lsl #32 \n\t" + "MOV v14.D[0], x12 \n\t" + "ORR x22, x22, x19, lsl #32 \n\t" + "MOV v14.D[1], x14 \n\t" + "MOV v15.D[0], x16 \n\t" + "MOV v15.D[1], x22 \n\t" + /* Add back state, XOR in message and store */ + "ADD v12.4S, v12.4S, v20.4S \n\t" + "ADD v13.4S, v13.4S, v21.4S \n\t" + "ADD v14.4S, v14.4S, v22.4S \n\t" + "ADD v15.4S, v15.4S, v23.4S \n\t" + "EOR v12.16B, v12.16B, v16.16B \n\t" + "EOR v13.16B, v13.16B, v17.16B \n\t" + "EOR v14.16B, v14.16B, v18.16B \n\t" + "EOR v15.16B, v15.16B, 
v19.16B \n\t" + "ST1 {v12.4S-v15.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", + "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x22", "x19", "x20", "x21", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23" + ); +#else + word32 x[CHACHA_CHUNK_WORDS]; + word32* x_addr = x; + __asm__ __volatile__ ( + // The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM + // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + + ".align 2 \n\t" + "LDR r14, %[input] \n\t" // load input address + + "LDM r14, { r0-r12 } \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 + "VMOV d0, r0, r1 \n\t" + "VMOV d1, r2, r3 \n\t" + "VMOV d2, r4, r5 \n\t" + "VMOV d3, r6, r7 \n\t" + "VMOV d4, r8, r9 \n\t" + "STRD r10, r11, %[x_10] \n\t" + "VMOV d5, r10, r11 \n\t" + "LDRD r11, r10, [r14, #4*14] \n\t" + "VMOV q4, q0 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VMOV q8, q0 \n\t" + "VMOV q9, q1 \n\t" + "VMOV q10, q2 \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 15 14 12 + "VMOV d7, r11, r10 \n\t" + "STR r10, %[x_15] \n\t" + "VMOV d15, r11, r10 \n\t" + "VMOV d23, r11, r10 \n\t" + "MOV r10, r12 \n\t" + "MOV r12, r11 \n\t" + "LDR r11, [r14, #4*13] \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + "MOV r14, %[rounds] \n\t" + + "VMOV d6, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d14, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d22, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" // ARM calculates the fourth block (two was already added earlier) + "\n" + "L_chacha20_arm32_256_loop_%=: \n\t" + "SUBS r14, r14, #1 \n\t" + + // 0, 4, 8, 12 + // 1, 5, 9, 13 + + // ODD ROUND + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + "VEOR q12, q3, q0 \n\t" + "ROR r10, r10, #16 \n\t" // 12 12 + "VEOR q13, q7, q4 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VREV32.16 q7, q13 \n\t" + "EOR r4, r4, r8 \n\t" // 4 4 8 + "VREV32.16 q11, q14 \n\t" + + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VEOR q13, q5, q6 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VEOR q14, q9, q10 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 13 13 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + "VSRI.I32 q1, q12, #20 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VSRI.I32 q5, q13, #20 \n\t" + "STR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "LDR r11, %[x_15] \n\t" + "VADD.I32 q0, q0, q1 \n\t" + "EOR 
r4, r4, r8 \n\t" // 4 4 8 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_8] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_10] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_9] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_11] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r4, r4, #25 \n\t" // 4 4 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 10 11 12 15 14 + + // 2, 6, 10, 14 + // 3, 7, 11, 15 + + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEOR q13, q5, q6 \n\t" + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEOR q14, q9, q10 \n\t" + "EOR r7, r7, r9 \n\t" // 7 7 11 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q9, q14, #25 \n\t" + + // EVEN ROUND + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r11, r11, #24 \n\t" // 15 15 + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "EOR r7, r7, r9 \n\t" // 7 7 11 + "VEXT.8 q9, q9, q9, #4 \n\t" // permute elements left by one + "ROR r6, r6, #25 \n\t" // 6 6 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q11, q11, q11, #12 \n\t" // permute elements left by three + + // 0, 5, 10, 15 + // 1, 6, 11, 12 + + "ADD r0, r0, r5 \n\t" // 0 0 5 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + "VEOR q12, q3, q0 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + "VEOR q13, q7, q4 \n\t" + "ROR r10, r10, #16 \n\t" // 12 12 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VREV32.16 q7, q13 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VREV32.16 q11, q14 \n\t" + + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VADD.I32 q10, q10, q11 \n\t" + "ADD 
r0, r0, r5 \n\t" // 0 0 5 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VEOR q13, q5, q6 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VEOR q14, q9, q10 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 15 15 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + "VSRI.I32 q1, q12, #20 \n\t" + "STR r11, %[x_15] \n\t" + "VSRI.I32 q5, q13, #20 \n\t" + "LDR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VADD.I32 q0, q0, q1 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_10] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_8] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_11] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_9] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r6, r6, #25 \n\t" // 6 6 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + // 2, 7, 8, 13 + // 3, 4, 9, 14 + + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, r4 \n\t" // 3 3 4 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEOR q13, q5, q6 \n\t" + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEOR q14, q9, q10 \n\t" + "EOR r4, r4, r9 \n\t" // 4 4 9 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r4 \n\t" // 3 3 4 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q9, q14, #25 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "ROR r11, r11, #24 \n\t" // 13 13 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "EOR r4, r4, r9 \n\t" // 4 4 9 + "VEXT.8 q9, q9, q9, #12 \n\t" // permute elements left by three + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r4, r4, #25 \n\t" // 4 4 + "VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_256_loop_%= \n\t" + + "LDR r14, %[x_addr] \n\t" // load address of x to r14 + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + "ADD 
r10, r10, #3 \n\t" // add three here to make later NEON easier + "STM r14, { r0-r9 } \n\t" + "STRD r10, r11, [r14, #4*12] \n\t" + "LDR r9, %[input] \n\t" // load input address + "STR r12, [r14, #4*14] \n\t" + "LDR r10, %[c] \n\t" // load c address + + "VLDM r9, { q12-q15 } \n\t" + "LDR r12, %[m] \n\t" // load m address + + "VADD.I32 q0, q0, q12 \n\t" + "VADD.I32 q1, q1, q13 \n\t" + "VADD.I32 q2, q2, q14 \n\t" + "VADD.I32 q3, q3, q15 \n\t" + + "VADD.I32 q4, q4, q12 \n\t" + "VADD.I32 q5, q5, q13 \n\t" + "VADD.I32 q6, q6, q14 \n\t" + "VADD.I32 q7, q7, q15 \n\t" + + "MOV r11, #1 \n\t" + + "VADD.I32 q8, q8, q12 \n\t" + "VMOV.I32 q12, #0 \n\t" + "VADD.I32 q9, q9, q13 \n\t" + "VMOV.I32 d24[0], r11 \n\t" + "VADD.I32 q10, q10, q14 \n\t" + "VADD.I32 q11, q11, q15 \n\t" + + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + "VADD.I32 q7, q7, q12 \n\t" // add one to counter + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 } \n\t" // store to c + + "VLDM r14, { q0-q3 } \n\t " // load final block from x + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM r10!, { q4-q7 } \n\t" // store to c + + "VLDM r9, { q4-q7 } \n\t" // load input + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q8, q8, q12 \n\t" + "VEOR q9, q9, q13 \n\t" + "VEOR q10, q10, q14 \n\t" + "VEOR q11, q11, q15 \n\t" + "VSTM r10!, { q8-q11 } \n\t" // store to c + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VADD.I32 q0, q0, q4 \n\t" + "VADD.I32 q1, q1, q5 \n\t" + "VADD.I32 q2, q2, q6 \n\t" + "VADD.I32 q3, q3, q7 \n\t" // three was added earlier + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 } \n\t" // store to c + + : [c] "+m" (c), + [x_0] "=m" (x), + [x_8] "=m" (x[8]), + [x_9] "=m" (x[9]), + [x_10] "=m" (x[10]), + [x_11] "=m" (x[11]), + [x_13] "=m" (x[13]), + [x_15] "=m" (x[15]) + : [rounds] "I" (ROUNDS/2), [input] "m" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES), + [m] "m" (m), [x_addr] "m" (x_addr) + : "memory", "cc", + "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r14", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 4; +} + + +static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_128\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + /* Load incrementer register to modify counter */ + "LD1 {v22.16B}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v23.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Load state to encrypt */ + "LD1 {v18.4S-v21.4S}, [%[input]] \n\t" + /* Load message */ + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v18.16B \n\t" + "MOV v1.16B, v19.16B \n\t" + "MOV v2.16B, v20.16B \n\t" + "MOV v3.16B, v21.16B \n\t" + "MOV v4.16B, v18.16B \n\t" + "MOV v5.16B, v19.16B \n\t" + "MOV v6.16B, v20.16B \n\t" + "MOV v7.16B, v21.16B \n\t" + /* Add counter word */ + "ADD v7.4S, v7.4S, v22.4S \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 
\n\t" + "\n" + "L_chacha20_arm64_128_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "BNE L_chacha20_arm64_128_loop_%= \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v18.4S \n\t" + "ADD v1.4S, v1.4S, v19.4S \n\t" + "ADD v2.4S, v2.4S, v20.4S \n\t" + "ADD v3.4S, v3.4S, v21.4S \n\t" + "EOR v0.16B, v0.16B, v14.16B \n\t" + "EOR v1.16B, v1.16B, v15.16B \n\t" + "EOR v2.16B, v2.16B, v16.16B \n\t" + "EOR v3.16B, v3.16B, v17.16B \n\t" + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "ADD v21.4S, v21.4S, v22.4S \n\t" + "ADD v4.4S, v4.4S, v18.4S \n\t" + "ADD v5.4S, v5.4S, v19.4S \n\t" + "ADD v6.4S, v6.4S, v20.4S \n\t" + "ADD v7.4S, v7.4S, v21.4S \n\t" + "EOR v4.16B, v4.16B, v14.16B \n\t" + "EOR v5.16B, v5.16B, v15.16B \n\t" + "EOR v6.16B, v6.16B, v16.16B \n\t" + "EOR v7.16B, v7.16B, 
v17.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21" + ); +#else + __asm__ __volatile__ ( + "MOV r11, %[rounds] \n\t" + "MOV r12, #1 \n\t" + "VLDM %[input], { q0-q3 } \n\t" + "VMOV.I32 q8, #0 \n\t" + "VMOV q4, q0 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VADD.I32 q7, q3, q8 \n\t" // add one to counter + + // store input + "VMOV q10, q0 \n\t" + "VMOV q11, q1 \n\t" + "VMOV q12, q2 \n\t" + "VMOV q13, q3 \n\t" + "\n" + "L_chacha20_arm32_128_loop_%=: \n\t" + "SUBS r11, r11, #1 \n\t" + + // ODD ROUND + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + // EVEN ROUND + + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD 
instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_128_loop_%= \n\t" + + "VMOV.I32 q8, #0 \n\t" + "VADD.I32 q0, q0, q10 \n\t" + "VADD.I32 q1, q1, q11 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VADD.I32 q2, q2, q12 \n\t" + "VADD.I32 q3, q3, q13 \n\t" + + "VADD.I32 q13, q13, q8 \n\t" // add one to counter + + "VADD.I32 q4, q4, q10 \n\t" + "VADD.I32 q5, q5, q11 \n\t" + "VADD.I32 q6, q6, q12 \n\t" + "VADD.I32 q7, q7, q13 \n\t" + + "VLDM %[m], { q8-q15 } \n\t" + "VEOR q0, q0, q8 \n\t" + "VEOR q1, q1, q9 \n\t" + "VEOR q2, q2, q10 \n\t" + "VEOR q3, q3, q11 \n\t" + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM %[c], { q0-q7 } \n\t" + + : [c] "+r" (c), [m] "+r" (m) + : [rounds] "I" (ROUNDS/2), [input] "r" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES) + : "memory", "cc", + "r11", "r12", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 2; +} + +static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, + byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_64 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + word64 bytes64 = (word64) bytes; + __asm__ __volatile__ ( + /* Load index look-up for rotating left 8 bits */ + "LD1 {v13.16B}, [%[L_chacha20_neon_rol8]] \n\t" + "LD1 {v14.4S}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load state to encrypt */ + "LD1 {v8.4S-v11.4S}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_64_loop_%=: \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v8.16B \n\t" + "MOV v1.16B, v9.16B \n\t" + "MOV v2.16B, v10.16B \n\t" + "MOV v3.16B, v11.16B \n\t" + /* Add counter word */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL 
v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR 
v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + 
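The TBL instructions in these rounds, driven by the v13 look-up loaded from L_chacha20_neon_rol8, perform the <<<8 rotation of every 32-bit lane as a byte shuffle (REV32 covers <<<16, and the SHL/SRI pairs cover <<<12 and <<<7). Rotating a little-endian 32-bit lane left by 8 bits is a pure byte permutation; a minimal scalar sketch, assuming little-endian byte order and using an illustrative helper name:

#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the TBL-based <<<8: for little-endian
 * lanes the index vector would look like { 3,0,1,2, 7,4,5,6, 11,8,9,10,
 * 15,12,13,14 }, i.e. each lane's top byte wraps around to the bottom. */
static uint32_t rotl8_by_bytes(uint32_t x)
{
    uint8_t b[4], r[4];
    memcpy(b, &x, 4);   /* little-endian view: b[0] is the least significant byte */
    r[0] = b[3];
    r[1] = b[0];
    r[2] = b[1];
    r[3] = b[2];
    memcpy(&x, r, 4);
    return x;           /* equals (x << 8) | (x >> 24) */
}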
/* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + 
"SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Add back state */ + "ADD v0.4S, v0.4S, v8.4S \n\t" + "ADD v1.4S, v1.4S, v9.4S \n\t" + "ADD v2.4S, v2.4S, v10.4S \n\t" + "ADD v3.4S, v3.4S, v11.4S \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm64_64_lt_64_%= \n\t" + "LD1 {v4.4S-v7.4S}, [%[m]], #64 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "EOR v6.16B, v6.16B, 
v2.16B \n\t" + "EOR v7.16B, v7.16B, v3.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "ADD v11.4S, v11.4S, v14.4S \n\t" + "BNE L_chacha20_arm64_64_loop_%= \n\t" + "B L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_64_%=: \n\t" + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm64_64_lt_32_%= \n\t" + "LD1 {v4.4S, v5.4S}, [%[m]], #32 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "ST1 {v4.4S, v5.4S}, [%[c]], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "MOV v0.16B, v2.16B \n\t" + "MOV v1.16B, v3.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_32_%=: \n\t" + "CMP %[bytes], #16 \n\t" + "BLT L_chacha20_arm64_64_lt_16_%= \n\t" + "LD1 {v4.4S}, [%[m]], #16 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "ST1 {v4.4S}, [%[c]], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "MOV v0.16B, v1.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_16_%=: \n\t" + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm64_64_lt_8_%= \n\t" + "LD1 {v4.2S}, [%[m]], #8 \n\t" + "EOR v4.8B, v4.8B, v0.8B \n\t" + "ST1 {v4.2S}, [%[c]], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "MOV v0.D[0], v0.D[1] \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_8_%=: \n\t" + "MOV x4, v0.D[0] \n\t" + "LSL x5, %[bytes], #3 \n\t" + "\n" + "L_chacha20_arm64_64_loop_lt_8_%=: \n\t" + "LDRB w6, [%[m], %[bytes]] \n\t" + "ROR x7, x4, x5 \n\t" + "EOR w6, w6, w7 \n\t" + "STRB w6, [%[c], %[bytes]] \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "SUB x5, x5, #8 \n\t" + "BGE L_chacha20_arm64_64_loop_lt_8_%= \n\t" + "\n" + "L_chacha20_arm64_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + __asm__ __volatile__ ( + /* Get the input state */ + "VLDM %[input], { q8-q11 } \n\t" + /* Get the incrementer register */ + "VLDM %[L_chacha20_neon_inc_first_word], { q14 } \n\t" + "\n" + "L_chacha20_arm32_64_outer_loop_%=: \n\t" + /* Copy over the input state */ + "VMOV q0, q8 \n\t" + "VMOV q1, q9 \n\t" + "VMOV q2, q10 \n\t" + "VMOV q3, q11 \n\t" + /* Compute quarter rounds */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + 
/* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, 
q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" 
+ /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, 
q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Add back state */ + "VADD.I32 q0, q0, q8 \n\t" + "VADD.I32 q1, q1, q9 \n\t" + "VADD.I32 q2, q2, q10 \n\t" + "VADD.I32 q3, q3, q11 \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm32_64_lt_64_%= \n\t" + /* XOR full 64 byte block */ + "VLDM %[m], { q4-q7 } \n\t" + "ADD %[m], %[m], #64 \n\t" + "VEOR q0, q0, q4 \n\t" + "VEOR q1, q1, q5 \n\t" + "VEOR q2, q2, q6 \n\t" + "VEOR q3, q3, q7 \n\t" + "VSTM %[c], { q0-q3 } \n\t" + "ADD %[c], %[c], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "VADD.I32 q11, q11, q14 \n\t" + "BNE L_chacha20_arm32_64_outer_loop_%= \n\t" + "B L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_64_%=: \n\t" + /* XOR 32 bytes */ + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm32_64_lt_32_%= \n\t" + "VLDM %[m], { q4-q5 } \n\t" + "ADD %[m], %[m], #32 \n\t" + "VEOR q4, q4, q0 \n\t" + "VEOR q5, q5, q1 \n\t" + "VSTM %[c], { q4-q5 } \n\t" + "ADD %[c], %[c], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "VMOV q0, q2 \n\t" + "VMOV q1, q3 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_32_%=: \n\t" + /* XOR 16 bytes */ + "CMP %[bytes], #16 \n\t" + "BLT L_chacha20_arm32_64_lt_16_%= \n\t" + "VLDM %[m], { q4 } \n\t" + "ADD %[m], %[m], #16 \n\t" + "VEOR q4, q4, q0 \n\t" + "VSTM %[c], { q4 } \n\t" + "ADD %[c], %[c], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "VMOV q0, q1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + 
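The tail cases that follow keep shrinking in the same way: XOR whatever keystream is still held at the front of q0/d0 against the remaining message bytes, then slide the unused keystream words down so the next, smaller case again reads from the front. Taken together, the partial-block handling amounts to a byte-wise XOR against the leftover of the final keystream block, roughly as in this sketch (names are illustrative, not part of this file):

#include <stddef.h>

/* XOR the trailing 'remaining' bytes of the message against the unused
 * portion of the last 64-byte keystream block. */
static void chacha_xor_tail(unsigned char* c, const unsigned char* m,
                            const unsigned char* keystream, size_t remaining)
{
    size_t i;
    for (i = 0; i < remaining; i++)
        c[i] = (unsigned char)(m[i] ^ keystream[i]);
}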
"\n" + "L_chacha20_arm32_64_lt_16_%=: \n\t" + /* XOR 8 bytes */ + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm32_64_lt_8_%= \n\t" + "VLDR d8, [%[m], #0] \n\t" + "ADD %[m], %[m], #8 \n\t" + "VEOR d8, d8, d0 \n\t" + "VSTR d8, [%[c], #0] \n\t" + "ADD %[c], %[c], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "VMOV d0, d1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_8_%=: \n\t" + /* XOR 4 bytes */ + "CMP %[bytes], #4 \n\t" + "BLT L_chacha20_arm32_64_lt_4_%= \n\t" + "LDR r12, [%[m]], #4 \n\t" + "VMOV r14, d0[0] \n\t" + "EOR r12, r12, r14 \n\t" + "STR r12, [%[c]], #4 \n\t" + "SUBS %[bytes], %[bytes], #4 \n\t" + "VTRN.32 d0, d0 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_%=: \n\t" + /* XOR remaining bytes */ + "VMOV r14, d0[0] \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_loop_%=: \n\t" + "LDRB r12, [%[m]], #1 \n\t" + "EOR r12, r12, r14 \n\t" + "STRB r12, [%[c]], #1 \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "LSR r14, r14, #8 \n\t" + "BGT L_chacha20_arm32_64_lt_4_loop_%= \n\t" + "\n" + "L_chacha20_arm32_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes) + : [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "cc", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q14", "r12", "r14" + ); +#endif /* __aarch64__ */ +} + +/** + * Encrypt a stream of bytes + */ +static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + int processed; + +#ifdef __aarch64__ + if (bytes >= CHACHA_CHUNK_BYTES * 5) { + processed = (bytes / (CHACHA_CHUNK_BYTES * 5)) * CHACHA_CHUNK_BYTES * 5; + wc_Chacha_encrypt_320(ctx->X, m, c, processed); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 4) { +#else + while (bytes >= CHACHA_CHUNK_BYTES * 4) { +#endif /*__aarch64__ */ + processed = wc_Chacha_encrypt_256(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 2) { + processed = wc_Chacha_encrypt_128(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes > 0) { + wc_Chacha_encrypt_64(ctx->X, m, c, bytes); + if (bytes > 64) + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + } +} + +/** + * API to encrypt/decrypt a message of any size. + */ +int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, + word32 msglen) +{ + if (ctx == NULL || output == NULL || input == NULL) + return BAD_FUNC_ARG; + + wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + + return 0; +} + +#endif /* HAVE_CHACHA */ +#endif /* WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S new file mode 100644 index 0000000..891c6d8 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -0,0 +1,6715 @@ +/* armv8-curve25519 + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S + */ +#ifdef __aarch64__ + .text + .align 2 + .globl fe_init + .type fe_init, %function +fe_init: + ret + .size fe_init,.-fe_init + .text + .align 2 + .globl fe_frombytes + .type fe_frombytes, %function +fe_frombytes: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + .size fe_frombytes,.-fe_frombytes + .text + .align 2 + .globl fe_tobytes + .type fe_tobytes, %function +fe_tobytes: + mov x7, #19 + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + adds x6, x2, x7 + adcs x6, x3, xzr + adcs x6, x4, xzr + adc x6, x5, xzr + and x6, x7, x6, asr 63 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + .size fe_tobytes,.-fe_tobytes + .text + .align 2 + .globl fe_1 + .type fe_1, %function +fe_1: + # Set one + mov x1, #1 + stp x1, xzr, [x0] + stp xzr, xzr, [x0, #16] + ret + .size fe_1,.-fe_1 + .text + .align 2 + .globl fe_0 + .type fe_0, %function +fe_0: + # Set zero + stp xzr, xzr, [x0] + stp xzr, xzr, [x0, #16] + ret + .size fe_0,.-fe_0 + .text + .align 2 + .globl fe_copy + .type fe_copy, %function +fe_copy: + # Copy + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + .size fe_copy,.-fe_copy + .text + .align 2 + .globl fe_sub + .type fe_sub, %function +fe_sub: + # Sub + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + subs x3, x3, x7 + sbcs x4, x4, x8 + sbcs x5, x5, x9 + sbcs x6, x6, x10 + mov x12, #-19 + csetm x11, cc + # Mask the modulus + and x12, x11, x12 + and x13, x11, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x3, x3, x12 + adcs x4, x4, x11 + adcs x5, x5, x11 + adc x6, x6, x13 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + .size fe_sub,.-fe_sub + .text + .align 2 + .globl fe_add + .type fe_add, %function +fe_add: + # Add + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adc x6, x6, x10 + mov x12, #-19 + asr x11, x6, #63 + # Mask the modulus + and x12, x11, x12 + and x13, x11, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x3, x3, x12 + sbcs x4, x4, x11 + sbcs x5, x5, x11 + sbc x6, x6, x13 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function +fe_neg: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x6, #-19 + mov x7, #-1 + mov x8, #-1 + mov x9, #0x7fffffffffffffff + subs x6, x6, x2 + sbcs x7, x7, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ret + 
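fe_tobytes above produces the canonical representative modulo p = 2^255 - 19 without branching: it adds 19 and lets the carry chain expose bit 255 of the sum, which is set exactly when the input is at least p, then uses that bit to decide whether 19 is really added before bit 255 is masked off. A sketch of the same freeze step in portable C (illustrative names; the GCC/Clang unsigned __int128 extension stands in for the adds/adcs carry chain, and the input is assumed to be below 2^255 as in the assembly):

#include <stdint.h>

static void fe_freeze(uint64_t r[4], const uint64_t a[4])
{
    unsigned __int128 t;
    uint64_t add19;
    int i;

    /* Carry a + 19 through all four limbs; bit 63 of the top word (bit 255
     * overall) ends up set exactly when a >= p, since then a + 19 >= 2^255. */
    t = (unsigned __int128)a[0] + 19;
    t = (t >> 64) + a[1];
    t = (t >> 64) + a[2];
    t = (t >> 64) + a[3];
    add19 = 19 & ((uint64_t)0 - ((uint64_t)(t >> 63) & 1));

    /* Conditionally add 19, then drop bit 255: the result is a mod p. */
    t = (unsigned __int128)a[0] + add19;
    r[0] = (uint64_t)t;
    for (i = 1; i < 4; i++) {
        t = (t >> 64) + a[i];
        r[i] = (uint64_t)t;
    }
    r[3] &= 0x7fffffffffffffffULL;
}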
.size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function +fe_isnonzero: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + and x5, x6, x5, asr 63 + adds x1, x1, x5 + adcs x2, x2, xzr + adcs x3, x3, xzr + adc x4, x4, xzr + and x4, x4, #0x7fffffffffffffff + orr x0, x1, x2 + orr x3, x3, x4 + orr x0, x0, x3 + ret + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function +fe_isnegative: + mov x6, #19 + ldp x1, x2, [x0] + ldp x3, x4, [x0, #16] + adds x5, x1, x6 + adcs x5, x2, xzr + adcs x5, x3, xzr + adc x5, x4, xzr + and x0, x1, #1 + eor x0, x0, x5, lsr 63 + ret + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: + stp x29, x30, [sp, #-128]! + add x29, sp, #0 + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + stp x22, x23, [x29, #72] + stp x24, x25, [x29, #88] + stp x26, x27, [x29, #104] + str x28, [x29, #120] + str x0, [x29, #16] + sxtb x2, w2 + sbfx x3, x2, #7, #1 + eor x0, x2, x3 + sub x0, x0, x3 + mov x4, #1 + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, #1 + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + cmp x0, #1 + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, #48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #2 + ldp x16, x17, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #3 + ldp x16, x17, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #4 + ldp x16, x17, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + add x1, x1, #0x180 + cmp x0, #5 + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x1, #32] + ldp x23, x24, [x1, #48] + ldp x25, x26, [x1, #64] + ldp x27, x28, [x1, #80] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, 
x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #6 + ldp x16, x17, [x1, #96] + ldp x19, x20, [x1, #112] + ldp x21, x22, [x1, #128] + ldp x23, x24, [x1, #144] + ldp x25, x26, [x1, #160] + ldp x27, x28, [x1, #176] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #7 + ldp x16, x17, [x1, #192] + ldp x19, x20, [x1, #208] + ldp x21, x22, [x1, #224] + ldp x23, x24, [x1, #240] + ldp x25, x26, [x1, #256] + ldp x27, x28, [x1, #272] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + cmp x0, #8 + ldp x16, x17, [x1, #288] + ldp x19, x20, [x1, #304] + ldp x21, x22, [x1, #320] + ldp x23, x24, [x1, #336] + ldp x25, x26, [x1, #352] + ldp x27, x28, [x1, #368] + csel x4, x16, x4, eq + csel x5, x17, x5, eq + csel x6, x19, x6, eq + csel x7, x20, x7, eq + csel x8, x21, x8, eq + csel x9, x22, x9, eq + csel x10, x23, x10, eq + csel x11, x24, x11, eq + csel x12, x25, x12, eq + csel x13, x26, x13, eq + csel x14, x27, x14, eq + csel x15, x28, x15, eq + mov x16, #-19 + mov x17, #-1 + mov x19, #-1 + mov x20, #0x7fffffffffffffff + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbc x20, x20, x15 + cmp x2, #0 + mov x3, x4 + csel x4, x8, x4, lt + csel x8, x3, x8, lt + mov x3, x5 + csel x5, x9, x5, lt + csel x9, x3, x9, lt + mov x3, x6 + csel x6, x10, x6, lt + csel x10, x3, x10, lt + mov x3, x7 + csel x7, x11, x7, lt + csel x11, x3, x11, lt + csel x12, x16, x12, lt + csel x13, x17, x13, lt + csel x14, x19, x14, lt + csel x15, x20, x15, lt + ldr x0, [x29, #16] + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + stp x12, x13, [x0, #64] + stp x14, x15, [x0, #80] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldp x22, x23, [x29, #72] + ldp x24, x25, [x29, #88] + ldp x26, x27, [x29, #104] + ldr x28, [x29, #120] + ldp x29, x30, [sp], #0x80 + ret + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function +fe_mul: + stp x29, x30, [sp, #-64]! 
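fe_cmov_table above reads all eight table entries unconditionally and keeps the one matching the magnitude of the signed digit with csel chains, then uses the digit's sign to conditionally exchange the first two field elements of the entry and replace the third with its negation modulo p, so neither the memory access pattern nor the branch pattern depends on the secret index. The building block is a masked conditional move; a sketch with illustrative names:

#include <stdint.h>

/* r keeps its value unless idx == want, in which case it takes a.  Every
 * candidate is processed the same way regardless of the secret index. */
static void fe_cmov(uint64_t r[4], const uint64_t a[4], uint64_t idx, uint64_t want)
{
    uint64_t diff = idx ^ want;
    uint64_t nonzero = (diff | (0 - diff)) >> 63;   /* 1 iff idx != want */
    uint64_t mask = (uint64_t)0 - (1 - nonzero);    /* all-ones iff idx == want */
    int i;

    for (i = 0; i < 4; i++)
        r[i] ^= mask & (r[i] ^ a[i]);
}

Calling this once per table entry against an accumulator initialised to the neutral entry (1, 1, 0) reproduces the csel ladder above.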
+ add x29, sp, #0 + str x17, [x29, #24] + str x19, [x29, #32] + stp x20, x21, [x29, #40] + str x22, [x29, #56] + # Multiply + ldp x14, x15, [x1] + ldp x16, x17, [x1, #16] + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + mul x6, x14, x19 + umulh x7, x14, x19 + # A[0] * B[1] + mul x3, x14, x20 + umulh x8, x14, x20 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x19 + umulh x4, x15, x19 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x21 + umulh x4, x14, x21 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x20 + umulh x4, x15, x20 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, xzr, xzr + # A[2] * B[0] + mul x3, x16, x19 + umulh x4, x16, x19 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x10, x10, xzr + # A[0] * B[3] + mul x3, x14, x22 + umulh x4, x14, x22 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, xzr, xzr + # A[1] * B[2] + mul x3, x15, x21 + umulh x4, x15, x21 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[2] * B[1] + mul x3, x16, x20 + umulh x4, x16, x20 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[3] * B[0] + mul x3, x17, x19 + umulh x4, x17, x19 + adds x9, x9, x3 + adcs x10, x10, x4 + adc x11, x11, xzr + # A[1] * B[3] + mul x3, x15, x22 + umulh x4, x15, x22 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, xzr, xzr + # A[2] * B[2] + mul x3, x16, x21 + umulh x4, x16, x21 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[3] * B[1] + mul x3, x17, x20 + umulh x4, x17, x20 + adds x10, x10, x3 + adcs x11, x11, x4 + adc x12, x12, xzr + # A[2] * B[3] + mul x3, x16, x22 + umulh x4, x16, x22 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[3] * B[2] + mul x3, x17, x21 + umulh x4, x17, x21 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[3] * B[3] + mul x3, x17, x22 + umulh x4, x17, x22 + adds x12, x12, x3 + adc x13, x13, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x10 + umulh x10, x3, x10 + adds x6, x6, x4 + mul x4, x3, x11 + umulh x11, x3, x11 + adcs x7, x7, x4 + mul x4, x3, x12 + umulh x12, x3, x12 + adcs x8, x8, x4 + mul x4, x3, x13 + umulh x5, x3, x13 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x10 + adcs x8, x8, x11 + adcs x9, x9, x12 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + ldr x17, [x29, #24] + ldr x19, [x29, #32] + ldp x20, x21, [x29, #40] + ldr x22, [x29, #56] + ldp x29, x30, [sp], #0x40 + ret + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function +fe_sq: + # Square + ldp x13, x14, [x1] + ldp x15, x16, [x1, #16] + # A[0] * A[1] + mul x6, x13, x14 + umulh x7, x13, x14 + # A[0] * A[2] + mul x2, x13, x15 + umulh x8, x13, x15 + adds x7, x7, x2 + adc x8, x8, xzr + # A[0] * A[3] + mul x2, x13, x16 + umulh x9, x13, x16 + adds x8, x8, x2 + adc x9, x9, xzr + # A[1] * A[2] + mul x2, x14, x15 + umulh x3, x14, x15 + adds x8, x8, x2 + adcs x9, x9, x3 + adc x10, xzr, xzr + # A[1] * A[3] + mul x2, 
x14, x16 + umulh x3, x14, x16 + adds x9, x9, x2 + adc x10, x10, x3 + # A[2] * A[3] + mul x2, x15, x16 + umulh x11, x15, x16 + adds x10, x10, x2 + adc x11, x11, xzr + # Double + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adc x12, xzr, xzr + # A[0] * A[0] + mul x5, x13, x13 + umulh x4, x13, x13 + # A[1] * A[1] + mul x2, x14, x14 + umulh x3, x14, x14 + adds x6, x6, x4 + adcs x7, x7, x2 + adc x4, x3, xzr + # A[2] * A[2] + mul x2, x15, x15 + umulh x3, x15, x15 + adds x8, x8, x4 + adcs x9, x9, x2 + adc x4, x3, xzr + # A[3] * A[3] + mul x2, x16, x16 + umulh x3, x16, x16 + adds x10, x10, x4 + adcs x11, x11, x2 + adc x12, x12, x3 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x12, x12, x11, #63 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + and x8, x8, #0x7fffffffffffffff + # Multiply top half by 19 + mov x2, #19 + mul x3, x2, x9 + umulh x9, x2, x9 + adds x5, x5, x3 + mul x3, x2, x10 + umulh x10, x2, x10 + adcs x6, x6, x3 + mul x3, x2, x11 + umulh x11, x2, x11 + adcs x7, x7, x3 + mul x3, x2, x12 + umulh x4, x2, x12 + adcs x8, x8, x3 + adc x4, x4, xzr + # Add remaining product results in + adds x6, x6, x9 + adcs x7, x7, x10 + adcs x8, x8, x11 + adc x4, x4, xzr + # Overflow + extr x4, x4, x8, #63 + mul x4, x4, x2 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # Reduce if top bit set + and x4, x2, x8, asr 63 + and x8, x8, #0x7fffffffffffffff + adds x5, x5, x4 + adcs x6, x6, xzr + adcs x7, x7, xzr + adc x8, x8, xzr + # Store + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_invert + .type fe_invert, %function +fe_invert: + stp x29, x30, [sp, #-176]! 
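The Reduce blocks in fe_mul and fe_sq rely on the shape of the prime: with p = 2^255 - 19,

    2^255 ≡ 19 (mod p)

so a 512-bit product t, split at bit 255 as t = t_lo + 2^255 * t_hi, satisfies

    t ≡ t_lo + 19 * t_hi (mod p).

That is exactly the "Multiply top half by 19" step. Adding 19 * t_hi back in can itself carry past bit 255, so the same identity is applied twice more, once in the Overflow step and once in the final Reduce-if-top-bit-set step; the result is a (not necessarily canonical) representative, with full reduction deferred to fe_tobytes.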
+ add x29, sp, #0 + str x20, [x29, #168] + # Invert + str x0, [x29, #144] + str x1, [x29, #152] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #152] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x20, #4 + add x1, x29, #0x50 +L_fe_invert1: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert1 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + mov x20, #9 + add x1, x29, #0x50 +L_fe_invert2: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert2 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x20, #19 + add x1, x29, #0x70 +L_fe_invert3: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert3 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x20, #10 + add x1, x29, #0x50 +L_fe_invert4: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert4 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + mov x20, #49 + add x1, x29, #0x50 +L_fe_invert5: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert5 + add x2, x29, #48 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x20, #0x63 + add x1, x29, #0x70 +L_fe_invert6: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert6 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x20, #50 + add x1, x29, #0x50 +L_fe_invert7: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert7 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x20, #5 + add x1, x29, #48 +L_fe_invert8: + bl fe_sq + sub x20, x20, #1 + cmp x20, #0 + bne L_fe_invert8 + ldr x0, [x29, #144] + add x2, x29, #16 + bl fe_mul + ldr x20, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function +curve25519: + stp x29, x30, [sp, #-288]! 
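fe_invert is Fermat inversion: for the prime p = 2^255 - 19 and a not divisible by p,

    a^(-1) ≡ a^(p - 2) = a^(2^255 - 21) (mod p)

and the squaring loops with counts 4, 9, 19, 10, 49, 0x63 (99), 50 and 5, interleaved with multiplications, form the standard addition chain for that exponent, so the sequence of field operations, and hence the timing, does not depend on the value being inverted.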
+ add x29, sp, #0 + str x17, [x29, #200] + str x19, [x29, #208] + stp x20, x21, [x29, #216] + stp x22, x23, [x29, #232] + stp x24, x25, [x29, #248] + stp x26, x27, [x29, #264] + str x28, [x29, #280] + mov x23, xzr + str x0, [x29, #176] + str x2, [x29, #184] + # Copy + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Set one + mov x2, #1 + stp x2, xzr, [x0] + stp xzr, xzr, [x0, #16] + # Set zero + stp xzr, xzr, [x29, #16] + stp xzr, xzr, [x29, #32] + # Set one + mov x2, #1 + stp x2, xzr, [x29, #48] + stp xzr, xzr, [x29, #64] + mov x25, #62 + mov x24, #24 +L_curve25519_words: +L_curve25519_bits: + ldr x2, [x1, x24] + lsr x2, x2, x25 + and x2, x2, #1 + eor x23, x23, x2 + # Conditional Swap + cmp x23, #1 + ldp x10, x11, [x0] + ldp x12, x13, [x0, #16] + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] + csel x14, x10, x6, eq + csel x10, x6, x10, eq + csel x15, x11, x7, eq + csel x11, x7, x11, eq + csel x16, x12, x8, eq + csel x12, x8, x12, eq + csel x17, x13, x9, eq + csel x13, x9, x13, eq + # Conditional Swap + cmp x23, #1 + ldp x19, x20, [x29, #16] + ldp x21, x22, [x29, #32] + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] + csel x5, x19, x6, eq + csel x19, x6, x19, eq + csel x26, x20, x7, eq + csel x20, x7, x20, eq + csel x27, x21, x8, eq + csel x21, x8, x21, eq + csel x28, x22, x9, eq + csel x22, x9, x22, eq + mov x23, x2 + # Add + adds x6, x10, x19 + adcs x7, x11, x20 + adcs x8, x12, x21 + adc x9, x13, x22 + mov x3, #-19 + asr x2, x9, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x6, x6, x3 + sbcs x7, x7, x2 + sbcs x8, x8, x2 + sbc x9, x9, x4 + # Sub + subs x19, x10, x19 + sbcs x20, x11, x20 + sbcs x21, x12, x21 + sbcs x22, x13, x22 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 + stp x19, x20, [x29, #144] + stp x21, x22, [x29, #160] + # Add + adds x10, x14, x5 + adcs x11, x15, x26 + adcs x12, x16, x27 + adc x13, x17, x28 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Sub + subs x14, x14, x5 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x2 + adcs x16, x16, x2 + adc x17, x17, x4 + # Multiply + # A[0] * B[0] + mul x19, x14, x6 + umulh x20, x14, x6 + # A[0] * B[1] + mul x3, x14, x7 + umulh x21, x14, x7 + adds x20, x20, x3 + adc x21, x21, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x21, x21, x3 + adc x22, x22, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x9 + umulh x4, x14, x9 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x22, x22, x3 + adcs x2, 
x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x22, x22, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x22, #63 + and x22, x22, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x19, x19, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x20, x20, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x21, x21, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x22, x22, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x20, x20, x2 + adcs x21, x21, x26 + adcs x22, x22, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Reduce if top bit set + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Store + stp x19, x20, [x29, #112] + stp x21, x22, [x29, #128] + # Multiply + ldp x2, x26, [x29, #144] + ldp x27, x28, [x29, #160] + # A[0] * B[0] + mul x19, x10, x2 + umulh x20, x10, x2 + # A[0] * B[1] + mul x3, x10, x26 + umulh x21, x10, x26 + adds x20, x20, x3 + adc x21, x21, xzr + # A[1] * B[0] + mul x3, x11, x2 + umulh x4, x11, x2 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[0] * B[2] + mul x3, x10, x27 + umulh x4, x10, x27 + adds x21, x21, x3 + adc x22, x22, x4 + # A[1] * B[1] + mul x3, x11, x26 + umulh x4, x11, x26 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x14, xzr, xzr + # A[2] * B[0] + mul x3, x12, x2 + umulh x4, x12, x2 + adds x21, x21, x3 + adcs x22, x22, x4 + adc x14, x14, xzr + # A[0] * B[3] + mul x3, x10, x28 + umulh x4, x10, x28 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * B[2] + mul x3, x11, x27 + umulh x4, x11, x27 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[2] * B[1] + mul x3, x12, x26 + umulh x4, x12, x26 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[3] * B[0] + mul x3, x13, x2 + umulh x4, x13, x2 + adds x22, x22, x3 + adcs x14, x14, x4 + adc x15, x15, xzr + # A[1] * B[3] + mul x3, x11, x28 + umulh x4, x11, x28 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, xzr, xzr + # A[2] * B[2] + mul x3, x12, x27 + umulh x4, x12, x27 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[3] * B[1] + mul x3, x13, x26 + umulh x4, x13, x26 + adds x14, x14, x3 + adcs x15, x15, x4 + adc x16, x16, xzr + # A[2] * B[3] + mul x3, x12, x28 + umulh x4, x12, x28 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr + # A[3] * B[2] + mul x3, x13, x27 + umulh x4, x13, x27 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[3] * B[3] + mul x3, x13, x28 
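The Conditional Swap blocks at the top of each ladder iteration use the usual constant-time idiom: the current scalar bit is XORed with the previous one to decide whether the two working points (X2 : Z2) and (X3 : Z3) change places, and the exchange is done with csel rather than a branch so the instruction stream is identical for every key. The same effect in portable C, with illustrative names:

#include <stdint.h>

/* Swap a and b when swap == 1, leave them alone when swap == 0,
 * without a data-dependent branch or data-dependent addresses. */
static void fe_cswap(uint64_t a[4], uint64_t b[4], uint64_t swap)
{
    uint64_t mask = (uint64_t)0 - swap;   /* 0 or all-ones */
    int i;

    for (i = 0; i < 4; i++) {
        uint64_t t = mask & (a[i] ^ b[i]);
        a[i] ^= t;
        b[i] ^= t;
    }
}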
+ umulh x4, x13, x28 + adds x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr x15, x15, x14, #63 + extr x14, x14, x22, #63 + and x22, x22, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x19, x19, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x20, x20, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x21, x21, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x22, x22, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Reduce if top bit set + and x5, x3, x22, asr 63 + and x22, x22, #0x7fffffffffffffff + adds x19, x19, x5 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x22, x22, xzr + # Store + # Square + # A[0] * A[1] + mul x11, x2, x26 + umulh x12, x2, x26 + # A[0] * A[2] + mul x3, x2, x27 + umulh x13, x2, x27 + adds x12, x12, x3 + adc x13, x13, xzr + # A[0] * A[3] + mul x3, x2, x28 + umulh x14, x2, x28 + adds x13, x13, x3 + adc x14, x14, xzr + # A[1] * A[2] + mul x3, x26, x27 + umulh x4, x26, x27 + adds x13, x13, x3 + adcs x14, x14, x4 + adc x15, xzr, xzr + # A[1] * A[3] + mul x3, x26, x28 + umulh x4, x26, x28 + adds x14, x14, x3 + adc x15, x15, x4 + # A[2] * A[3] + mul x3, x27, x28 + umulh x16, x27, x28 + adds x15, x15, x3 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + mul x10, x2, x2 + umulh x5, x2, x2 + # A[1] * A[1] + mul x3, x26, x26 + umulh x4, x26, x26 + adds x11, x11, x5 + adcs x12, x12, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x27, x27 + umulh x4, x27, x27 + adds x13, x13, x5 + adcs x14, x14, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x28, x28 + umulh x4, x28, x28 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x14 + umulh x14, x3, x14 + adds x10, x10, x4 + mul x4, x3, x15 + umulh x15, x3, x15 + adcs x11, x11, x4 + mul x4, x3, x16 + umulh x16, x3, x16 + adcs x12, x12, x4 + mul x4, x3, x17 + umulh x5, x3, x17 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x14 + adcs x12, x12, x15 + adcs x13, x13, x16 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + and x5, x3, x13, asr 63 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + # Square + # A[0] * A[1] + mul x15, x6, x7 + umulh x16, x6, x7 + # A[0] * A[2] + mul x3, x6, x8 + umulh x17, x6, x8 + adds x16, x16, x3 + adc x17, x17, xzr + # A[0] * A[3] + mul x3, x6, x9 + umulh x2, x6, x9 + adds x17, x17, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x7, x8 + umulh x4, x7, x8 + adds x17, x17, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x7, x9 + umulh x4, x7, x9 + adds x2, x2, x3 + 
adc x26, x26, x4 + # A[2] * A[3] + mul x3, x8, x9 + umulh x27, x8, x9 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x14, x6, x6 + umulh x5, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + umulh x4, x7, x7 + adds x15, x15, x5 + adcs x16, x16, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x8, x8 + umulh x4, x8, x8 + adds x17, x17, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x9, x9 + umulh x4, x9, x9 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x17, #63 + and x17, x17, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x14, x14, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x15, x15, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x16, x16, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x17, x17, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x15, x15, x2 + adcs x16, x16, x26 + adcs x17, x17, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Reduce if top bit set + and x5, x3, x17, asr 63 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Store + # Multiply + # A[0] * B[0] + mul x6, x14, x10 + umulh x7, x14, x10 + # A[0] * B[1] + mul x3, x14, x11 + umulh x8, x14, x11 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x10 + umulh x4, x15, x10 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + umulh x4, x14, x12 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x11 + umulh x4, x15, x11 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x10 + umulh x4, x16, x10 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x13 + umulh x4, x14, x13 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x12 + umulh x4, x15, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x11 + umulh x4, x16, x11 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x10 + umulh x4, x17, x10 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x13 + umulh x4, x15, x13 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x12 + umulh x4, x16, x12 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x11 + umulh x4, x17, x11 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 
+ mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + # Sub + subs x14, x14, x10 + sbcs x15, x15, x11 + sbcs x16, x16, x12 + sbcs x17, x17, x13 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x14, x14, x3 + adcs x15, x15, x2 + adcs x16, x16, x2 + adc x17, x17, x4 + # Multiply by 121666 + mov x5, #0xdb42 + movk x5, #1, lsl 16 + mul x6, x14, x5 + umulh x7, x14, x5 + mul x3, x15, x5 + umulh x4, x15, x5 + adds x7, x7, x3 + adc x8, xzr, x4 + mul x3, x16, x5 + umulh x4, x16, x5 + adds x8, x8, x3 + adc x9, xzr, x4 + mul x3, x17, x5 + umulh x4, x17, x5 + adds x9, x9, x3 + adc x4, xzr, x4 + mov x5, #19 + extr x4, x4, x9, #63 + mul x4, x4, x5 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x4 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Add + adds x10, x10, x6 + adcs x11, x11, x7 + adcs x12, x12, x8 + adc x13, x13, x9 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Multiply + # A[0] * B[0] + mul x6, x14, x10 + umulh x7, x14, x10 + # A[0] * B[1] + mul x3, x14, x11 + umulh x8, x14, x11 + adds x7, x7, x3 + adc x8, x8, xzr + # A[1] * B[0] + mul x3, x15, x10 + umulh x4, x15, x10 + adds x7, x7, x3 + adcs x8, x8, x4 + adc x9, xzr, xzr + # A[0] * B[2] + mul x3, x14, x12 + umulh x4, x14, x12 + adds x8, x8, x3 + adc x9, x9, x4 + # A[1] * B[1] + mul x3, x15, x11 + umulh x4, x15, x11 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x10 + umulh x4, x16, x10 + adds x8, x8, x3 + adcs x9, x9, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x13 + umulh x4, x14, x13 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x12 + umulh x4, x15, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x11 + umulh x4, x16, x11 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x10 + umulh x4, x17, x10 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x13 + umulh x4, x15, x13 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x12 + umulh x4, x16, x12 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x11 + umulh x4, x17, x11 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x13 + umulh x4, x16, x13 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x12 + umulh x4, x17, x12 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x13 + umulh x4, x17, x13 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into 
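Each pass over a scalar bit in this loop is one x-only double-and-add (Montgomery ladder) step. Writing the working points as (X2 : Z2) and (X3 : Z3) and the base-point x-coordinate as x1, the adds, subs, squarings and multiplies compute, in the usual notation,

    A = X2 + Z2,  B = X2 - Z2,  C = X3 + Z3,  D = X3 - Z3,  E = A^2 - B^2
    X3' = (D*A + C*B)^2
    Z3' = x1 * (D*A - C*B)^2
    X2' = A^2 * B^2
    Z2' = E * (B^2 + 121666 * E)

The Multiply-by-121666 block supplies the curve constant (486662 + 2)/4 = 121666; since A^2 = B^2 + E, the last line gives the same value as the RFC 7748 form E * (A^2 + 121665 * E).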
t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #16] + stp x8, x9, [x29, #32] + # Add + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] + adds x10, x6, x19 + adcs x11, x7, x20 + adcs x12, x8, x21 + adc x13, x9, x22 + mov x3, #-19 + asr x2, x13, #63 + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x10, x10, x3 + sbcs x11, x11, x2 + sbcs x12, x12, x2 + sbc x13, x13, x4 + # Sub + subs x19, x6, x19 + sbcs x20, x7, x20 + sbcs x21, x8, x21 + sbcs x22, x9, x22 + mov x3, #-19 + csetm x2, cc + # Mask the modulus + and x3, x2, x3 + and x4, x2, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x19, x19, x3 + adcs x20, x20, x2 + adcs x21, x21, x2 + adc x22, x22, x4 + # Square + # A[0] * A[1] + mul x7, x10, x11 + umulh x8, x10, x11 + # A[0] * A[2] + mul x3, x10, x12 + umulh x9, x10, x12 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x10, x13 + umulh x2, x10, x13 + adds x9, x9, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x11, x12 + umulh x4, x11, x12 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x11, x13 + umulh x4, x11, x13 + adds x2, x2, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x12, x13 + umulh x27, x12, x13 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x10, x10 + umulh x5, x10, x10 + # A[1] * A[1] + mul x3, x11, x11 + umulh x4, x11, x11 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x12, x12 + umulh x4, x12, x12 + adds x9, x9, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x13, x13 + umulh x4, x13, x13 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs 
x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + # Square + # A[0] * A[1] + mul x7, x19, x20 + umulh x8, x19, x20 + # A[0] * A[2] + mul x3, x19, x21 + umulh x9, x19, x21 + adds x8, x8, x3 + adc x9, x9, xzr + # A[0] * A[3] + mul x3, x19, x22 + umulh x2, x19, x22 + adds x9, x9, x3 + adc x2, x2, xzr + # A[1] * A[2] + mul x3, x20, x21 + umulh x4, x20, x21 + adds x9, x9, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * A[3] + mul x3, x20, x22 + umulh x4, x20, x22 + adds x2, x2, x3 + adc x26, x26, x4 + # A[2] * A[3] + mul x3, x21, x22 + umulh x27, x21, x22 + adds x26, x26, x3 + adc x27, x27, xzr + # Double + adds x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x2, x2, x2 + adcs x26, x26, x26 + adcs x27, x27, x27 + adc x28, xzr, xzr + # A[0] * A[0] + mul x6, x19, x19 + umulh x5, x19, x19 + # A[1] * A[1] + mul x3, x20, x20 + umulh x4, x20, x20 + adds x7, x7, x5 + adcs x8, x8, x3 + adc x5, x4, xzr + # A[2] * A[2] + mul x3, x21, x21 + umulh x4, x21, x21 + adds x9, x9, x5 + adcs x2, x2, x3 + adc x5, x4, xzr + # A[3] * A[3] + mul x3, x22, x22 + umulh x4, x22, x22 + adds x26, x26, x5 + adcs x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x9, #63 + and x9, x9, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x6, x6, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x7, x7, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x8, x8, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x9, x9, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x7, x7, x2 + adcs x8, x8, x26 + adcs x9, x9, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Reduce if top bit set + and x5, x3, x9, asr 63 + and x9, x9, #0x7fffffffffffffff + adds x6, x6, x5 + adcs x7, x7, xzr + adcs x8, x8, xzr + adc x9, x9, xzr + # Store + ldr x2, [x29, #184] + # Multiply + ldp x14, x15, [x2] + ldp x16, x17, [x2, #16] + # A[0] * B[0] + mul x10, x14, x6 + umulh x11, x14, x6 + # A[0] * B[1] + mul x3, x14, x7 + umulh x12, x14, x7 + adds x11, x11, x3 + adc x12, x12, xzr + # A[1] * B[0] + mul x3, x15, x6 + umulh x4, x15, x6 + adds x11, x11, x3 + adcs x12, x12, x4 + adc x13, xzr, xzr + # A[0] * B[2] + mul x3, x14, x8 + umulh x4, x14, x8 + adds x12, x12, x3 + adc x13, x13, x4 + # A[1] * B[1] + mul x3, x15, x7 + umulh x4, x15, x7 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x2, xzr, xzr + # A[2] * B[0] + mul x3, x16, x6 + umulh x4, x16, x6 + adds x12, x12, x3 + adcs x13, x13, x4 + adc x2, x2, xzr + # A[0] * B[3] + mul x3, x14, x9 + umulh x4, x14, x9 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, xzr, xzr + # A[1] * B[2] + mul x3, x15, x8 + umulh x4, x15, x8 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[2] * B[1] + mul x3, x16, x7 + umulh x4, x16, x7 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[3] * B[0] + mul x3, x17, x6 + umulh x4, x17, x6 + adds x13, x13, x3 + adcs x2, x2, x4 + adc x26, x26, xzr + # A[1] * B[3] + mul x3, x15, x9 + umulh x4, x15, x9 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, xzr, xzr + # A[2] * B[2] + mul x3, x16, x8 + umulh x4, x16, x8 + adds x2, x2, x3 + adcs x26, x26, x4 + adc x27, x27, xzr + # A[3] * B[1] + mul x3, x17, x7 + umulh x4, x17, x7 + adds x2, x2, x3 + adcs x26, x26, x4 + 
adc x27, x27, xzr + # A[2] * B[3] + mul x3, x16, x9 + umulh x4, x16, x9 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, xzr, xzr + # A[3] * B[2] + mul x3, x17, x8 + umulh x4, x17, x8 + adds x26, x26, x3 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[3] + mul x3, x17, x9 + umulh x4, x17, x9 + adds x27, x27, x3 + adc x28, x28, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x28, x28, x27, #63 + extr x27, x27, x26, #63 + extr x26, x26, x2, #63 + extr x2, x2, x13, #63 + and x13, x13, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x2 + umulh x2, x3, x2 + adds x10, x10, x4 + mul x4, x3, x26 + umulh x26, x3, x26 + adcs x11, x11, x4 + mul x4, x3, x27 + umulh x27, x3, x27 + adcs x12, x12, x4 + mul x4, x3, x28 + umulh x5, x3, x28 + adcs x13, x13, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x11, x11, x2 + adcs x12, x12, x26 + adcs x13, x13, x27 + adc x5, x5, xzr + # Overflow + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Reduce if top bit set + and x5, x3, x13, asr 63 + and x13, x13, #0x7fffffffffffffff + adds x10, x10, x5 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + # Store + stp x10, x11, [x29, #48] + stp x12, x13, [x29, #64] + sub x25, x25, #1 + cmp x25, #0 + bge L_curve25519_bits + mov x25, #63 + sub x24, x24, #8 + cmp x24, #0 + bge L_curve25519_words + # Invert + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x0, x29, #0x50 + add x1, x29, #48 + bl fe_sq + add x1, x29, #0x50 + bl fe_sq + add x1, x29, #16 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #48 + add x1, x29, #48 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + add x0, x29, #0x50 + add x1, x29, #0x50 + add x2, x29, #0x70 + bl fe_mul + add x0, x29, #0x70 + bl fe_sq + mov x24, #4 + add x1, x29, #0x70 +L_curve25519_inv_1: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_1 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + add x1, x29, #0x50 + bl fe_sq + mov x24, #9 + add x1, x29, #0x70 +L_curve25519_inv_2: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_2 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x90 + bl fe_sq + mov x24, #19 + add x1, x29, #0x90 +L_curve25519_inv_3: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_3 + add x0, x29, #0x70 + add x2, x29, #0x70 + bl fe_mul + mov x24, #10 + add x1, x29, #0x70 +L_curve25519_inv_4: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_4 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x70 + add x1, x29, #0x50 + bl fe_sq + mov x24, #49 + add x1, x29, #0x70 +L_curve25519_inv_5: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_5 + add x2, x29, #0x50 + bl fe_mul + add x0, x29, #0x90 + bl fe_sq + mov x24, #0x63 + add x1, x29, #0x90 +L_curve25519_inv_6: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_6 + add x0, x29, #0x70 + add x2, x29, #0x70 + bl fe_mul + mov x24, #50 + add x1, x29, #0x70 +L_curve25519_inv_7: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_7 + add x0, x29, #0x50 + add x2, x29, #0x50 + bl fe_mul + mov x24, #5 + add x1, x29, #0x50 +L_curve25519_inv_8: + bl fe_sq + sub x24, x24, #1 + cmp x24, #0 + bne L_curve25519_inv_8 + add x0, x29, #16 + add x2, x29, #48 + bl fe_mul + ldr x0, [x29, #176] + # Multiply + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, 
[x29, #16] + ldp x12, x13, [x29, #32] + # A[0] * B[0] + mul x14, x6, x10 + umulh x15, x6, x10 + # A[0] * B[1] + mul x3, x6, x11 + umulh x16, x6, x11 + adds x15, x15, x3 + adc x16, x16, xzr + # A[1] * B[0] + mul x3, x7, x10 + umulh x4, x7, x10 + adds x15, x15, x3 + adcs x16, x16, x4 + adc x17, xzr, xzr + # A[0] * B[2] + mul x3, x6, x12 + umulh x4, x6, x12 + adds x16, x16, x3 + adc x17, x17, x4 + # A[1] * B[1] + mul x3, x7, x11 + umulh x4, x7, x11 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x19, xzr, xzr + # A[2] * B[0] + mul x3, x8, x10 + umulh x4, x8, x10 + adds x16, x16, x3 + adcs x17, x17, x4 + adc x19, x19, xzr + # A[0] * B[3] + mul x3, x6, x13 + umulh x4, x6, x13 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, xzr, xzr + # A[1] * B[2] + mul x3, x7, x12 + umulh x4, x7, x12 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[2] * B[1] + mul x3, x8, x11 + umulh x4, x8, x11 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[3] * B[0] + mul x3, x9, x10 + umulh x4, x9, x10 + adds x17, x17, x3 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[3] + mul x3, x7, x13 + umulh x4, x7, x13 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, xzr, xzr + # A[2] * B[2] + mul x3, x8, x12 + umulh x4, x8, x12 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[3] * B[1] + mul x3, x9, x11 + umulh x4, x9, x11 + adds x19, x19, x3 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[3] + mul x3, x8, x13 + umulh x4, x8, x13 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, xzr, xzr + # A[3] * B[2] + mul x3, x9, x12 + umulh x4, x9, x12 + adds x20, x20, x3 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[3] + mul x3, x9, x13 + umulh x4, x9, x13 + adds x21, x21, x3 + adc x22, x22, x4 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + and x17, x17, #0x7fffffffffffffff + # Multiply top half by 19 + mov x3, #19 + mul x4, x3, x19 + umulh x19, x3, x19 + adds x14, x14, x4 + mul x4, x3, x20 + umulh x20, x3, x20 + adcs x15, x15, x4 + mul x4, x3, x21 + umulh x21, x3, x21 + adcs x16, x16, x4 + mul x4, x3, x22 + umulh x5, x3, x22 + adcs x17, x17, x4 + adc x5, x5, xzr + # Add remaining product results in + adds x15, x15, x19 + adcs x16, x16, x20 + adcs x17, x17, x21 + adc x5, x5, xzr + # Overflow + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Reduce if top bit set + and x5, x3, x17, asr 63 + and x17, x17, #0x7fffffffffffffff + adds x14, x14, x5 + adcs x15, x15, xzr + adcs x16, x16, xzr + adc x17, x17, xzr + # Store + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + mov x0, xzr + ldr x17, [x29, #200] + ldr x19, [x29, #208] + ldp x20, x21, [x29, #216] + ldp x22, x23, [x29, #232] + ldp x24, x25, [x29, #248] + ldp x26, x27, [x29, #264] + ldr x28, [x29, #280] + ldp x29, x30, [sp], #0x120 + ret + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function +fe_pow22523: + stp x29, x30, [sp, #-144]! 
+ add x29, sp, #0 + str x21, [x29, #136] + # pow22523 + str x0, [x29, #112] + str x1, [x29, #120] + add x0, x29, #16 + bl fe_sq + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + add x1, x29, #48 + bl fe_sq + ldr x1, [x29, #120] + add x2, x29, #48 + bl fe_mul + add x0, x29, #16 + add x1, x29, #16 + add x2, x29, #48 + bl fe_mul + bl fe_sq + add x1, x29, #48 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #4 + add x1, x29, #48 +L_fe_pow22523_1: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_1 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #9 + add x1, x29, #48 +L_fe_pow22523_2: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_2 + add x2, x29, #16 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x21, #19 + add x1, x29, #0x50 +L_fe_pow22523_3: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_3 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #10 + add x1, x29, #48 +L_fe_pow22523_4: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_4 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + add x0, x29, #48 + add x1, x29, #16 + bl fe_sq + mov x21, #49 + add x1, x29, #48 +L_fe_pow22523_5: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_5 + add x2, x29, #16 + bl fe_mul + add x0, x29, #0x50 + bl fe_sq + mov x21, #0x63 + add x1, x29, #0x50 +L_fe_pow22523_6: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_6 + add x0, x29, #48 + add x2, x29, #48 + bl fe_mul + mov x21, #50 + add x1, x29, #48 +L_fe_pow22523_7: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_7 + add x0, x29, #16 + add x2, x29, #16 + bl fe_mul + mov x21, #2 + add x1, x29, #16 +L_fe_pow22523_8: + bl fe_sq + sub x21, x21, #1 + cmp x21, #0 + bne L_fe_pow22523_8 + ldr x0, [x29, #112] + ldr x2, [x29, #120] + bl fe_mul + ldr x21, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function +fe_ge_to_p2: + stp x29, x30, [sp, #-112]! 
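fe_pow22523 raises its input to 2^252 - 3 = (p - 5)/8; its squaring loops run 4, 9, 19, 10, 49, 0x63 (99), 50 and finally 2 times, the same chain as fe_invert apart from the tail. Because p ≡ 5 (mod 8), this is the exponent needed for square roots during point decompression: a candidate for sqrt(u/v) is u * v^3 * (u * v^7)^((p - 5)/8), with at most one further multiplication by sqrt(-1) depending on a final check of v * x^2 against u.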
+ add x29, sp, #0 + str x17, [x29, #72] + str x19, [x29, #80] + stp x20, x21, [x29, #88] + str x22, [x29, #104] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + ldr x1, [x29, #32] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x20, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x14, x15 + umulh x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[1] * B[3] + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x1, [x29, #40] + ldr x2, [x29, #48] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x20, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x12, x15 + umulh x21, x12, x15 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, 
x11, x17 + umulh x21, x11, x17 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x12, x16 + umulh x21, x12, x16 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x13, x15 + umulh x21, x13, x15 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x11, x19 + umulh x21, x11, x19 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x12, x17 + umulh x21, x12, x17 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x13, x16 + umulh x21, x13, x16 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x14, x15 + umulh x21, x14, x15 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[1] * B[3] + mul x20, x12, x19 + umulh x21, x12, x19 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x13, x17 + umulh x21, x13, x17 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x14, x16 + umulh x21, x14, x16 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x13, x19 + umulh x21, x13, x19 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x14, x17 + umulh x21, x14, x17 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x14, x19 + umulh x21, x14, x19 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] + # A[0] * B[0] + mul x3, x15, x11 + umulh x4, x15, x11 + # A[0] * B[1] + mul x20, x15, x12 + umulh x5, x15, x12 + adds x4, x4, x20 + adc x5, x5, xzr + # A[1] * B[0] + mul x20, x16, x11 + umulh x21, x16, x11 + adds x4, x4, x20 + adcs x5, x5, x21 + adc x6, xzr, xzr + # A[0] * B[2] + mul x20, x15, x13 + umulh x21, x15, x13 + adds x5, x5, x20 + adc x6, x6, x21 + # A[1] * B[1] + mul x20, x16, x12 + umulh x21, x16, x12 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, xzr, xzr + # A[2] * B[0] + mul x20, x17, x11 + umulh x21, x17, x11 + adds x5, x5, x20 + adcs x6, x6, x21 + adc x7, x7, xzr + # A[0] * B[3] + mul x20, x15, x14 + umulh x21, x15, x14 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, xzr, xzr + # A[1] * B[2] + mul x20, x16, x13 + umulh x21, x16, x13 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[2] * B[1] + mul x20, x17, x12 + umulh x21, x17, x12 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, x8, xzr + # A[3] * B[0] + mul x20, x19, x11 + umulh x21, x19, x11 + adds x6, x6, x20 + adcs x7, x7, x21 + adc x8, 
x8, xzr + # A[1] * B[3] + mul x20, x16, x14 + umulh x21, x16, x14 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, xzr, xzr + # A[2] * B[2] + mul x20, x17, x13 + umulh x21, x17, x13 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[3] * B[1] + mul x20, x19, x12 + umulh x21, x19, x12 + adds x7, x7, x20 + adcs x8, x8, x21 + adc x9, x9, xzr + # A[2] * B[3] + mul x20, x17, x14 + umulh x21, x17, x14 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, xzr, xzr + # A[3] * B[2] + mul x20, x19, x13 + umulh x21, x19, x13 + adds x8, x8, x20 + adcs x9, x9, x21 + adc x10, x10, xzr + # A[3] * B[3] + mul x20, x19, x14 + umulh x21, x19, x14 + adds x9, x9, x20 + adc x10, x10, x21 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x20, #19 + mul x21, x20, x7 + umulh x7, x20, x7 + adds x3, x3, x21 + mul x21, x20, x8 + umulh x8, x20, x8 + adcs x4, x4, x21 + mul x21, x20, x9 + umulh x9, x20, x9 + adcs x5, x5, x21 + mul x21, x20, x10 + umulh x22, x20, x10 + adcs x6, x6, x21 + adc x22, x22, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x22, x22, xzr + # Overflow + extr x22, x22, x6, #63 + mul x22, x22, x20 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x22, x20, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x22 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #72] + ldr x19, [x29, #80] + ldp x20, x21, [x29, #88] + ldr x22, [x29, #104] + ldp x29, x30, [sp], #0x70 + ret + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function +fe_ge_to_p3: + stp x29, x30, [sp, #-160]! 
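The three field multiplications in fe_ge_to_p2 above all follow the same generated pattern: a schoolbook 4x4 multiply of 64-bit limbs into an 8-limb product (the A[i] * B[j] mul/umulh pairs), then a reduction that uses 2^255 == 19 (mod p) for p = 2^255 - 19. A minimal C sketch of that pattern, assuming inputs stay just below 2^256 as these routines maintain; the helper names and the unsigned __int128 carries are illustrative only, not wolfSSL's C API:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Fold an 8-limb (512-bit) value l modulo p = 2^255 - 19 into 4 limbs,
 * mirroring the "# Reduce" blocks: 2^255 == 19 (mod p), so everything above
 * bit 255 is multiplied by 19 and added back.  Assumes l < 2^511, which
 * holds because the field elements here are kept just below 2^256. */
static void reduce_p25519(uint64_t r[4], uint64_t l[8])
{
    /* "Move top half into t4-t7 and remove top bit from t3" */
    uint64_t h[4];
    h[0] = (l[4] << 1) | (l[3] >> 63);
    h[1] = (l[5] << 1) | (l[4] >> 63);
    h[2] = (l[6] << 1) | (l[5] >> 63);
    h[3] = (l[7] << 1) | (l[6] >> 63);
    l[3] &= 0x7fffffffffffffffULL;

    /* "Multiply top half by 19" and add it into the low 255 bits */
    u128 c = 0;
    for (int i = 0; i < 4; i++) {
        c += (u128)l[i] + (u128)h[i] * 19;
        l[i] = (uint64_t)c;
        c >>= 64;
    }

    /* "Overflow": whatever spilled past bit 255 is tiny, fold it the same way */
    uint64_t top = ((uint64_t)c << 1) | (l[3] >> 63);
    l[3] &= 0x7fffffffffffffffULL;
    c = (u128)top * 19;
    for (int i = 0; i < 4; i++) {
        c += l[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }

    /* "Reduce if top bit set": one last masked fold; the result is congruent
     * to l mod p but, like the assembly's, not necessarily canonical */
    uint64_t m = 19 & (uint64_t)((int64_t)r[3] >> 63);
    r[3] &= 0x7fffffffffffffffULL;
    c = m;
    for (int i = 0; i < 4; i++) {
        c += r[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
}

/* One field multiplication as generated above: schoolbook 4x4 limb multiply
 * with explicit carries, then the reduction. */
static void fe_mul_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t l[8] = {0};
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 4; j++) {
            u128 t = (u128)a[i] * b[j] + l[i + j] + carry;
            l[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        l[i + 4] = carry;
    }
    reduce_p25519(r, l);
}

fe_ge_to_p2, for instance, is three such multiplications back to back, each one writing through one of the output pointers saved at the top of the frame.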
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + str x26, [x29, #152] + str x1, [x29, #16] + str x2, [x29, #24] + str x3, [x29, #32] + str x4, [x29, #40] + str x5, [x29, #48] + str x6, [x29, #56] + str x7, [x29, #64] + ldr x1, [x29, #40] + ldr x2, [x29, #64] + # Multiply + ldp x11, x12, [x1] + ldp x13, x14, [x1, #16] + ldp x15, x16, [x2] + ldp x17, x19, [x2, #16] + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x24, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x11, x17 + umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #32] + ldr x2, [x29, #48] + # Multiply + ldp x20, x21, [x2] + ldp x22, x23, [x2, #16] + # A[0] * B[0] + mul x3, x11, x20 + umulh x4, x11, x20 + # A[0] * B[1] + mul x24, x11, x21 + umulh x5, x11, x21 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x20 + umulh x25, x12, x20 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + 
mul x24, x11, x22 + umulh x25, x11, x22 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x21 + umulh x25, x12, x21 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x20 + umulh x25, x13, x20 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x23 + umulh x25, x11, x23 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x22 + umulh x25, x12, x22 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x21 + umulh x25, x13, x21 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x20 + umulh x25, x14, x20 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x23 + umulh x25, x12, x23 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x22 + umulh x25, x13, x22 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x21 + umulh x25, x14, x21 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x23 + umulh x25, x13, x23 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x22 + umulh x25, x14, x22 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x23 + umulh x25, x14, x23 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #16] + ldr x2, [x29, #56] + # Multiply + ldp x11, x12, [x2] + ldp x13, x14, [x2, #16] + # A[0] * B[0] + mul x3, x20, x11 + umulh x4, x20, x11 + # A[0] * B[1] + mul x24, x20, x12 + umulh x5, x20, x12 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x21, x11 + umulh x25, x21, x11 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x20, x13 + umulh x25, x20, x13 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x21, x12 + umulh x25, x21, x12 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x22, x11 + umulh x25, x22, x11 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x20, x14 + umulh x25, x20, x14 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x21, x13 + umulh x25, x21, x13 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x22, x12 + umulh x25, x22, x12 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x23, x11 + umulh x25, x23, x11 + adds x6, x6, x24 + adcs x7, x7, x25 + 
adc x8, x8, xzr + # A[1] * B[3] + mul x24, x21, x14 + umulh x25, x21, x14 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x22, x13 + umulh x25, x22, x13 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x23, x12 + umulh x25, x23, x12 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x22, x14 + umulh x25, x22, x14 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x23, x13 + umulh x25, x23, x13 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x23, x14 + umulh x25, x23, x14 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x0, [x29, #24] + # Multiply + # A[0] * B[0] + mul x3, x11, x15 + umulh x4, x11, x15 + # A[0] * B[1] + mul x24, x11, x16 + umulh x5, x11, x16 + adds x4, x4, x24 + adc x5, x5, xzr + # A[1] * B[0] + mul x24, x12, x15 + umulh x25, x12, x15 + adds x4, x4, x24 + adcs x5, x5, x25 + adc x6, xzr, xzr + # A[0] * B[2] + mul x24, x11, x17 + umulh x25, x11, x17 + adds x5, x5, x24 + adc x6, x6, x25 + # A[1] * B[1] + mul x24, x12, x16 + umulh x25, x12, x16 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, xzr, xzr + # A[2] * B[0] + mul x24, x13, x15 + umulh x25, x13, x15 + adds x5, x5, x24 + adcs x6, x6, x25 + adc x7, x7, xzr + # A[0] * B[3] + mul x24, x11, x19 + umulh x25, x11, x19 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, xzr, xzr + # A[1] * B[2] + mul x24, x12, x17 + umulh x25, x12, x17 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[2] * B[1] + mul x24, x13, x16 + umulh x25, x13, x16 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[3] * B[0] + mul x24, x14, x15 + umulh x25, x14, x15 + adds x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, xzr + # A[1] * B[3] + mul x24, x12, x19 + umulh x25, x12, x19 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, xzr, xzr + # A[2] * B[2] + mul x24, x13, x17 + umulh x25, x13, x17 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[3] * B[1] + mul x24, x14, x16 + umulh x25, x14, x16 + adds x7, x7, x24 + adcs x8, x8, x25 + adc x9, x9, xzr + # A[2] * B[3] + mul x24, x13, x19 + umulh x25, x13, x19 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, xzr, xzr + # A[3] * B[2] + mul x24, x14, x17 + umulh x25, x14, x17 + adds x8, x8, x24 + adcs x9, x9, x25 + adc x10, x10, xzr + # A[3] * B[3] + mul x24, x14, x19 + umulh x25, x14, x19 + adds x9, x9, x24 + adc x10, x10, x25 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, 
x8, x7, #63 + extr x7, x7, x6, #63 + and x6, x6, #0x7fffffffffffffff + # Multiply top half by 19 + mov x24, #19 + mul x25, x24, x7 + umulh x7, x24, x7 + adds x3, x3, x25 + mul x25, x24, x8 + umulh x8, x24, x8 + adcs x4, x4, x25 + mul x25, x24, x9 + umulh x9, x24, x9 + adcs x5, x5, x25 + mul x25, x24, x10 + umulh x26, x24, x10 + adcs x6, x6, x25 + adc x26, x26, xzr + # Add remaining product results in + adds x4, x4, x7 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x26, x26, xzr + # Overflow + extr x26, x26, x6, #63 + mul x26, x26, x24 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Reduce if top bit set + and x26, x24, x6, asr 63 + and x6, x6, #0x7fffffffffffffff + adds x3, x3, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # Store + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldr x26, [x29, #152] + ldp x29, x30, [sp], #0xa0 + ret + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function +fe_ge_dbl: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + ldr x1, [x29, #48] + # Square + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + # A[0] * A[1] + mul x5, x12, x13 + umulh x6, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x25 + adc x7, x7, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x25 + adc x8, x8, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x8, x8, x25 + adc x9, x9, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x25 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x12, x12 + umulh x27, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x5, x5, x27 + adcs x6, x6, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x7, x7, x27 + adcs x8, x8, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x9, x9, x27 + adcs x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, 
#0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x0, [x29, #32] + ldr x1, [x29, #56] + # Square + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * A[1] + mul x9, x21, x22 + umulh x10, x21, x22 + # A[0] * A[2] + mul x25, x21, x23 + umulh x11, x21, x23 + adds x10, x10, x25 + adc x11, x11, xzr + # A[0] * A[3] + mul x25, x21, x24 + umulh x16, x21, x24 + adds x11, x11, x25 + adc x16, x16, xzr + # A[1] * A[2] + mul x25, x22, x23 + umulh x26, x22, x23 + adds x11, x11, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * A[3] + mul x25, x22, x24 + umulh x26, x22, x24 + adds x16, x16, x25 + adc x17, x17, x26 + # A[2] * A[3] + mul x25, x23, x24 + umulh x19, x23, x24 + adds x17, x17, x25 + adc x19, x19, xzr + # Double + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x19, x19, x19 + adc x20, xzr, xzr + # A[0] * A[0] + mul x8, x21, x21 + umulh x27, x21, x21 + # A[1] * A[1] + mul x25, x22, x22 + umulh x26, x22, x22 + adds x9, x9, x27 + adcs x10, x10, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x23, x23 + umulh x26, x23, x23 + adds x11, x11, x27 + adcs x16, x16, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x24, x24 + umulh x26, x24, x24 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x8, x8, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x9, x9, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x10, x10, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x16 + adcs x10, x10, x17 + adcs x11, x11, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ldr x0, [x29, #24] + # Add + adds x12, x12, x21 + adcs x13, x13, x22 + adcs x14, x14, x23 + adc x15, x15, x24 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + ldr x0, [x29, #40] + # Square + # A[0] * A[1] + mul x17, x12, x13 + umulh x19, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x20, x12, x14 + adds x19, x19, x25 + adc x20, x20, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x21, x12, x15 + adds x20, x20, x25 + adc x21, x21, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x20, x20, x25 + adcs x21, x21, x26 + adc x22, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x21, x21, x25 + adc x22, x22, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x23, x14, x15 + adds x22, x22, x25 + adc x23, x23, xzr + # Double + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x24, xzr, xzr + # A[0] * A[0] + mul x16, 
x12, x12 + umulh x27, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x17, x17, x27 + adcs x19, x19, x25 + adc x27, x26, xzr + # A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x20, x20, x27 + adcs x21, x21, x25 + adc x27, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x22, x22, x27 + adcs x23, x23, x25 + adc x24, x24, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x24, x24, x23, #63 + extr x23, x23, x22, #63 + extr x22, x22, x21, #63 + extr x21, x21, x20, #63 + and x20, x20, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x21 + umulh x21, x25, x21 + adds x16, x16, x26 + mul x26, x25, x22 + umulh x22, x25, x22 + adcs x17, x17, x26 + mul x26, x25, x23 + umulh x23, x25, x23 + adcs x19, x19, x26 + mul x26, x25, x24 + umulh x27, x25, x24 + adcs x20, x20, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x17, x17, x21 + adcs x19, x19, x22 + adcs x20, x20, x23 + adc x27, x27, xzr + # Overflow + extr x27, x27, x20, #63 + mul x27, x27, x25 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Reduce if top bit set + and x27, x25, x20, asr 63 + and x20, x20, #0x7fffffffffffffff + adds x16, x16, x27 + adcs x17, x17, xzr + adcs x19, x19, xzr + adc x20, x20, xzr + # Store + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #24] + ldr x1, [x29, #32] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x21, x8, x4 + sbcs x22, x9, x5 + sbcs x23, x10, x6 + sbcs x24, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x21, x21, x25 + adcs x22, x22, x28 + adcs x23, x23, x28 + adc x24, x24, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x21, x22, [x1] + stp x23, x24, [x1, #16] + ldr x0, [x29, #16] + # Sub + subs x16, x16, x12 + sbcs x17, x17, x13 + sbcs x19, x19, x14 + sbcs x20, x20, x15 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #64] + # Square * 2 + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + # A[0] * A[1] + mul x5, x12, x13 + umulh x6, x12, x13 + # A[0] * A[2] + mul x25, x12, x14 + umulh x7, x12, x14 + adds x6, x6, x25 + adc x7, x7, xzr + # A[0] * A[3] + mul x25, x12, x15 + umulh x8, x12, x15 + adds x7, x7, x25 + adc x8, x8, xzr + # A[1] * A[2] + mul x25, x13, x14 + umulh x26, x13, x14 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * A[3] + mul x25, x13, x15 + umulh x26, x13, x15 + adds x8, x8, x25 + adc x9, x9, x26 + # A[2] * A[3] + mul x25, x14, x15 + umulh x10, x14, x15 + adds x9, x9, x25 + adc x10, x10, xzr + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, xzr, xzr + # A[0] * A[0] + mul x4, x12, x12 + umulh x28, x12, x12 + # A[1] * A[1] + mul x25, x13, x13 + umulh x26, x13, x13 + adds x5, x5, x28 + adcs x6, x6, x25 + adc x28, x26, xzr + # 
A[2] * A[2] + mul x25, x14, x14 + umulh x26, x14, x14 + adds x7, x7, x28 + adcs x8, x8, x25 + adc x28, x26, xzr + # A[3] * A[3] + mul x25, x15, x15 + umulh x26, x15, x15 + adds x9, x9, x28 + adcs x10, x10, x25 + adc x11, x11, x26 + # Double and Reduce + mov x25, #0x169 + # Move top half into t4-t7 and remove top bit from t3 + lsr x28, x11, #61 + extr x11, x11, x10, #62 + extr x10, x10, x9, #62 + extr x9, x9, x8, #62 + extr x8, x8, x7, #62 + extr x7, x7, x6, #63 + extr x6, x6, x5, #63 + extr x5, x5, x4, #63 + lsl x4, x4, #1 + and x7, x7, #0x7fffffffffffffff + # Two left, only one right + and x11, x11, #0x7fffffffffffffff + # Multiply top bits by 19*19 + mul x28, x28, x25 + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x4, x4, x28 + adcs x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #40] + # Sub + subs x4, x4, x21 + sbcs x5, x5, x22 + sbcs x6, x6, x23 + sbcs x7, x7, x24 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x4, x4, x25 + adcs x5, x5, x28 + adcs x6, x6, x28 + adc x7, x7, x26 + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function +fe_ge_madd: + stp x29, x30, [sp, #-176]! 
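fe_ge_dbl above swaps most of the multiplies for squarings: only the cross products A[i]*A[j] with i < j are formed, the running sum is doubled, and the diagonal squares A[i]*A[i] are added before the usual reduction. The "Square * 2" variant near its end also doubles the value while reducing, which is why its very top bits are folded with the constant 0x169 = 19*19 (per the "# Multiply top bits by 19*19" comment). A sketch of the plain squaring in the same illustrative C style, reusing the u128 typedef and reduce_p25519 helper from the sketch above:

/* Squaring as in the "# Square" blocks: cross products once, doubled,
 * then diagonal squares, then the same fold-by-19 reduction.
 * Assumes a is kept just below 2^256, as in these routines. */
static void fe_sq_sketch(uint64_t r[4], const uint64_t a[4])
{
    uint64_t l[8] = {0};

    /* off-diagonal partial products, each computed once */
    for (int i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < 4; j++) {
            u128 t = (u128)a[i] * a[j] + l[i + j] + carry;
            l[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        l[i + 4] = carry;
    }

    /* "Double": each cross product occurs twice in the square */
    uint64_t cy = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t v = l[i];
        l[i] = (v << 1) | cy;
        cy = v >> 63;
    }

    /* add the diagonal terms a[i]^2 into limbs 2i and 2i+1 */
    u128 c = 0;
    for (int i = 0; i < 4; i++) {
        u128 sq = (u128)a[i] * a[i];
        c += (u128)l[2 * i] + (uint64_t)sq;
        l[2 * i] = (uint64_t)c;
        c >>= 64;
        c += (u128)l[2 * i + 1] + (uint64_t)(sq >> 64);
        l[2 * i + 1] = (uint64_t)c;
        c >>= 64;
    }

    reduce_p25519(r, l);
}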
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #184] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, 
x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #192] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 
+ sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] + # A[0] * B[0] + mul x4, x16, x21 + umulh x5, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #64] + # Double + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and 
x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + ldr x1, [x29, #40] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, %function +fe_ge_msub: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #192] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, 
x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #184] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and 
remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #40] + ldr x1, [x29, #176] + ldr x3, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x3] + ldp x23, x24, [x3, #16] + # A[0] * B[0] + mul x4, x16, x21 + umulh x5, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x6, x16, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x10, x10, x25 + 
adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #64] + # Double + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + ldr x1, [x29, #40] + # Add + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adc x15, x11, x7 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x8, x4 + sbcs x17, x9, x5 + sbcs x19, x10, x6 + sbcs x20, x11, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function +fe_ge_add: + stp x29, x30, [sp, #-176]! 
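The "# Add" and "# Sub" blocks above (and in fe_ge_add below) stay constant time by correcting with a mask instead of a branch: after an add, bit 255 of the raw sum (asr x28, xN, #63) selects whether p = 2^255 - 19 is subtracted once; after a sub, the borrow flag (csetm x28, cc) selects whether p is added back. A C sketch of both corrections, assuming the inputs are already reduced to below roughly 2^255; illustrative only, reusing the u128 typedef from the first sketch:

/* Modular add: a + b, then subtract p exactly when bit 255 of the raw sum
 * is set ("Sub modulus (if overflow)").  The mask is all-ones or zero, so
 * there is no data-dependent branch. */
static void fe_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    u128 c = 0;
    for (int i = 0; i < 4; i++) {
        c += (u128)a[i] + b[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
    uint64_t mask = (uint64_t)((int64_t)r[3] >> 63);    /* asr x28, xN, #63 */
    uint64_t p[4] = { mask & 0xffffffffffffffedULL,     /* -19, masked */
                      mask, mask,
                      mask & 0x7fffffffffffffffULL };
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {                       /* subs/sbcs/sbc */
        u128 d = (u128)r[i] - p[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
}

/* Modular sub: a - b, then add p back exactly when the subtraction
 * borrowed ("Add modulus (if underflow)"). */
static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    uint64_t borrow = 0;
    for (int i = 0; i < 4; i++) {
        u128 d = (u128)a[i] - b[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    uint64_t mask = (uint64_t)0 - borrow;               /* csetm x28, cc */
    uint64_t p[4] = { mask & 0xffffffffffffffedULL,
                      mask, mask,
                      mask & 0x7fffffffffffffffULL };
    u128 c = 0;
    for (int i = 0; i < 4; i++) {                       /* adds/adcs/adc */
        c += (u128)r[i] + p[i];
        r[i] = (uint64_t)c;
        c >>= 64;
    }
}

The "# Double" blocks are the same addition pattern with both operands equal.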
+ add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #192] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, 
x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #200] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 
+ sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x25, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #48] + # Double + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, 
x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x8, x16, x21 + umulh x9, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 + adc x10, x10, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, x12, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, x15, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, x15, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Store + ldr x0, [x29, #32] + ldr x1, [x29, #40] + # Add + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, 
#0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function +fe_ge_sub: + stp x29, x30, [sp, #-176]! + add x29, sp, #0 + str x17, [x29, #88] + str x19, [x29, #96] + stp x20, x21, [x29, #104] + stp x22, x23, [x29, #120] + stp x24, x25, [x29, #136] + stp x26, x27, [x29, #152] + str x28, [x29, #168] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + str x3, [x29, #40] + str x4, [x29, #48] + str x5, [x29, #56] + str x6, [x29, #64] + str x7, [x29, #72] + ldr x2, [x29, #56] + ldr x3, [x29, #48] + # Add + ldp x12, x13, [x2] + ldp x14, x15, [x2, #16] + ldp x16, x17, [x3] + ldp x19, x20, [x3, #16] + adds x4, x12, x16 + adcs x5, x13, x17 + adcs x6, x14, x19 + adc x7, x15, x20 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + # Sub + subs x8, x12, x16 + sbcs x9, x13, x17 + sbcs x10, x14, x19 + sbcs x11, x15, x20 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x8, x8, x25 + adcs x9, x9, x28 + adcs x10, x10, x28 + adc x11, x11, x26 + ldr x0, [x29, #32] + ldr x2, [x29, #200] + # Multiply + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x12, x4, x21 + umulh x13, x4, x21 + # A[0] * B[1] + mul x25, x4, x22 + umulh x14, x4, x22 + adds x13, x13, x25 + adc x14, x14, xzr + # A[1] * B[0] + mul x25, x5, x21 + umulh x26, x5, x21 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[0] * B[2] + mul x25, x4, x23 + umulh x26, x4, x23 + adds x14, x14, x25 + adc x15, x15, x26 + # A[1] * B[1] + mul x25, x5, x22 + umulh x26, x5, x22 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x6, x21 + umulh x26, x6, x21 + adds x14, x14, x25 + adcs x15, x15, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x4, x24 + umulh x26, x4, x24 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x5, x23 + umulh x26, x5, x23 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x6, x22 + umulh x26, x6, x22 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x7, x21 + umulh x26, x7, x21 + adds x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x5, x24 + umulh x26, x5, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x6, x23 + umulh x26, x6, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x7, x22 + umulh x26, x7, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x6, x24 + umulh x26, x6, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x7, x23 + umulh x26, x7, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x7, x24 + umulh x26, x7, x24 + adds x19, x19, x25 + adc x20, x20, 
x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x15, #63 + and x15, x15, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x12, x12, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x13, x13, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x14, x14, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x15, x15, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x13, x13, x16 + adcs x14, x14, x17 + adcs x15, x15, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x15, #63 + mul x27, x27, x25 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Reduce if top bit set + and x27, x25, x15, asr 63 + and x15, x15, #0x7fffffffffffffff + adds x12, x12, x27 + adcs x13, x13, xzr + adcs x14, x14, xzr + adc x15, x15, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #192] + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + # A[0] * B[0] + mul x4, x8, x21 + umulh x5, x8, x21 + # A[0] * B[1] + mul x25, x8, x22 + umulh x6, x8, x22 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x9, x21 + umulh x26, x9, x21 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x8, x23 + umulh x26, x8, x23 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x9, x22 + umulh x26, x9, x22 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, xzr, xzr + # A[2] * B[0] + mul x25, x10, x21 + umulh x26, x10, x21 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x16, x16, xzr + # A[0] * B[3] + mul x25, x8, x24 + umulh x26, x8, x24 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, xzr, xzr + # A[1] * B[2] + mul x25, x9, x23 + umulh x26, x9, x23 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[2] * B[1] + mul x25, x10, x22 + umulh x26, x10, x22 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[3] * B[0] + mul x25, x11, x21 + umulh x26, x11, x21 + adds x7, x7, x25 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[3] + mul x25, x9, x24 + umulh x26, x9, x24 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, xzr, xzr + # A[2] * B[2] + mul x25, x10, x23 + umulh x26, x10, x23 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[3] * B[1] + mul x25, x11, x22 + umulh x26, x11, x22 + adds x16, x16, x25 + adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[3] + mul x25, x10, x24 + umulh x26, x10, x24 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, xzr, xzr + # A[3] * B[2] + mul x25, x11, x23 + umulh x26, x11, x23 + adds x17, x17, x25 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[3] + mul x25, x11, x24 + umulh x26, x11, x24 + adds x19, x19, x25 + adc x20, x20, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x20, x20, x19, #63 + extr x19, x19, x17, #63 + extr x17, x17, x16, #63 + extr x16, x16, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x16 + umulh x16, x25, x16 + adds x4, x4, x26 + mul x26, x25, x17 + umulh x17, x25, x17 + adcs x5, x5, x26 + mul x26, x25, x19 + umulh x19, x25, x19 + adcs x6, x6, x26 + mul x26, x25, x20 + umulh x27, x25, x20 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adc x27, x27, xzr + # Overflow + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, 
#0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #24] + ldr x1, [x29, #16] + # Add + adds x8, x12, x4 + adcs x9, x13, x5 + adcs x10, x14, x6 + adc x11, x15, x7 + mov x25, #-19 + asr x28, x11, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + # Sub + subs x16, x12, x4 + sbcs x17, x13, x5 + sbcs x19, x14, x6 + sbcs x20, x15, x7 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x0, [x29, #48] + ldr x1, [x29, #64] + ldr x2, [x29, #176] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + mul x4, x12, x16 + umulh x5, x12, x16 + # A[0] * B[1] + mul x25, x12, x17 + umulh x6, x12, x17 + adds x5, x5, x25 + adc x6, x6, xzr + # A[1] * B[0] + mul x25, x13, x16 + umulh x26, x13, x16 + adds x5, x5, x25 + adcs x6, x6, x26 + adc x7, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 + umulh x26, x12, x19 + adds x6, x6, x25 + adc x7, x7, x26 + # A[1] * B[1] + mul x25, x13, x17 + umulh x26, x13, x17 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, xzr, xzr + # A[2] * B[0] + mul x25, x14, x16 + umulh x26, x14, x16 + adds x6, x6, x25 + adcs x7, x7, x26 + adc x8, x8, xzr + # A[0] * B[3] + mul x25, x12, x20 + umulh x26, x12, x20 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, xzr, xzr + # A[1] * B[2] + mul x25, x13, x19 + umulh x26, x13, x19 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[2] * B[1] + mul x25, x14, x17 + umulh x26, x14, x17 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[3] * B[0] + mul x25, x15, x16 + umulh x26, x15, x16 + adds x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[3] + mul x25, x13, x20 + umulh x26, x13, x20 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, xzr, xzr + # A[2] * B[2] + mul x25, x14, x19 + umulh x26, x14, x19 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[3] * B[1] + mul x25, x15, x17 + umulh x26, x15, x17 + adds x8, x8, x25 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[3] + mul x25, x14, x20 + umulh x26, x14, x20 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[3] * B[2] + mul x25, x15, x19 + umulh x26, x15, x19 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[3] + mul x25, x15, x20 + umulh x26, x15, x20 + adds x10, x10, x25 + adc x11, x11, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x11, x11, x10, #63 + extr x10, x10, x9, #63 + extr x9, x9, x8, #63 + extr x8, x8, x7, #63 + and x7, x7, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x8 + umulh x8, x25, x8 + adds x4, x4, x26 + mul x26, x25, x9 + umulh x9, x25, x9 + adcs x5, x5, x26 + mul x26, x25, x10 + umulh x10, x25, x10 + adcs x6, x6, x26 + mul x26, x25, x11 + umulh x27, x25, x11 + adcs x7, x7, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x5, x5, x8 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x27, x27, xzr + # Overflow + extr x27, 
x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Reduce if top bit set + and x27, x25, x7, asr 63 + and x7, x7, #0x7fffffffffffffff + adds x4, x4, x27 + adcs x5, x5, xzr + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + ldr x0, [x29, #48] + # Double + adds x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, x7, x7 + mov x25, #-19 + asr x28, x7, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x4, x4, x25 + sbcs x5, x5, x28 + sbcs x6, x6, x28 + sbc x7, x7, x26 + ldr x0, [x29, #40] + ldr x1, [x29, #184] + ldr x2, [x29, #72] + # Multiply + ldp x16, x17, [x1] + ldp x19, x20, [x1, #16] + ldp x21, x22, [x2] + ldp x23, x24, [x2, #16] + # A[0] * B[0] + mul x8, x16, x21 + umulh x9, x16, x21 + # A[0] * B[1] + mul x25, x16, x22 + umulh x10, x16, x22 + adds x9, x9, x25 + adc x10, x10, xzr + # A[1] * B[0] + mul x25, x17, x21 + umulh x26, x17, x21 + adds x9, x9, x25 + adcs x10, x10, x26 + adc x11, xzr, xzr + # A[0] * B[2] + mul x25, x16, x23 + umulh x26, x16, x23 + adds x10, x10, x25 + adc x11, x11, x26 + # A[1] * B[1] + mul x25, x17, x22 + umulh x26, x17, x22 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, xzr, xzr + # A[2] * B[0] + mul x25, x19, x21 + umulh x26, x19, x21 + adds x10, x10, x25 + adcs x11, x11, x26 + adc x12, x12, xzr + # A[0] * B[3] + mul x25, x16, x24 + umulh x26, x16, x24 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, xzr, xzr + # A[1] * B[2] + mul x25, x17, x23 + umulh x26, x17, x23 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[2] * B[1] + mul x25, x19, x22 + umulh x26, x19, x22 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[3] * B[0] + mul x25, x20, x21 + umulh x26, x20, x21 + adds x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, xzr + # A[1] * B[3] + mul x25, x17, x24 + umulh x26, x17, x24 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, xzr, xzr + # A[2] * B[2] + mul x25, x19, x23 + umulh x26, x19, x23 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[3] * B[1] + mul x25, x20, x22 + umulh x26, x20, x22 + adds x12, x12, x25 + adcs x13, x13, x26 + adc x14, x14, xzr + # A[2] * B[3] + mul x25, x19, x24 + umulh x26, x19, x24 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, xzr, xzr + # A[3] * B[2] + mul x25, x20, x23 + umulh x26, x20, x23 + adds x13, x13, x25 + adcs x14, x14, x26 + adc x15, x15, xzr + # A[3] * B[3] + mul x25, x20, x24 + umulh x26, x20, x24 + adds x14, x14, x25 + adc x15, x15, x26 + # Reduce + # Move top half into t4-t7 and remove top bit from t3 + extr x15, x15, x14, #63 + extr x14, x14, x13, #63 + extr x13, x13, x12, #63 + extr x12, x12, x11, #63 + and x11, x11, #0x7fffffffffffffff + # Multiply top half by 19 + mov x25, #19 + mul x26, x25, x12 + umulh x12, x25, x12 + adds x8, x8, x26 + mul x26, x25, x13 + umulh x13, x25, x13 + adcs x9, x9, x26 + mul x26, x25, x14 + umulh x14, x25, x14 + adcs x10, x10, x26 + mul x26, x25, x15 + umulh x27, x25, x15 + adcs x11, x11, x26 + adc x27, x27, xzr + # Add remaining product results in + adds x9, x9, x12 + adcs x10, x10, x13 + adcs x11, x11, x14 + adc x27, x27, xzr + # Overflow + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + # Reduce if top bit set + and x27, x25, x11, asr 63 + and x11, x11, #0x7fffffffffffffff + adds x8, x8, x27 + adcs x9, x9, xzr + adcs x10, x10, xzr + 
adc x11, x11, xzr + # Store + ldr x0, [x29, #40] + ldr x1, [x29, #32] + # Add + adds x12, x4, x8 + adcs x13, x5, x9 + adcs x14, x6, x10 + adc x15, x7, x11 + mov x25, #-19 + asr x28, x15, #63 + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Sub modulus (if overflow) + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + # Sub + subs x16, x4, x8 + sbcs x17, x5, x9 + sbcs x19, x6, x10 + sbcs x20, x7, x11 + mov x25, #-19 + csetm x28, cc + # Mask the modulus + and x25, x28, x25 + and x26, x28, #0x7fffffffffffffff + # Add modulus (if underflow) + adds x16, x16, x25 + adcs x17, x17, x28 + adcs x19, x19, x28 + adc x20, x20, x26 + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + stp x16, x17, [x1] + stp x19, x20, [x1, #16] + ldr x17, [x29, #88] + ldr x19, [x29, #96] + ldp x20, x21, [x29, #104] + ldp x22, x23, [x29, #120] + ldp x24, x25, [x29, #136] + ldp x26, x27, [x29, #152] + ldr x28, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret + .size fe_ge_sub,.-fe_ge_sub +#endif /* __aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c new file mode 100644 index 0000000..d1ab4c8 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -0,0 +1,6725 @@ +/* armv8-curve25519 + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c + */ +#ifdef __aarch64__ +#include <stdint.h> +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#include <wolfssl/wolfcrypt/fe_operations.h> +#include <stdint.h> + +void fe_init() +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "\n\t" + "ldp x29, x30, [sp], #16\n\t" + : + : + : "memory" + ); +} + +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "ldp x2, x3, [%x[in]]\n\t" + "ldp x4, x5, [%x[in], #16]\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "x2", "x3", "x4", "x5", "x6" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x7, #19\n\t" + "ldp x2, x3, [%x[n]]\n\t" + "ldp x4, x5, [%x[n], #16]\n\t" + "adds x6, x2, x7\n\t" + "adcs x6, x3, xzr\n\t" + "adcs x6, x4, xzr\n\t" + "adc x6, x5, xzr\n\t" + "and x6, x7, x6, asr 63\n\t" + "adds x2, x2, x6\n\t" + "adcs x3, x3, xzr\n\t" + "adcs x4, x4, xzr\n\t" + "adc x5, x5, xzr\n\t" + "and x5, x5, #0x7fffffffffffffff\n\t" + "stp x2, x3, [%x[out]]\n\t" + "stp x4, x5, [%x[out], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [out] "+r" (out), [n] "+r" (n) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Set one */ + "mov x1, #1\n\t" + "stp x1, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [n] "+r" (n) + : + : "memory", "x1" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Set zero */ + "stp xzr, xzr, [%x[n]]\n\t" + "stp xzr, xzr, [%x[n], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [n] "+r" (n) + : + : "memory" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Copy */ + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" + "stp x2, x3, [%x[r]]\n\t" + "stp x4, x5, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x3", "x4", "x5" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Sub */ + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" + "subs x3, x3, x7\n\t" + "sbcs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "mov x12, #-19\n\t" + "csetm x11, cc\n\t" + /* Mask the modulus */ + "and x12, x11, x12\n\t" + "and x13, x11, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x3, x3, x12\n\t" + "adcs x4, x4, x11\n\t" + "adcs x5, x5, x11\n\t" + "adc x6, x6, x13\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" 
(a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Add */ + "ldp x3, x4, [%x[a]]\n\t" + "ldp x5, x6, [%x[a], #16]\n\t" + "ldp x7, x8, [%x[b]]\n\t" + "ldp x9, x10, [%x[b], #16]\n\t" + "adds x3, x3, x7\n\t" + "adcs x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "adc x6, x6, x10\n\t" + "mov x12, #-19\n\t" + "asr x11, x6, #63\n\t" + /* Mask the modulus */ + "and x12, x11, x12\n\t" + "and x13, x11, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x3, x3, x12\n\t" + "sbcs x4, x4, x11\n\t" + "sbcs x5, x5, x11\n\t" + "sbc x6, x6, x13\n\t" + "stp x3, x4, [%x[r]]\n\t" + "stp x5, x6, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "ldp x2, x3, [%x[a]]\n\t" + "ldp x4, x5, [%x[a], #16]\n\t" + "mov x6, #-19\n\t" + "mov x7, #-1\n\t" + "mov x8, #-1\n\t" + "mov x9, #0x7fffffffffffffff\n\t" + "subs x6, x6, x2\n\t" + "sbcs x7, x7, x3\n\t" + "sbcs x8, x8, x4\n\t" + "sbc x9, x9, x5\n\t" + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x6, #19\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" + "adds x5, x1, x6\n\t" + "adcs x5, x2, xzr\n\t" + "adcs x5, x3, xzr\n\t" + "adc x5, x4, xzr\n\t" + "and x5, x6, x5, asr 63\n\t" + "adds x1, x1, x5\n\t" + "adcs x2, x2, xzr\n\t" + "adcs x3, x3, xzr\n\t" + "adc x4, x4, xzr\n\t" + "and x4, x4, #0x7fffffffffffffff\n\t" + "orr %x[a], x1, x2\n\t" + "orr x3, x3, x4\n\t" + "orr %x[a], %x[a], x3\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a) + : + : "memory", "x1", "x2", "x3", "x4", "x5", "x6" + ); + return (uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "mov x6, #19\n\t" + "ldp x1, x2, [%x[a]]\n\t" + "ldp x3, x4, [%x[a], #16]\n\t" + "adds x5, x1, x6\n\t" + "adcs x5, x2, xzr\n\t" + "adcs x5, x3, xzr\n\t" + "adc x5, x4, xzr\n\t" + "and %x[a], x1, #1\n\t" + "eor %x[a], %x[a], x5, lsr 63\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [a] "+r" (a) + : + : "memory", "x1", "x2", "x3", "x4", "x5", "x6" + ); + return (uint32_t)(size_t)a; +} + +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "str %x[r], [x29, #16]\n\t" + "sxtb %x[b], %w[b]\n\t" + "sbfx x3, %x[b], #7, #1\n\t" + "eor %x[r], %x[b], x3\n\t" + "sub %x[r], %x[r], x3\n\t" + "mov x4, #1\n\t" + "mov x5, xzr\n\t" + "mov x6, xzr\n\t" + "mov x7, xzr\n\t" + "mov x8, #1\n\t" + "mov x9, xzr\n\t" + "mov x10, xzr\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + "mov x14, xzr\n\t" + "mov x15, xzr\n\t" + "cmp %x[r], #1\n\t" + "ldp x16, x17, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, 
x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #2\n\t" + "ldp x16, x17, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #3\n\t" + "ldp x16, x17, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], #224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #4\n\t" + "ldp x16, x17, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "add %x[base], %x[base], #0x180\n\t" + "cmp %x[r], #5\n\t" + "ldp x16, x17, [%x[base]]\n\t" + "ldp x19, x20, [%x[base], #16]\n\t" + "ldp x21, x22, [%x[base], #32]\n\t" + "ldp x23, x24, [%x[base], #48]\n\t" + "ldp x25, x26, [%x[base], #64]\n\t" + "ldp x27, x28, [%x[base], #80]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #6\n\t" + "ldp x16, x17, [%x[base], #96]\n\t" + "ldp x19, x20, [%x[base], #112]\n\t" + "ldp x21, x22, [%x[base], #128]\n\t" + "ldp x23, x24, [%x[base], #144]\n\t" + "ldp x25, x26, [%x[base], #160]\n\t" + "ldp x27, x28, [%x[base], #176]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #7\n\t" + "ldp x16, x17, [%x[base], #192]\n\t" + "ldp x19, x20, [%x[base], #208]\n\t" + "ldp x21, x22, [%x[base], 
#224]\n\t" + "ldp x23, x24, [%x[base], #240]\n\t" + "ldp x25, x26, [%x[base], #256]\n\t" + "ldp x27, x28, [%x[base], #272]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "cmp %x[r], #8\n\t" + "ldp x16, x17, [%x[base], #288]\n\t" + "ldp x19, x20, [%x[base], #304]\n\t" + "ldp x21, x22, [%x[base], #320]\n\t" + "ldp x23, x24, [%x[base], #336]\n\t" + "ldp x25, x26, [%x[base], #352]\n\t" + "ldp x27, x28, [%x[base], #368]\n\t" + "csel x4, x16, x4, eq\n\t" + "csel x5, x17, x5, eq\n\t" + "csel x6, x19, x6, eq\n\t" + "csel x7, x20, x7, eq\n\t" + "csel x8, x21, x8, eq\n\t" + "csel x9, x22, x9, eq\n\t" + "csel x10, x23, x10, eq\n\t" + "csel x11, x24, x11, eq\n\t" + "csel x12, x25, x12, eq\n\t" + "csel x13, x26, x13, eq\n\t" + "csel x14, x27, x14, eq\n\t" + "csel x15, x28, x15, eq\n\t" + "mov x16, #-19\n\t" + "mov x17, #-1\n\t" + "mov x19, #-1\n\t" + "mov x20, #0x7fffffffffffffff\n\t" + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x19, x19, x14\n\t" + "sbc x20, x20, x15\n\t" + "cmp %x[b], #0\n\t" + "mov x3, x4\n\t" + "csel x4, x8, x4, lt\n\t" + "csel x8, x3, x8, lt\n\t" + "mov x3, x5\n\t" + "csel x5, x9, x5, lt\n\t" + "csel x9, x3, x9, lt\n\t" + "mov x3, x6\n\t" + "csel x6, x10, x6, lt\n\t" + "csel x10, x3, x10, lt\n\t" + "mov x3, x7\n\t" + "csel x7, x11, x7, lt\n\t" + "csel x11, x3, x11, lt\n\t" + "csel x12, x16, x12, lt\n\t" + "csel x13, x17, x13, lt\n\t" + "csel x14, x19, x14, lt\n\t" + "csel x15, x20, x15, lt\n\t" + "ldr %x[r], [x29, #16]\n\t" + "stp x4, x5, [%x[r]]\n\t" + "stp x6, x7, [%x[r], #16]\n\t" + "stp x8, x9, [%x[r], #32]\n\t" + "stp x10, x11, [%x[r], #48]\n\t" + "stp x12, x13, [%x[r], #64]\n\t" + "stp x14, x15, [%x[r], #80]\n\t" + "ldp x29, x30, [sp], #32\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Multiply */ + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" + "ldp x19, x20, [%x[b]]\n\t" + "ldp x21, x22, [%x[b], #16]\n\t" + /* A[0] * B[0] */ + "mul x6, x14, x19\n\t" + "umulh x7, x14, x19\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" + "umulh x8, x14, x20\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x19\n\t" + "umulh x4, x15, x19\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "umulh x4, x14, x21\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "umulh x4, x15, x20\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x19\n\t" + "umulh x4, x16, x19\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "umulh x4, x14, x22\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" + "umulh x4, x15, x21\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, 
x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x20\n\t" + "umulh x4, x16, x20\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "umulh x4, x17, x19\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x22\n\t" + "umulh x4, x15, x22\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "umulh x4, x16, x21\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x20\n\t" + "umulh x4, x17, x20\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x22\n\t" + "umulh x4, x16, x22\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x21\n\t" + "umulh x4, x17, x21\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "umulh x4, x17, x22\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x10\n\t" + "umulh x10, x3, x10\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x11\n\t" + "umulh x11, x3, x11\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x12\n\t" + "umulh x12, x3, x12\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x13\n\t" + "umulh x5, x3, x13\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adcs x9, x9, x12\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x9, asr 63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + /* Square */ + "ldp x13, x14, [%x[a]]\n\t" + "ldp x15, x16, [%x[a], #16]\n\t" + /* A[0] * A[1] */ + "mul x6, x13, x14\n\t" + "umulh x7, x13, x14\n\t" + /* A[0] * A[2] */ + "mul x2, x13, x15\n\t" + "umulh x8, x13, x15\n\t" + "adds x7, x7, x2\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * A[3] */ + "mul x2, x13, x16\n\t" + "umulh x9, x13, x16\n\t" + "adds x8, x8, x2\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * A[2] */ + "mul x2, x14, x15\n\t" + "umulh x3, x14, x15\n\t" + "adds x8, x8, x2\n\t" + "adcs x9, x9, x3\n\t" + "adc x10, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x2, x14, x16\n\t" + "umulh x3, x14, x16\n\t" + "adds x9, x9, x2\n\t" + "adc x10, x10, x3\n\t" + /* A[2] * A[3] */ + "mul x2, x15, x16\n\t" + "umulh x11, x15, x16\n\t" + "adds x10, x10, x2\n\t" + 
"adc x11, x11, xzr\n\t" + /* Double */ + "adds x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adc x12, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x5, x13, x13\n\t" + "umulh x4, x13, x13\n\t" + /* A[1] * A[1] */ + "mul x2, x14, x14\n\t" + "umulh x3, x14, x14\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, x2\n\t" + "adc x4, x3, xzr\n\t" + /* A[2] * A[2] */ + "mul x2, x15, x15\n\t" + "umulh x3, x15, x15\n\t" + "adds x8, x8, x4\n\t" + "adcs x9, x9, x2\n\t" + "adc x4, x3, xzr\n\t" + /* A[3] * A[3] */ + "mul x2, x16, x16\n\t" + "umulh x3, x16, x16\n\t" + "adds x10, x10, x4\n\t" + "adcs x11, x11, x2\n\t" + "adc x12, x12, x3\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x12, x12, x11, #63\n\t" + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x2, #19\n\t" + "mul x3, x2, x9\n\t" + "umulh x9, x2, x9\n\t" + "adds x5, x5, x3\n\t" + "mul x3, x2, x10\n\t" + "umulh x10, x2, x10\n\t" + "adcs x6, x6, x3\n\t" + "mul x3, x2, x11\n\t" + "umulh x11, x2, x11\n\t" + "adcs x7, x7, x3\n\t" + "mul x3, x2, x12\n\t" + "umulh x4, x2, x12\n\t" + "adcs x8, x8, x3\n\t" + "adc x4, x4, xzr\n\t" + /* Add remaining product results in */ + "adds x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adc x4, x4, xzr\n\t" + /* Overflow */ + "extr x4, x4, x8, #63\n\t" + "mul x4, x4, x2\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + "adds x5, x5, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" + /* Reduce if top bit set */ + "and x4, x2, x8, asr 63\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + "adds x5, x5, x4\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, xzr\n\t" + "adc x8, x8, xzr\n\t" + /* Store */ + "stp x5, x6, [%x[r]]\n\t" + "stp x7, x8, [%x[r], #16]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-160]!\n\t" + "add x29, sp, #0\n\t" + /* Invert */ + "str %x[r], [x29, #144]\n\t" + "str %x[a], [x29, #152]\n\t" + "add x0, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "ldr x1, [x29, #152]\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "bl fe_sq\n\t" + "mov x20, #4\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_invert1_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert1_%=\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "mov x20, #9\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_invert2_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert2_%=\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "bl fe_sq\n\t" + "mov x20, #19\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_fe_invert3_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne 
L_fe_invert3_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "mov x20, #10\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_invert4_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert4_%=\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "mov x20, #49\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_invert5_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert5_%=\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "bl fe_sq\n\t" + "mov x20, #0x63\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_fe_invert6_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert6_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "mov x20, #50\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert7_%=\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x20, #5\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + "bl fe_sq\n\t" + "sub x20, x20, #1\n\t" + "cmp x20, #0\n\t" + "bne L_fe_invert8_%=\n\t" + "ldr x0, [x29, #144]\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "ldp x29, x30, [sp], #0xa0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x20" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-192]!\n\t" + "add x29, sp, #0\n\t" + "mov x23, xzr\n\t" + "str %x[r], [x29, #176]\n\t" + "str %x[a], [x29, #184]\n\t" + /* Copy */ + "ldp x6, x7, [%x[a]]\n\t" + "ldp x8, x9, [%x[a], #16]\n\t" + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + /* Set one */ + "mov %x[a], #1\n\t" + "stp %x[a], xzr, [%x[r]]\n\t" + "stp xzr, xzr, [%x[r], #16]\n\t" + /* Set zero */ + "stp xzr, xzr, [x29, #16]\n\t" + "stp xzr, xzr, [x29, #32]\n\t" + /* Set one */ + "mov %x[a], #1\n\t" + "stp %x[a], xzr, [x29, #48]\n\t" + "stp xzr, xzr, [x29, #64]\n\t" + "mov x25, #62\n\t" + "mov x24, #24\n\t" + "\n" + "L_curve25519_words_%=: \n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "ldr %x[a], [%x[n], x24]\n\t" + "lsr %x[a], %x[a], x25\n\t" + "and %x[a], %x[a], #1\n\t" + "eor x23, x23, %x[a]\n\t" + /* Conditional Swap */ + "cmp x23, #1\n\t" + "ldp x10, x11, [%x[r]]\n\t" + "ldp x12, x13, [%x[r], #16]\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "csel x14, x10, x6, eq\n\t" + "csel x10, x6, x10, eq\n\t" + "csel x15, x11, x7, eq\n\t" + "csel x11, x7, x11, eq\n\t" + "csel x16, x12, x8, eq\n\t" + "csel x12, x8, x12, eq\n\t" + "csel x17, x13, x9, eq\n\t" + "csel x13, x9, x13, eq\n\t" + /* Conditional Swap */ + "cmp x23, #1\n\t" + "ldp x19, x20, [x29, #16]\n\t" + "ldp x21, x22, [x29, #32]\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" + "csel x5, x19, x6, eq\n\t" + "csel x19, x6, x19, eq\n\t" + "csel x26, x20, x7, eq\n\t" + "csel x20, x7, x20, eq\n\t" + "csel x27, x21, x8, eq\n\t" + "csel x21, x8, x21, eq\n\t" + "csel x28, x22, x9, eq\n\t" + "csel x22, x9, x22, eq\n\t" + "mov x23, %x[a]\n\t" + /* Add */ + "adds x6, x10, x19\n\t" + "adcs x7, x11, x20\n\t" + "adcs x8, x12, x21\n\t" + "adc x9, x13, x22\n\t" + "mov x3, #-19\n\t" + "asr %x[a], x9, #63\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x6, x6, x3\n\t" + "sbcs x7, x7, %x[a]\n\t" + 
"sbcs x8, x8, %x[a]\n\t" + "sbc x9, x9, x4\n\t" + /* Sub */ + "subs x19, x10, x19\n\t" + "sbcs x20, x11, x20\n\t" + "sbcs x21, x12, x21\n\t" + "sbcs x22, x13, x22\n\t" + "mov x3, #-19\n\t" + "csetm %x[a], cc\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x19, x19, x3\n\t" + "adcs x20, x20, %x[a]\n\t" + "adcs x21, x21, %x[a]\n\t" + "adc x22, x22, x4\n\t" + "stp x19, x20, [x29, #144]\n\t" + "stp x21, x22, [x29, #160]\n\t" + /* Add */ + "adds x10, x14, x5\n\t" + "adcs x11, x15, x26\n\t" + "adcs x12, x16, x27\n\t" + "adc x13, x17, x28\n\t" + "mov x3, #-19\n\t" + "asr %x[a], x13, #63\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" + "sbc x13, x13, x4\n\t" + /* Sub */ + "subs x14, x14, x5\n\t" + "sbcs x15, x15, x26\n\t" + "sbcs x16, x16, x27\n\t" + "sbcs x17, x17, x28\n\t" + "mov x3, #-19\n\t" + "csetm %x[a], cc\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x14, x14, x3\n\t" + "adcs x15, x15, %x[a]\n\t" + "adcs x16, x16, %x[a]\n\t" + "adc x17, x17, x4\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "mul x19, x14, x6\n\t" + "umulh x20, x14, x6\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x7\n\t" + "umulh x21, x14, x7\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x6\n\t" + "umulh x4, x15, x6\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "umulh x4, x14, x8\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "umulh x4, x15, x7\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc %x[a], xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x6\n\t" + "umulh x4, x16, x6\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "umulh x4, x14, x9\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" + "umulh x4, x15, x8\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x7\n\t" + "umulh x4, x16, x7\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "umulh x4, x17, x6\n\t" + "adds x22, x22, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x9\n\t" + "umulh x4, x15, x9\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "umulh x4, x16, x8\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x7\n\t" + "umulh x4, x17, x7\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x9\n\t" + "umulh x4, x16, x9\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "umulh x4, x17, x8\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "umulh x4, x17, x9\n\t" + "adds x27, 
x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x22, #63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x19, x19, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x20, x20, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x21, x21, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x22, x22, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x20, x20, %x[a]\n\t" + "adcs x21, x21, x26\n\t" + "adcs x22, x22, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" + "adcs x20, x20, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x22, asr 63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" + "adcs x20, x20, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Store */ + "stp x19, x20, [x29, #112]\n\t" + "stp x21, x22, [x29, #128]\n\t" + /* Multiply */ + "ldp %x[a], x26, [x29, #144]\n\t" + "ldp x27, x28, [x29, #160]\n\t" + /* A[0] * B[0] */ + "mul x19, x10, %x[a]\n\t" + "umulh x20, x10, %x[a]\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x26\n\t" + "umulh x21, x10, x26\n\t" + "adds x20, x20, x3\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x11, %x[a]\n\t" + "umulh x4, x11, %x[a]\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x27\n\t" + "umulh x4, x10, x27\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x26\n\t" + "umulh x4, x11, x26\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x12, %x[a]\n\t" + "umulh x4, x12, %x[a]\n\t" + "adds x21, x21, x3\n\t" + "adcs x22, x22, x4\n\t" + "adc x14, x14, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x28\n\t" + "umulh x4, x10, x28\n\t" + "adds x22, x22, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x27\n\t" + "umulh x4, x11, x27\n\t" + "adds x22, x22, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x26\n\t" + "umulh x4, x12, x26\n\t" + "adds x22, x22, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, %x[a]\n\t" + "umulh x4, x13, %x[a]\n\t" + "adds x22, x22, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x11, x28\n\t" + "umulh x4, x11, x28\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x27\n\t" + "umulh x4, x12, x27\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, x16, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x26\n\t" + "umulh x4, x13, x26\n\t" + "adds x14, x14, x3\n\t" + "adcs x15, x15, x4\n\t" + "adc x16, x16, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x28\n\t" + "umulh x4, x12, x28\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x27\n\t" + "umulh x4, x13, x27\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, 
xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x28\n\t" + "umulh x4, x13, x28\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x22, #63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x14\n\t" + "umulh x14, x3, x14\n\t" + "adds x19, x19, x4\n\t" + "mul x4, x3, x15\n\t" + "umulh x15, x3, x15\n\t" + "adcs x20, x20, x4\n\t" + "mul x4, x3, x16\n\t" + "umulh x16, x3, x16\n\t" + "adcs x21, x21, x4\n\t" + "mul x4, x3, x17\n\t" + "umulh x5, x3, x17\n\t" + "adcs x22, x22, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adcs x22, x22, x16\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" + "adcs x20, x20, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x22, asr 63\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "adds x19, x19, x5\n\t" + "adcs x20, x20, xzr\n\t" + "adcs x21, x21, xzr\n\t" + "adc x22, x22, xzr\n\t" + /* Store */ + /* Square */ + /* A[0] * A[1] */ + "mul x11, %x[a], x26\n\t" + "umulh x12, %x[a], x26\n\t" + /* A[0] * A[2] */ + "mul x3, %x[a], x27\n\t" + "umulh x13, %x[a], x27\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, %x[a], x28\n\t" + "umulh x14, %x[a], x28\n\t" + "adds x13, x13, x3\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x26, x27\n\t" + "umulh x4, x26, x27\n\t" + "adds x13, x13, x3\n\t" + "adcs x14, x14, x4\n\t" + "adc x15, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "umulh x4, x26, x28\n\t" + "adds x14, x14, x3\n\t" + "adc x15, x15, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x27, x28\n\t" + "umulh x16, x27, x28\n\t" + "adds x15, x15, x3\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x10, %x[a], %x[a]\n\t" + "umulh x5, %x[a], %x[a]\n\t" + /* A[1] * A[1] */ + "mul x3, x26, x26\n\t" + "umulh x4, x26, x26\n\t" + "adds x11, x11, x5\n\t" + "adcs x12, x12, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x27, x27\n\t" + "umulh x4, x27, x27\n\t" + "adds x13, x13, x5\n\t" + "adcs x14, x14, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x28, x28\n\t" + "umulh x4, x28, x28\n\t" + "adds x15, x15, x5\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x14\n\t" + "umulh x14, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "mul x4, x3, x15\n\t" + "umulh x15, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "mul x4, x3, x16\n\t" + "umulh x16, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "mul x4, x3, x17\n\t" + "umulh x5, x3, x17\n\t" + "adcs x13, x13, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x11, x11, x14\n\t" + "adcs x12, x12, x15\n\t" + 
"adcs x13, x13, x16\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x13, asr 63\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Store */ + /* Square */ + /* A[0] * A[1] */ + "mul x15, x6, x7\n\t" + "umulh x16, x6, x7\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "umulh x17, x6, x8\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x6, x9\n\t" + "umulh %x[a], x6, x9\n\t" + "adds x17, x17, x3\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "umulh x4, x7, x8\n\t" + "adds x17, x17, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "umulh x4, x7, x9\n\t" + "adds %x[a], %x[a], x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "umulh x27, x8, x9\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x14, x6, x6\n\t" + "umulh x5, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "umulh x4, x7, x7\n\t" + "adds x15, x15, x5\n\t" + "adcs x16, x16, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "umulh x4, x8, x8\n\t" + "adds x17, x17, x5\n\t" + "adcs %x[a], %x[a], x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "umulh x4, x9, x9\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x17, #63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x14, x14, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x17, x17, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x15, x15, %x[a]\n\t" + "adcs x16, x16, x26\n\t" + "adcs x17, x17, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x17, asr 63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Store */ + /* Multiply */ + /* A[0] * B[0] */ + "mul x6, x14, x10\n\t" + "umulh x7, x14, x10\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" + "umulh x8, x14, x11\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x10\n\t" + "umulh x4, x15, x10\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, 
x12\n\t" + "umulh x4, x14, x12\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc %x[a], xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x10\n\t" + "umulh x4, x16, x10\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "umulh x4, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" + "umulh x4, x15, x12\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x11\n\t" + "umulh x4, x16, x11\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "umulh x4, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x13\n\t" + "umulh x4, x15, x13\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "umulh x4, x16, x12\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x11\n\t" + "umulh x4, x17, x11\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x13\n\t" + "umulh x4, x16, x13\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x12\n\t" + "umulh x4, x17, x12\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "umulh x4, x17, x13\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, %x[a]\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x9, asr 63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [%x[r]]\n\t" + "stp x8, x9, [%x[r], #16]\n\t" + /* Sub */ + "subs x14, x14, x10\n\t" + "sbcs x15, x15, x11\n\t" + "sbcs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "mov x3, #-19\n\t" + "csetm %x[a], cc\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x14, x14, x3\n\t" + "adcs x15, x15, %x[a]\n\t" + "adcs x16, x16, %x[a]\n\t" + "adc x17, x17, x4\n\t" + /* Multiply by 121666 
*/ + "mov x5, #0xdb42\n\t" + "movk x5, #1, lsl 16\n\t" + "mul x6, x14, x5\n\t" + "umulh x7, x14, x5\n\t" + "mul x3, x15, x5\n\t" + "umulh x4, x15, x5\n\t" + "adds x7, x7, x3\n\t" + "adc x8, xzr, x4\n\t" + "mul x3, x16, x5\n\t" + "umulh x4, x16, x5\n\t" + "adds x8, x8, x3\n\t" + "adc x9, xzr, x4\n\t" + "mul x3, x17, x5\n\t" + "umulh x4, x17, x5\n\t" + "adds x9, x9, x3\n\t" + "adc x4, xzr, x4\n\t" + "mov x5, #19\n\t" + "extr x4, x4, x9, #63\n\t" + "mul x4, x4, x5\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x4\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Add */ + "adds x10, x10, x6\n\t" + "adcs x11, x11, x7\n\t" + "adcs x12, x12, x8\n\t" + "adc x13, x13, x9\n\t" + "mov x3, #-19\n\t" + "asr %x[a], x13, #63\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" + "sbc x13, x13, x4\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "mul x6, x14, x10\n\t" + "umulh x7, x14, x10\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" + "umulh x8, x14, x11\n\t" + "adds x7, x7, x3\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x10\n\t" + "umulh x4, x15, x10\n\t" + "adds x7, x7, x3\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x12\n\t" + "umulh x4, x14, x12\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc %x[a], xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x10\n\t" + "umulh x4, x16, x10\n\t" + "adds x8, x8, x3\n\t" + "adcs x9, x9, x4\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "umulh x4, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" + "umulh x4, x15, x12\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x11\n\t" + "umulh x4, x16, x11\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "umulh x4, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x13\n\t" + "umulh x4, x15, x13\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "umulh x4, x16, x12\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x11\n\t" + "umulh x4, x17, x11\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x13\n\t" + "umulh x4, x16, x13\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x12\n\t" + "umulh x4, x17, x12\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "umulh x4, x17, x13\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ 
+ "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, %x[a]\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x9, asr 63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #16]\n\t" + "stp x8, x9, [x29, #32]\n\t" + /* Add */ + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "adds x10, x6, x19\n\t" + "adcs x11, x7, x20\n\t" + "adcs x12, x8, x21\n\t" + "adc x13, x9, x22\n\t" + "mov x3, #-19\n\t" + "asr %x[a], x13, #63\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x10, x10, x3\n\t" + "sbcs x11, x11, %x[a]\n\t" + "sbcs x12, x12, %x[a]\n\t" + "sbc x13, x13, x4\n\t" + /* Sub */ + "subs x19, x6, x19\n\t" + "sbcs x20, x7, x20\n\t" + "sbcs x21, x8, x21\n\t" + "sbcs x22, x9, x22\n\t" + "mov x3, #-19\n\t" + "csetm %x[a], cc\n\t" + /* Mask the modulus */ + "and x3, %x[a], x3\n\t" + "and x4, %x[a], #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x19, x19, x3\n\t" + "adcs x20, x20, %x[a]\n\t" + "adcs x21, x21, %x[a]\n\t" + "adc x22, x22, x4\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x7, x10, x11\n\t" + "umulh x8, x10, x11\n\t" + /* A[0] * A[2] */ + "mul x3, x10, x12\n\t" + "umulh x9, x10, x12\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x10, x13\n\t" + "umulh %x[a], x10, x13\n\t" + "adds x9, x9, x3\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x11, x12\n\t" + "umulh x4, x11, x12\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x11, x13\n\t" + "umulh x4, x11, x13\n\t" + "adds %x[a], %x[a], x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x12, x13\n\t" + "umulh x27, x12, x13\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x10, x10\n\t" + "umulh x5, x10, x10\n\t" + /* A[1] * A[1] */ + "mul x3, x11, x11\n\t" + "umulh x4, x11, x11\n\t" + "adds x7, x7, x5\n\t" + "adcs x8, x8, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x12, x12\n\t" + "umulh x4, x12, x12\n\t" + "adds x9, x9, x5\n\t" + "adcs %x[a], %x[a], x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x13, x13\n\t" + "umulh x4, x13, x13\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply 
top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, %x[a]\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x9, asr 63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x7, x19, x20\n\t" + "umulh x8, x19, x20\n\t" + /* A[0] * A[2] */ + "mul x3, x19, x21\n\t" + "umulh x9, x19, x21\n\t" + "adds x8, x8, x3\n\t" + "adc x9, x9, xzr\n\t" + /* A[0] * A[3] */ + "mul x3, x19, x22\n\t" + "umulh %x[a], x19, x22\n\t" + "adds x9, x9, x3\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x20, x21\n\t" + "umulh x4, x20, x21\n\t" + "adds x9, x9, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x3, x20, x22\n\t" + "umulh x4, x20, x22\n\t" + "adds %x[a], %x[a], x3\n\t" + "adc x26, x26, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x21, x22\n\t" + "umulh x27, x21, x22\n\t" + "adds x26, x26, x3\n\t" + "adc x27, x27, xzr\n\t" + /* Double */ + "adds x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs %x[a], %x[a], %x[a]\n\t" + "adcs x26, x26, x26\n\t" + "adcs x27, x27, x27\n\t" + "adc x28, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x6, x19, x19\n\t" + "umulh x5, x19, x19\n\t" + /* A[1] * A[1] */ + "mul x3, x20, x20\n\t" + "umulh x4, x20, x20\n\t" + "adds x7, x7, x5\n\t" + "adcs x8, x8, x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[2] * A[2] */ + "mul x3, x21, x21\n\t" + "umulh x4, x21, x21\n\t" + "adds x9, x9, x5\n\t" + "adcs %x[a], %x[a], x3\n\t" + "adc x5, x4, xzr\n\t" + /* A[3] * A[3] */ + "mul x3, x22, x22\n\t" + "umulh x4, x22, x22\n\t" + "adds x26, x26, x5\n\t" + "adcs x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x9, #63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x6, x6, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x9, x9, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x7, x7, %x[a]\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + "adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x9, asr 63\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adds x6, x6, x5\n\t" + 
"adcs x7, x7, xzr\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + /* Store */ + "ldr %x[a], [x29, #184]\n\t" + /* Multiply */ + "ldp x14, x15, [%x[a]]\n\t" + "ldp x16, x17, [%x[a], #16]\n\t" + /* A[0] * B[0] */ + "mul x10, x14, x6\n\t" + "umulh x11, x14, x6\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x7\n\t" + "umulh x12, x14, x7\n\t" + "adds x11, x11, x3\n\t" + "adc x12, x12, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x15, x6\n\t" + "umulh x4, x15, x6\n\t" + "adds x11, x11, x3\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "umulh x4, x14, x8\n\t" + "adds x12, x12, x3\n\t" + "adc x13, x13, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "umulh x4, x15, x7\n\t" + "adds x12, x12, x3\n\t" + "adcs x13, x13, x4\n\t" + "adc %x[a], xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x16, x6\n\t" + "umulh x4, x16, x6\n\t" + "adds x12, x12, x3\n\t" + "adcs x13, x13, x4\n\t" + "adc %x[a], %x[a], xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "umulh x4, x14, x9\n\t" + "adds x13, x13, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" + "umulh x4, x15, x8\n\t" + "adds x13, x13, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x16, x7\n\t" + "umulh x4, x16, x7\n\t" + "adds x13, x13, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "umulh x4, x17, x6\n\t" + "adds x13, x13, x3\n\t" + "adcs %x[a], %x[a], x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x15, x9\n\t" + "umulh x4, x15, x9\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "umulh x4, x16, x8\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x17, x7\n\t" + "umulh x4, x17, x7\n\t" + "adds %x[a], %x[a], x3\n\t" + "adcs x26, x26, x4\n\t" + "adc x27, x27, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x16, x9\n\t" + "umulh x4, x16, x9\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "umulh x4, x17, x8\n\t" + "adds x26, x26, x3\n\t" + "adcs x27, x27, x4\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "umulh x4, x17, x9\n\t" + "adds x27, x27, x3\n\t" + "adc x28, x28, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x28, x28, x27, #63\n\t" + "extr x27, x27, x26, #63\n\t" + "extr x26, x26, %x[a], #63\n\t" + "extr %x[a], %x[a], x13, #63\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, %x[a]\n\t" + "umulh %x[a], x3, %x[a]\n\t" + "adds x10, x10, x4\n\t" + "mul x4, x3, x26\n\t" + "umulh x26, x3, x26\n\t" + "adcs x11, x11, x4\n\t" + "mul x4, x3, x27\n\t" + "umulh x27, x3, x27\n\t" + "adcs x12, x12, x4\n\t" + "mul x4, x3, x28\n\t" + "umulh x5, x3, x28\n\t" + "adcs x13, x13, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x11, x11, %x[a]\n\t" + "adcs x12, x12, x26\n\t" + "adcs x13, x13, x27\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x13, asr 63\n\t" + "and x13, x13, 
#0x7fffffffffffffff\n\t" + "adds x10, x10, x5\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" + /* Store */ + "stp x10, x11, [x29, #48]\n\t" + "stp x12, x13, [x29, #64]\n\t" + "sub x25, x25, #1\n\t" + "cmp x25, #0\n\t" + "bge L_curve25519_bits_%=\n\t" + "mov x25, #63\n\t" + "sub x24, x24, #8\n\t" + "cmp x24, #0\n\t" + "bge L_curve25519_words_%=\n\t" + /* Invert */ + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "add x1, x29, #0x50\n\t" + "bl fe_sq\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "bl fe_sq\n\t" + "add x0, x29, #0x50\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #0x70\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "bl fe_sq\n\t" + "mov x24, #4\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" + "bl fe_sq\n\t" + "mov x24, #9\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x90\n\t" + "bl fe_sq\n\t" + "mov x24, #19\n\t" + "add x1, x29, #0x90\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" + "bl fe_mul\n\t" + "mov x24, #10\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x70\n\t" + "add x1, x29, #0x50\n\t" + "bl fe_sq\n\t" + "mov x24, #49\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x90\n\t" + "bl fe_sq\n\t" + "mov x24, #0x63\n\t" + "add x1, x29, #0x90\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add x0, x29, #0x70\n\t" + "add x2, x29, #0x70\n\t" + "bl fe_mul\n\t" + "mov x24, #50\n\t" + "add x1, x29, #0x70\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add x0, x29, #0x50\n\t" + "add x2, x29, #0x50\n\t" + "bl fe_mul\n\t" + "mov x24, #5\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "bl fe_sq\n\t" + "sub x24, x24, #1\n\t" + "cmp x24, #0\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "ldr %x[r], [x29, #176]\n\t" + /* Multiply */ + "ldp x6, x7, [%x[r]]\n\t" + "ldp x8, x9, [%x[r], #16]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp x12, x13, [x29, #32]\n\t" + /* A[0] * B[0] */ + "mul x14, x6, x10\n\t" + "umulh x15, x6, x10\n\t" + /* A[0] * B[1] */ + "mul x3, x6, x11\n\t" + "umulh x16, x6, x11\n\t" + "adds x15, x15, x3\n\t" + "adc x16, x16, xzr\n\t" + /* A[1] * B[0] */ + "mul x3, x7, 
x10\n\t" + "umulh x4, x7, x10\n\t" + "adds x15, x15, x3\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x6, x12\n\t" + "umulh x4, x6, x12\n\t" + "adds x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* A[1] * B[1] */ + "mul x3, x7, x11\n\t" + "umulh x4, x7, x11\n\t" + "adds x16, x16, x3\n\t" + "adcs x17, x17, x4\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x3, x8, x10\n\t" + "umulh x4, x8, x10\n\t" + "adds x16, x16, x3\n\t" + "adcs x17, x17, x4\n\t" + "adc x19, x19, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x6, x13\n\t" + "umulh x4, x6, x13\n\t" + "adds x17, x17, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x7, x12\n\t" + "umulh x4, x7, x12\n\t" + "adds x17, x17, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[2] * B[1] */ + "mul x3, x8, x11\n\t" + "umulh x4, x8, x11\n\t" + "adds x17, x17, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x9, x10\n\t" + "umulh x4, x9, x10\n\t" + "adds x17, x17, x3\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[3] */ + "mul x3, x7, x13\n\t" + "umulh x4, x7, x13\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x8, x12\n\t" + "umulh x4, x8, x12\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[3] * B[1] */ + "mul x3, x9, x11\n\t" + "umulh x4, x9, x11\n\t" + "adds x19, x19, x3\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[3] */ + "mul x3, x8, x13\n\t" + "umulh x4, x8, x13\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x3, x9, x12\n\t" + "umulh x4, x9, x12\n\t" + "adds x20, x20, x3\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[3] */ + "mul x3, x9, x13\n\t" + "umulh x4, x9, x13\n\t" + "adds x21, x21, x3\n\t" + "adc x22, x22, x4\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x22, x22, x21, #63\n\t" + "extr x21, x21, x20, #63\n\t" + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x3, #19\n\t" + "mul x4, x3, x19\n\t" + "umulh x19, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "mul x4, x3, x20\n\t" + "umulh x20, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "mul x4, x3, x21\n\t" + "umulh x21, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "mul x4, x3, x22\n\t" + "umulh x5, x3, x22\n\t" + "adcs x17, x17, x4\n\t" + "adc x5, x5, xzr\n\t" + /* Add remaining product results in */ + "adds x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adcs x17, x17, x21\n\t" + "adc x5, x5, xzr\n\t" + /* Overflow */ + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Reduce if top bit set */ + "and x5, x3, x17, asr 63\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "adds x14, x14, x5\n\t" + "adcs x15, x15, xzr\n\t" + "adcs x16, x16, xzr\n\t" + "adc x17, x17, xzr\n\t" + /* Store */ + "stp x14, x15, [%x[r]]\n\t" + "stp x16, x17, [%x[r], #16]\n\t" + "mov x0, xzr\n\t" + "ldp x29, x30, [sp], #0xc0\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + 
return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-128]!\n\t" + "add x29, sp, #0\n\t" + /* pow22523 */ + "str %x[r], [x29, #112]\n\t" + "str %x[a], [x29, #120]\n\t" + "add x0, x29, #16\n\t" + "bl fe_sq\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "add x1, x29, #48\n\t" + "bl fe_sq\n\t" + "ldr x1, [x29, #120]\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #16\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "bl fe_sq\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #4\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_1_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_1_%=\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #9\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_2_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_2_%=\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "bl fe_sq\n\t" + "mov x21, #19\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_3_%=\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x21, #10\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_4_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_4_%=\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #16\n\t" + "bl fe_sq\n\t" + "mov x21, #49\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_5_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_5_%=\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "add x0, x29, #0x50\n\t" + "bl fe_sq\n\t" + "mov x21, #0x63\n\t" + "add x1, x29, #0x50\n\t" + "\n" + "L_fe_pow22523_6_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_6_%=\n\t" + "add x0, x29, #48\n\t" + "add x2, x29, #48\n\t" + "bl fe_mul\n\t" + "mov x21, #50\n\t" + "add x1, x29, #48\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_7_%=\n\t" + "add x0, x29, #16\n\t" + "add x2, x29, #16\n\t" + "bl fe_mul\n\t" + "mov x21, #2\n\t" + "add x1, x29, #16\n\t" + "\n" + "L_fe_pow22523_8_%=: \n\t" + "bl fe_sq\n\t" + "sub x21, x21, #1\n\t" + "cmp x21, #0\n\t" + "bne L_fe_pow22523_8_%=\n\t" + "ldr x0, [x29, #112]\n\t" + "ldr x2, [x29, #120]\n\t" + "bl fe_mul\n\t" + "ldp x29, x30, [sp], #0x80\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x21" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-64]!\n\t" + "add x29, sp, #0\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[px], [x29, #32]\n\t" + "str %x[py], [x29, #40]\n\t" + "str %x[pz], [x29, #48]\n\t" + "str %x[pt], [x29, #56]\n\t" + "ldr x1, [x29, #32]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp x15, x16, [x2]\n\t" + "ldp x17, x19, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" + /* 
A[0] * B[1] */ + "mul x20, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x20\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x20, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x20, x11, x17\n\t" + "umulh x21, x11, x17\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" + /* A[1] * B[1] */ + "mul x20, x12, x16\n\t" + "umulh x21, x12, x16\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x20, x13, x15\n\t" + "umulh x21, x13, x15\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x20, x11, x19\n\t" + "umulh x21, x11, x19\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x20, x12, x17\n\t" + "umulh x21, x12, x17\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x20, x13, x16\n\t" + "umulh x21, x13, x16\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x20, x14, x15\n\t" + "umulh x21, x14, x15\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x20, x12, x19\n\t" + "umulh x21, x12, x19\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x20, x13, x17\n\t" + "umulh x21, x13, x17\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x20, x14, x16\n\t" + "umulh x21, x14, x16\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x20, x13, x19\n\t" + "umulh x21, x13, x19\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x20, x14, x17\n\t" + "umulh x21, x14, x17\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x20, x14, x19\n\t" + "umulh x21, x14, x19\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x22, x22, xzr\n\t" + /* Overflow */ + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x22, x20, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #48]\n\t" + /* Multiply */ + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp 
x15, x16, [x2]\n\t" + "ldp x17, x19, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" + /* A[0] * B[1] */ + "mul x20, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x20\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x20, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x20, x11, x17\n\t" + "umulh x21, x11, x17\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" + /* A[1] * B[1] */ + "mul x20, x12, x16\n\t" + "umulh x21, x12, x16\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x20, x13, x15\n\t" + "umulh x21, x13, x15\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x20, x11, x19\n\t" + "umulh x21, x11, x19\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x20, x12, x17\n\t" + "umulh x21, x12, x17\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x20, x13, x16\n\t" + "umulh x21, x13, x16\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x20, x14, x15\n\t" + "umulh x21, x14, x15\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x20, x12, x19\n\t" + "umulh x21, x12, x19\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x20, x13, x17\n\t" + "umulh x21, x13, x17\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x20, x14, x16\n\t" + "umulh x21, x14, x16\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x20, x13, x19\n\t" + "umulh x21, x13, x19\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x20, x14, x17\n\t" + "umulh x21, x14, x17\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x20, x14, x19\n\t" + "umulh x21, x14, x19\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x22, x22, xzr\n\t" + /* Overflow */ + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x22, x20, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr 
x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x12, [x2]\n\t" + "ldp x13, x14, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x15, x11\n\t" + "umulh x4, x15, x11\n\t" + /* A[0] * B[1] */ + "mul x20, x15, x12\n\t" + "umulh x5, x15, x12\n\t" + "adds x4, x4, x20\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x20, x16, x11\n\t" + "umulh x21, x16, x11\n\t" + "adds x4, x4, x20\n\t" + "adcs x5, x5, x21\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x20, x15, x13\n\t" + "umulh x21, x15, x13\n\t" + "adds x5, x5, x20\n\t" + "adc x6, x6, x21\n\t" + /* A[1] * B[1] */ + "mul x20, x16, x12\n\t" + "umulh x21, x16, x12\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x20, x17, x11\n\t" + "umulh x21, x17, x11\n\t" + "adds x5, x5, x20\n\t" + "adcs x6, x6, x21\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x20, x15, x14\n\t" + "umulh x21, x15, x14\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x20, x16, x13\n\t" + "umulh x21, x16, x13\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x20, x17, x12\n\t" + "umulh x21, x17, x12\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x20, x19, x11\n\t" + "umulh x21, x19, x11\n\t" + "adds x6, x6, x20\n\t" + "adcs x7, x7, x21\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x20, x16, x14\n\t" + "umulh x21, x16, x14\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x20, x17, x13\n\t" + "umulh x21, x17, x13\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x20, x19, x12\n\t" + "umulh x21, x19, x12\n\t" + "adds x7, x7, x20\n\t" + "adcs x8, x8, x21\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x20, x17, x14\n\t" + "umulh x21, x17, x14\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x20, x19, x13\n\t" + "umulh x21, x19, x13\n\t" + "adds x8, x8, x20\n\t" + "adcs x9, x9, x21\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x20, x19, x14\n\t" + "umulh x21, x19, x14\n\t" + "adds x9, x9, x20\n\t" + "adc x10, x10, x21\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x20, #19\n\t" + "mul x21, x20, x7\n\t" + "umulh x7, x20, x7\n\t" + "adds x3, x3, x21\n\t" + "mul x21, x20, x8\n\t" + "umulh x8, x20, x8\n\t" + "adcs x4, x4, x21\n\t" + "mul x21, x20, x9\n\t" + "umulh x9, x20, x9\n\t" + "adcs x5, x5, x21\n\t" + "mul x21, x20, x10\n\t" + "umulh x22, x20, x10\n\t" + "adcs x6, x6, x21\n\t" + "adc x22, x22, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x22, x22, xzr\n\t" + /* Overflow */ + "extr x22, x22, x6, #63\n\t" + "mul x22, x22, x20\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x22, x20, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x22\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, 
#16]\n\t" + "ldp x29, x30, [sp], #0x40\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + ); +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[ry], [x29, #16]\n\t" + "str %x[rz], [x29, #24]\n\t" + "str %x[rt], [x29, #32]\n\t" + "str %x[px], [x29, #40]\n\t" + "str %x[py], [x29, #48]\n\t" + "str %x[pz], [x29, #56]\n\t" + "str %x[pt], [x29, #64]\n\t" + "ldr x1, [x29, #40]\n\t" + "ldr x2, [x29, #64]\n\t" + /* Multiply */ + "ldp x11, x12, [x1]\n\t" + "ldp x13, x14, [x1, #16]\n\t" + "ldp x15, x16, [x2]\n\t" + "ldp x17, x19, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" + /* A[0] * B[1] */ + "mul x24, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x24\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x12, x15\n\t" + "umulh x25, x12, x15\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x11, x17\n\t" + "umulh x25, x11, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x12, x16\n\t" + "umulh x25, x12, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x11, x19\n\t" + "umulh x25, x11, x19\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x12, x17\n\t" + "umulh x25, x12, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x14, x15\n\t" + "umulh x25, x14, x15\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x12, x19\n\t" + "umulh x25, x12, x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, 
x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x26, x24, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #48]\n\t" + /* Multiply */ + "ldp x20, x21, [x2]\n\t" + "ldp x22, x23, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x11, x20\n\t" + "umulh x4, x11, x20\n\t" + /* A[0] * B[1] */ + "mul x24, x11, x21\n\t" + "umulh x5, x11, x21\n\t" + "adds x4, x4, x24\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x12, x20\n\t" + "umulh x25, x12, x20\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x11, x22\n\t" + "umulh x25, x11, x22\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x12, x21\n\t" + "umulh x25, x12, x21\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x13, x20\n\t" + "umulh x25, x13, x20\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x11, x23\n\t" + "umulh x25, x11, x23\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x12, x22\n\t" + "umulh x25, x12, x22\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x13, x21\n\t" + "umulh x25, x13, x21\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x14, x20\n\t" + "umulh x25, x14, x20\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x12, x23\n\t" + "umulh x25, x12, x23\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x13, x22\n\t" + "umulh x25, x13, x22\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x14, x21\n\t" + "umulh x25, x14, x21\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x13, x23\n\t" + "umulh x25, x13, x23\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x14, x22\n\t" + "umulh x25, x14, x22\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x14, x23\n\t" + "umulh x25, x14, x23\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, 
x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x26, x24, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + "ldr x2, [x29, #56]\n\t" + /* Multiply */ + "ldp x11, x12, [x2]\n\t" + "ldp x13, x14, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x3, x20, x11\n\t" + "umulh x4, x20, x11\n\t" + /* A[0] * B[1] */ + "mul x24, x20, x12\n\t" + "umulh x5, x20, x12\n\t" + "adds x4, x4, x24\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x21, x11\n\t" + "umulh x25, x21, x11\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x20, x13\n\t" + "umulh x25, x20, x13\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x21, x12\n\t" + "umulh x25, x21, x12\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x22, x11\n\t" + "umulh x25, x22, x11\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x20, x14\n\t" + "umulh x25, x20, x14\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x21, x13\n\t" + "umulh x25, x21, x13\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x22, x12\n\t" + "umulh x25, x22, x12\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x23, x11\n\t" + "umulh x25, x23, x11\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x21, x14\n\t" + "umulh x25, x21, x14\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x22, x13\n\t" + "umulh x25, x22, x13\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x23, x12\n\t" + "umulh x25, x23, x12\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x22, x14\n\t" + "umulh x25, x22, x14\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x23, x13\n\t" + "umulh x25, x23, x13\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x23, x14\n\t" + "umulh x25, x23, x14\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" 
+ "mul x25, x24, x8\n\t" + "umulh x8, x24, x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x26, x24, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "mul x3, x11, x15\n\t" + "umulh x4, x11, x15\n\t" + /* A[0] * B[1] */ + "mul x24, x11, x16\n\t" + "umulh x5, x11, x16\n\t" + "adds x4, x4, x24\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[0] */ + "mul x24, x12, x15\n\t" + "umulh x25, x12, x15\n\t" + "adds x4, x4, x24\n\t" + "adcs x5, x5, x25\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x24, x11, x17\n\t" + "umulh x25, x11, x17\n\t" + "adds x5, x5, x24\n\t" + "adc x6, x6, x25\n\t" + /* A[1] * B[1] */ + "mul x24, x12, x16\n\t" + "umulh x25, x12, x16\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x24, x13, x15\n\t" + "umulh x25, x13, x15\n\t" + "adds x5, x5, x24\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x24, x11, x19\n\t" + "umulh x25, x11, x19\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x24, x12, x17\n\t" + "umulh x25, x12, x17\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[2] * B[1] */ + "mul x24, x13, x16\n\t" + "umulh x25, x13, x16\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[3] * B[0] */ + "mul x24, x14, x15\n\t" + "umulh x25, x14, x15\n\t" + "adds x6, x6, x24\n\t" + "adcs x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * B[3] */ + "mul x24, x12, x19\n\t" + "umulh x25, x12, x19\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x24, x13, x17\n\t" + "umulh x25, x13, x17\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[1] */ + "mul x24, x14, x16\n\t" + "umulh x25, x14, x16\n\t" + "adds x7, x7, x24\n\t" + "adcs x8, x8, x25\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[3] */ + "mul x24, x13, x19\n\t" + "umulh x25, x13, x19\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x24, x14, x17\n\t" + "umulh x25, x14, x17\n\t" + "adds x8, x8, x24\n\t" + "adcs x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[3] */ + "mul x24, x14, x19\n\t" + "umulh x25, x14, x19\n\t" + "adds x9, x9, x24\n\t" + "adc x10, x10, x25\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "extr x7, x7, x6, #63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x24, #19\n\t" + "mul x25, x24, x7\n\t" + "umulh x7, x24, x7\n\t" + "adds x3, x3, x25\n\t" + "mul x25, x24, x8\n\t" + "umulh x8, x24, 
x8\n\t" + "adcs x4, x4, x25\n\t" + "mul x25, x24, x9\n\t" + "umulh x9, x24, x9\n\t" + "adcs x5, x5, x25\n\t" + "mul x25, x24, x10\n\t" + "umulh x26, x24, x10\n\t" + "adcs x6, x6, x25\n\t" + "adc x26, x26, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x7\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x26, x26, xzr\n\t" + /* Overflow */ + "extr x26, x26, x6, #63\n\t" + "mul x26, x26, x24\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Reduce if top bit set */ + "and x26, x24, x6, asr 63\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adds x3, x3, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* Store */ + "stp x3, x4, [x0]\n\t" + "stp x5, x6, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" + ); +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "ldr x1, [x29, #48]\n\t" + /* Square */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x12, x13\n\t" + "umulh x6, x12, x13\n\t" + /* A[0] * A[2] */ + "mul x25, x12, x14\n\t" + "umulh x7, x12, x14\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x12, x15\n\t" + "umulh x8, x12, x15\n\t" + "adds x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" + "adds x8, x8, x25\n\t" + "adc x9, x9, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x14, x15\n\t" + "umulh x10, x14, x15\n\t" + "adds x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x12, x12\n\t" + "umulh x27, x12, x12\n\t" + /* A[1] * A[1] */ + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" + "adds x5, x5, x27\n\t" + "adcs x6, x6, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" + "adds x7, x7, x27\n\t" + "adcs x8, x8, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" + "adds x9, x9, x27\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, 
x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #56]\n\t" + /* Square */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x9, x21, x22\n\t" + "umulh x10, x21, x22\n\t" + /* A[0] * A[2] */ + "mul x25, x21, x23\n\t" + "umulh x11, x21, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x21, x24\n\t" + "umulh x16, x21, x24\n\t" + "adds x11, x11, x25\n\t" + "adc x16, x16, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x22, x23\n\t" + "umulh x26, x22, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x22, x24\n\t" + "umulh x26, x22, x24\n\t" + "adds x16, x16, x25\n\t" + "adc x17, x17, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x23, x24\n\t" + "umulh x19, x23, x24\n\t" + "adds x17, x17, x25\n\t" + "adc x19, x19, xzr\n\t" + /* Double */ + "adds x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adcs x11, x11, x11\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x19, x19, x19\n\t" + "adc x20, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x8, x21, x21\n\t" + "umulh x27, x21, x21\n\t" + /* A[1] * A[1] */ + "mul x25, x22, x22\n\t" + "umulh x26, x22, x22\n\t" + "adds x9, x9, x27\n\t" + "adcs x10, x10, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x23, x23\n\t" + "umulh x26, x23, x23\n\t" + "adds x11, x11, x27\n\t" + "adcs x16, x16, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x24, x24\n\t" + "umulh x26, x24, x24\n\t" + "adds x17, x17, x27\n\t" + "adcs x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adcs x11, x11, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x11, asr 63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + 
"adc x11, x11, xzr\n\t" + /* Store */ + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + /* Add */ + "adds x12, x12, x21\n\t" + "adcs x13, x13, x22\n\t" + "adcs x14, x14, x23\n\t" + "adc x15, x15, x24\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + "ldr x0, [x29, #40]\n\t" + /* Square */ + /* A[0] * A[1] */ + "mul x17, x12, x13\n\t" + "umulh x19, x12, x13\n\t" + /* A[0] * A[2] */ + "mul x25, x12, x14\n\t" + "umulh x20, x12, x14\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x12, x15\n\t" + "umulh x21, x12, x15\n\t" + "adds x20, x20, x25\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x20, x20, x25\n\t" + "adcs x21, x21, x26\n\t" + "adc x22, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" + "adds x21, x21, x25\n\t" + "adc x22, x22, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x14, x15\n\t" + "umulh x23, x14, x15\n\t" + "adds x22, x22, x25\n\t" + "adc x23, x23, xzr\n\t" + /* Double */ + "adds x17, x17, x17\n\t" + "adcs x19, x19, x19\n\t" + "adcs x20, x20, x20\n\t" + "adcs x21, x21, x21\n\t" + "adcs x22, x22, x22\n\t" + "adcs x23, x23, x23\n\t" + "adc x24, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x16, x12, x12\n\t" + "umulh x27, x12, x12\n\t" + /* A[1] * A[1] */ + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" + "adds x17, x17, x27\n\t" + "adcs x19, x19, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" + "adds x20, x20, x27\n\t" + "adcs x21, x21, x25\n\t" + "adc x27, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" + "adds x22, x22, x27\n\t" + "adcs x23, x23, x25\n\t" + "adc x24, x24, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x24, x24, x23, #63\n\t" + "extr x23, x23, x22, #63\n\t" + "extr x22, x22, x21, #63\n\t" + "extr x21, x21, x20, #63\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x21\n\t" + "umulh x21, x25, x21\n\t" + "adds x16, x16, x26\n\t" + "mul x26, x25, x22\n\t" + "umulh x22, x25, x22\n\t" + "adcs x17, x17, x26\n\t" + "mul x26, x25, x23\n\t" + "umulh x23, x25, x23\n\t" + "adcs x19, x19, x26\n\t" + "mul x26, x25, x24\n\t" + "umulh x27, x25, x24\n\t" + "adcs x20, x20, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x17, x17, x21\n\t" + "adcs x19, x19, x22\n\t" + "adcs x20, x20, x23\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x20, #63\n\t" + "mul x27, x27, x25\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adds x16, x16, x27\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x20, asr 63\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adds x16, x16, x27\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Store */ + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #32]\n\t" + /* Add */ + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, 
#63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x21, x8, x4\n\t" + "sbcs x22, x9, x5\n\t" + "sbcs x23, x10, x6\n\t" + "sbcs x24, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x21, x21, x25\n\t" + "adcs x22, x22, x28\n\t" + "adcs x23, x23, x28\n\t" + "adc x24, x24, x26\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x21, x22, [x1]\n\t" + "stp x23, x24, [x1, #16]\n\t" + "ldr x0, [x29, #16]\n\t" + /* Sub */ + "subs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x19, x19, x14\n\t" + "sbcs x20, x20, x15\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Square * 2 */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + /* A[0] * A[1] */ + "mul x5, x12, x13\n\t" + "umulh x6, x12, x13\n\t" + /* A[0] * A[2] */ + "mul x25, x12, x14\n\t" + "umulh x7, x12, x14\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * A[3] */ + "mul x25, x12, x15\n\t" + "umulh x8, x12, x15\n\t" + "adds x7, x7, x25\n\t" + "adc x8, x8, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x13, x14\n\t" + "umulh x26, x13, x14\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * A[3] */ + "mul x25, x13, x15\n\t" + "umulh x26, x13, x15\n\t" + "adds x8, x8, x25\n\t" + "adc x9, x9, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x14, x15\n\t" + "umulh x10, x14, x15\n\t" + "adds x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * A[0] */ + "mul x4, x12, x12\n\t" + "umulh x28, x12, x12\n\t" + /* A[1] * A[1] */ + "mul x25, x13, x13\n\t" + "umulh x26, x13, x13\n\t" + "adds x5, x5, x28\n\t" + "adcs x6, x6, x25\n\t" + "adc x28, x26, xzr\n\t" + /* A[2] * A[2] */ + "mul x25, x14, x14\n\t" + "umulh x26, x14, x14\n\t" + "adds x7, x7, x28\n\t" + "adcs x8, x8, x25\n\t" + "adc x28, x26, xzr\n\t" + /* A[3] * A[3] */ + "mul x25, x15, x15\n\t" + "umulh x26, x15, x15\n\t" + "adds x9, x9, x28\n\t" + "adcs x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Double and Reduce */ + "mov x25, #0x169\n\t" + /* Move top half into t4-t7 and remove top bit from t3 */ + "lsr x28, x11, #61\n\t" + "extr x11, x11, x10, #62\n\t" + "extr x10, x10, x9, #62\n\t" + "extr x9, x9, x8, #62\n\t" + "extr x8, x8, x7, #62\n\t" + "extr x7, x7, x6, #63\n\t" + "extr x6, x6, x5, #63\n\t" + "extr x5, x5, x4, #63\n\t" + "lsl x4, x4, #1\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Two left, only one right */ + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top bits by 19*19 */ + "mul x28, x28, x25\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul 
x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x4, x4, x28\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #40]\n\t" + /* Sub */ + "subs x4, x4, x21\n\t" + "sbcs x5, x5, x22\n\t" + "sbcs x6, x6, x23\n\t" + "sbcs x7, x7, x24\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x4, x4, x25\n\t" + "adcs x5, x5, x28\n\t" + "adcs x6, x6, x28\n\t" + "adc x7, x7, x26\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : + : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x19, x20, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + /* Sub */ + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #168]\n\t" + /* Multiply */ + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds 
x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x15, asr 63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #176]\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, 
xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" 
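+ /* x26 is the masked top word of p (0x7fffffffffffffff), so when the mask is set this subtracts exactly 2^255 - 19 */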
+ "sbc x11, x11, x26\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #160]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x3]\n\t" + "ldp x23, x24, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x16, x21\n\t" + "umulh x5, x16, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x22\n\t" + "umulh x6, x16, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, 
x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Double */ + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + "ldr x1, [x29, #40]\n\t" + /* Add */ + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x16, x8, x4\n\t" + "sbcs x17, x9, x5\n\t" + "sbcs x19, x10, x6\n\t" + "sbcs x20, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x19, x20, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + /* Sub */ + "subs x8, x12, x16\n\t" + "sbcs x9, x13, 
x17\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #176]\n\t" + /* Multiply */ + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + 
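/* fold the bits at and above position 255 back in as a multiple of 19, since 2^255 = 19 mod (2^255 - 19) */ +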
"extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x15, asr 63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #168]\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs 
x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #160]\n\t" + "ldr x3, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x3]\n\t" + "ldp x23, x24, [x3, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x16, x21\n\t" + "umulh x5, x16, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x22\n\t" + "umulh x6, x16, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x9, x9, x25\n\t" + 
"adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #64]\n\t" + /* Double */ + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + "ldr x1, [x29, #40]\n\t" + /* Add */ + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adc x15, x11, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x16, x8, x4\n\t" + "sbcs x17, x9, x5\n\t" + "sbcs x19, x10, x6\n\t" + "sbcs x20, x11, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, 
#-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x19, x20, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + /* Sub */ + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #176]\n\t" + /* Multiply */ + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x22\n\t" + "umulh x26, x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x7, 
x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x15, asr 63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #184]\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, 
x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldr x0, [x29, #48]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #160]\n\t" + /* Multiply */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "ldp x16, x17, [x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x12, x16\n\t" + "umulh x5, x12, x16\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" + "umulh x6, x12, x17\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x13, x16\n\t" + "umulh x26, x13, x16\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x12, x19\n\t" + "umulh x26, x12, x19\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" + "umulh x26, x13, x17\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x14, x16\n\t" + "umulh x26, x14, x16\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x12, x20\n\t" + "umulh x26, x12, x20\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" + "umulh x26, x13, x19\n\t" + "adds x7, x7, 
x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x14, x17\n\t" + "umulh x26, x14, x17\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" + "umulh x26, x15, x16\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x13, x20\n\t" + "umulh x26, x13, x20\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "umulh x26, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x15, x17\n\t" + "umulh x26, x15, x17\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x14, x20\n\t" + "umulh x26, x14, x20\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "umulh x26, x15, x19\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "umulh x26, x15, x20\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #48]\n\t" + /* Double */ + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #168]\n\t" + "ldr x2, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x22\n\t" + "umulh x10, x16, x22\n\t" + "adds x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* A[1] * B[1] */ + 
"mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" + "adc x12, x12, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, x14, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, x14, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x12\n\t" + "umulh x12, x25, x12\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x13\n\t" + "umulh x13, x25, x13\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x14\n\t" + "umulh x14, x25, x14\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x15\n\t" + "umulh x27, x25, x15\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adcs x11, x11, x14\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x11, asr 63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Store */ + "ldr x0, [x29, #32]\n\t" + "ldr x1, [x29, #40]\n\t" + /* Add */ + "adds x12, x4, x8\n\t" + "adcs x13, x5, x9\n\t" + "adcs x14, x6, x10\n\t" + "adc x15, x7, x11\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x16, x4, x8\n\t" + "sbcs x17, x5, x9\n\t" + "sbcs 
x19, x6, x10\n\t" + "sbcs x20, x7, x11\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %x[rx], [x29, #16]\n\t" + "str %x[ry], [x29, #24]\n\t" + "str %x[rz], [x29, #32]\n\t" + "str %x[rt], [x29, #40]\n\t" + "str %x[px], [x29, #48]\n\t" + "str %x[py], [x29, #56]\n\t" + "str %x[pz], [x29, #64]\n\t" + "str %x[pt], [x29, #72]\n\t" + "ldr x2, [x29, #56]\n\t" + "ldr x3, [x29, #48]\n\t" + /* Add */ + "ldp x12, x13, [x2]\n\t" + "ldp x14, x15, [x2, #16]\n\t" + "ldp x16, x17, [x3]\n\t" + "ldp x19, x20, [x3, #16]\n\t" + "adds x4, x12, x16\n\t" + "adcs x5, x13, x17\n\t" + "adcs x6, x14, x19\n\t" + "adc x7, x15, x20\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + /* Sub */ + "subs x8, x12, x16\n\t" + "sbcs x9, x13, x17\n\t" + "sbcs x10, x14, x19\n\t" + "sbcs x11, x15, x20\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, x28\n\t" + "adcs x10, x10, x28\n\t" + "adc x11, x11, x26\n\t" + "ldr x0, [x29, #32]\n\t" + "ldr x2, [x29, #184]\n\t" + /* Multiply */ + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x12, x4, x21\n\t" + "umulh x13, x4, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x22\n\t" + "umulh x14, x4, x22\n\t" + "adds x13, x13, x25\n\t" + "adc x14, x14, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x21\n\t" + "umulh x26, x5, x21\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x23\n\t" + "umulh x26, x4, x23\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x22\n\t" + "umulh x26, x5, x22\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x6, x21\n\t" + "umulh x26, x6, x21\n\t" + "adds x14, x14, x25\n\t" + "adcs x15, x15, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x24\n\t" + "umulh x26, x4, x24\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x23\n\t" + "umulh x26, x5, x23\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x22\n\t" + "umulh x26, 
x6, x22\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x21\n\t" + "umulh x26, x7, x21\n\t" + "adds x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x5, x24\n\t" + "umulh x26, x5, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x23\n\t" + "umulh x26, x6, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x22\n\t" + "umulh x26, x7, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x24\n\t" + "umulh x26, x6, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x23\n\t" + "umulh x26, x7, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x7, x24\n\t" + "umulh x26, x7, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x15, #63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x12, x12, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x13, x13, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x14, x14, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x15, x15, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x13, x13, x16\n\t" + "adcs x14, x14, x17\n\t" + "adcs x15, x15, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x15, #63\n\t" + "mul x27, x27, x25\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x15, asr 63\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adds x12, x12, x27\n\t" + "adcs x13, x13, xzr\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #176]\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x8, x21\n\t" + "umulh x5, x8, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x8, x22\n\t" + "umulh x6, x8, x22\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x9, x21\n\t" + "umulh x26, x9, x21\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x8, x23\n\t" + "umulh x26, x8, x23\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x9, x22\n\t" + "umulh x26, x9, x22\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x10, x21\n\t" + "umulh x26, x10, x21\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x16, x16, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x8, x24\n\t" + "umulh x26, x8, x24\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x9, x23\n\t" + "umulh x26, x9, x23\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, 
xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x10, x22\n\t" + "umulh x26, x10, x22\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x11, x21\n\t" + "umulh x26, x11, x21\n\t" + "adds x7, x7, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x9, x24\n\t" + "umulh x26, x9, x24\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x10, x23\n\t" + "umulh x26, x10, x23\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x11, x22\n\t" + "umulh x26, x11, x22\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x10, x24\n\t" + "umulh x26, x10, x24\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x11, x23\n\t" + "umulh x26, x11, x23\n\t" + "adds x17, x17, x25\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x11, x24\n\t" + "umulh x26, x11, x24\n\t" + "adds x19, x19, x25\n\t" + "adc x20, x20, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x20, x20, x19, #63\n\t" + "extr x19, x19, x17, #63\n\t" + "extr x17, x17, x16, #63\n\t" + "extr x16, x16, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x16\n\t" + "umulh x16, x25, x16\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x17\n\t" + "umulh x17, x25, x17\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x19\n\t" + "umulh x19, x25, x19\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x20\n\t" + "umulh x27, x25, x20\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x19\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #24]\n\t" + "ldr x1, [x29, #16]\n\t" + /* Add */ + "adds x8, x12, x4\n\t" + "adcs x9, x13, x5\n\t" + "adcs x10, x14, x6\n\t" + "adc x11, x15, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x11, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + /* Sub */ + "subs x16, x12, x4\n\t" + "sbcs x17, x13, x5\n\t" + "sbcs x19, x14, x6\n\t" + "sbcs x20, x15, x7\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldr x0, [x29, #48]\n\t" + "ldr x1, [x29, #64]\n\t" + "ldr x2, [x29, #160]\n\t" + /* Multiply */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "ldp x16, x17, 
[x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x4, x12, x16\n\t" + "umulh x5, x12, x16\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" + "umulh x6, x12, x17\n\t" + "adds x5, x5, x25\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x13, x16\n\t" + "umulh x26, x13, x16\n\t" + "adds x5, x5, x25\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x12, x19\n\t" + "umulh x26, x12, x19\n\t" + "adds x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" + "umulh x26, x13, x17\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x14, x16\n\t" + "umulh x26, x14, x16\n\t" + "adds x6, x6, x25\n\t" + "adcs x7, x7, x26\n\t" + "adc x8, x8, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x12, x20\n\t" + "umulh x26, x12, x20\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" + "umulh x26, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x14, x17\n\t" + "umulh x26, x14, x17\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" + "umulh x26, x15, x16\n\t" + "adds x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x13, x20\n\t" + "umulh x26, x13, x20\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "umulh x26, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x15, x17\n\t" + "umulh x26, x15, x17\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x14, x20\n\t" + "umulh x26, x14, x20\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "umulh x26, x15, x19\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "umulh x26, x15, x20\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x11, x11, x10, #63\n\t" + "extr x10, x10, x9, #63\n\t" + "extr x9, x9, x8, #63\n\t" + "extr x8, x8, x7, #63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x8\n\t" + "umulh x8, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "mul x26, x25, x9\n\t" + "umulh x9, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "mul x26, x25, x10\n\t" + "umulh x10, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "mul x26, x25, x11\n\t" + "umulh x27, x25, x11\n\t" + "adcs x7, x7, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x7, asr 63\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "adds x4, x4, x27\n\t" + "adcs x5, x5, xzr\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, xzr\n\t" + /* Store */ + "ldr x0, [x29, #48]\n\t" + /* Double */ + "adds x4, x4, x4\n\t" + "adcs x5, x5, 
x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, x7, x7\n\t" + "mov x25, #-19\n\t" + "asr x28, x7, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x4, x4, x25\n\t" + "sbcs x5, x5, x28\n\t" + "sbcs x6, x6, x28\n\t" + "sbc x7, x7, x26\n\t" + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #168]\n\t" + "ldr x2, [x29, #72]\n\t" + /* Multiply */ + "ldp x16, x17, [x1]\n\t" + "ldp x19, x20, [x1, #16]\n\t" + "ldp x21, x22, [x2]\n\t" + "ldp x23, x24, [x2, #16]\n\t" + /* A[0] * B[0] */ + "mul x8, x16, x21\n\t" + "umulh x9, x16, x21\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x22\n\t" + "umulh x10, x16, x22\n\t" + "adds x9, x9, x25\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x21\n\t" + "umulh x26, x17, x21\n\t" + "adds x9, x9, x25\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x23\n\t" + "umulh x26, x16, x23\n\t" + "adds x10, x10, x25\n\t" + "adc x11, x11, x26\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x22\n\t" + "umulh x26, x17, x22\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" + "adc x12, xzr, xzr\n\t" + /* A[2] * B[0] */ + "mul x25, x19, x21\n\t" + "umulh x26, x19, x21\n\t" + "adds x10, x10, x25\n\t" + "adcs x11, x11, x26\n\t" + "adc x12, x12, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x24\n\t" + "umulh x26, x16, x24\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, xzr, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x23\n\t" + "umulh x26, x17, x23\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x22\n\t" + "umulh x26, x19, x22\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x21\n\t" + "umulh x26, x20, x21\n\t" + "adds x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * B[3] */ + "mul x25, x17, x24\n\t" + "umulh x26, x17, x24\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, xzr, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x23\n\t" + "umulh x26, x19, x23\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, x14, xzr\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x22\n\t" + "umulh x26, x20, x22\n\t" + "adds x12, x12, x25\n\t" + "adcs x13, x13, x26\n\t" + "adc x14, x14, xzr\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x24\n\t" + "umulh x26, x19, x24\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, xzr, xzr\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x23\n\t" + "umulh x26, x20, x23\n\t" + "adds x13, x13, x25\n\t" + "adcs x14, x14, x26\n\t" + "adc x15, x15, xzr\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x24\n\t" + "umulh x26, x20, x24\n\t" + "adds x14, x14, x25\n\t" + "adc x15, x15, x26\n\t" + /* Reduce */ + /* Move top half into t4-t7 and remove top bit from t3 */ + "extr x15, x15, x14, #63\n\t" + "extr x14, x14, x13, #63\n\t" + "extr x13, x13, x12, #63\n\t" + "extr x12, x12, x11, #63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + /* Multiply top half by 19 */ + "mov x25, #19\n\t" + "mul x26, x25, x12\n\t" + "umulh x12, x25, x12\n\t" + "adds x8, x8, x26\n\t" + "mul x26, x25, x13\n\t" + "umulh x13, x25, x13\n\t" + "adcs x9, x9, x26\n\t" + "mul x26, x25, x14\n\t" + "umulh x14, x25, x14\n\t" + "adcs x10, x10, x26\n\t" + "mul x26, x25, x15\n\t" + "umulh x27, x25, x15\n\t" + "adcs x11, x11, x26\n\t" + "adc x27, x27, xzr\n\t" + /* Add remaining product results in */ + "adds x9, x9, x12\n\t" + 
"adcs x10, x10, x13\n\t" + "adcs x11, x11, x14\n\t" + "adc x27, x27, xzr\n\t" + /* Overflow */ + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce if top bit set */ + "and x27, x25, x11, asr 63\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adds x8, x8, x27\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Store */ + "ldr x0, [x29, #40]\n\t" + "ldr x1, [x29, #32]\n\t" + /* Add */ + "adds x12, x4, x8\n\t" + "adcs x13, x5, x9\n\t" + "adcs x14, x6, x10\n\t" + "adc x15, x7, x11\n\t" + "mov x25, #-19\n\t" + "asr x28, x15, #63\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Sub modulus (if overflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + /* Sub */ + "subs x16, x4, x8\n\t" + "sbcs x17, x5, x9\n\t" + "sbcs x19, x6, x10\n\t" + "sbcs x20, x7, x11\n\t" + "mov x25, #-19\n\t" + "csetm x28, cc\n\t" + /* Mask the modulus */ + "and x25, x28, x25\n\t" + "and x26, x28, #0x7fffffffffffffff\n\t" + /* Add modulus (if underflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, x28\n\t" + "adcs x19, x19, x28\n\t" + "adc x20, x20, x26\n\t" + "stp x12, x13, [x0]\n\t" + "stp x14, x15, [x0, #16]\n\t" + "stp x16, x17, [x1]\n\t" + "stp x19, x20, [x1, #16]\n\t" + "ldp x29, x30, [sp], #0x50\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +#endif /* WOLFSSL_ARMASM */ +#endif /* __aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-poly1305.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-poly1305.c new file mode 100644 index 0000000..3df07f7 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -0,0 +1,1166 @@ +/* armv8-poly1305.c + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* + * Based off the public domain implementations by Andrew Moon + * and Daniel J. 
Bernstein + */ + + +#ifdef __aarch64__ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#ifdef HAVE_POLY1305 +#include <wolfssl/wolfcrypt/poly1305.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/cpuid.h> +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif +#ifdef CHACHA_AEAD_TEST + #include <stdio.h> +#endif + +static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m, + size_t bytes) +{ + __asm__ __volatile__ ( + "CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t" + "BLO L_poly1305_16_64_done_%= \n\t" + /* Load r and h */ + "LDP x21, x23, %[ctx_r] \n\t" + "LDR w25, %[ctx_r_4] \n\t" + "LDP x2, x4, %[ctx_h] \n\t" + "LDR w6, %[ctx_h_4] \n\t" + "LSR x22, x21, #32 \n\t" + "LSR x24, x23, #32 \n\t" + "LSR x3, x2, #32 \n\t" + "LSR x5, x4, #32 \n\t" + "AND x21, x21, #0x3ffffff \n\t" + "AND x23, x23, #0x3ffffff \n\t" + "AND x2, x2, #0x3ffffff \n\t" + "AND x4, x4, #0x3ffffff \n\t" + /* s1 = r1 * 5; */ + /* s2 = r2 * 5; */ + /* s3 = r3 * 5; */ + /* s4 = r4 * 5; */ + "MOV x15, #5 \n\t" + "CMP %[finished], #0 \n\t" + "MUL w7, w22, w15 \n\t" + "CSET %[finished], EQ \n\t" + "MUL w8, w23, w15 \n\t" + "LSL %[finished], %[finished], #24 \n\t" + "MUL w9, w24, w15 \n\t" + "MOV x14, #0x3ffffff \n\t" + "MUL w10, w25, w15 \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_16_64_loop_%=: \n\t" + /* t0 = U8TO64(&m[0]); */ + /* t1 = U8TO64(&m[8]); */ + "LDP x16, x17, [%[m]], #16 \n\t" + /* h0 += (U8TO32(m + 0)) & 0x3ffffff; */ + "AND x26, x16, #0x3ffffff \n\t" + "ADD x2, x2, x26 \n\t" + /* h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff; */ + "AND x26, x14, x16, LSR #26 \n\t" + "ADD x3, x3, x26 \n\t" + /* h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff; */ + "EXTR x26, x17, x16, #52 \n\t" + "AND x26, x26, #0x3ffffff \n\t" + "ADD x4, x4, x26 \n\t" + /* h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff; */ + "AND x26, x14, x17, LSR #14 \n\t" + "ADD x5, x5, x26 \n\t" + /* h4 += (U8TO32(m + 12) >> 8) | hibit; */ + "ORR x17, %[finished], x17, LSR #40 \n\t" + "ADD x6, x6, x17 \n\t" + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */ + "MUL x16, x2, x21 \n\t" + "MUL x17, x2, x22 \n\t" + "MUL x26, x2, x23 \n\t" + "MUL x19, x2, x24 \n\t" + "MUL x20, x2, x25 \n\t" + "MADD x16, x3, x10, x16 \n\t" + "MADD x17, x3, x21, x17 \n\t" + "MADD x26, x3, x22, x26 \n\t" + "MADD x19, x3, x23, x19 \n\t" + "MADD x20, x3, x24, x20 \n\t" + "MADD x16, x4, x9, x16 \n\t" + "MADD x17, x4, x10, x17 \n\t" + "MADD x26, x4, x21, x26 \n\t" + "MADD x19, x4, x22, x19 \n\t" + "MADD x20, x4, x23, x20 \n\t" + "MADD x16, x5, x8, x16 \n\t" + "MADD x17, x5, x9, x17 \n\t" + "MADD x26, x5, x10, x26 \n\t" + "MADD x19, x5, x21, x19 \n\t" + "MADD x20, x5, x22, x20 \n\t" + "MADD x16, x6, x7, x16 \n\t" + "MADD x17, x6, x8, x17 \n\t" + "MADD x26, x6, x9, x26 \n\t" + "MADD x19, x6, x10, x19 \n\t" + "MADD x20, x6, x21, x20 \n\t" + /* d1 = d1 + d0 >> 26 */ + /* d2 = d2 + d1 >> 26 */ + /* d3 = d3 + d2 >> 26 */ + /* d4 = d4 + d3 >> 26 */ + /* h0 = d0 & 0x3ffffff */ + /* h1 = d1 & 0x3ffffff */ + /* h2 = d2 & 0x3ffffff */ + /* h0 = h0 + (d4 >> 26) * 5 */ + /* h1 = h1 + h0 >> 26 */ + /* h3 = d3 & 0x3ffffff */ + /* h4 = d4 & 
0x3ffffff */ + /* h0 = h0 & 0x3ffffff */ + "ADD x17, x17, x16, LSR #26 \n\t" + "ADD x20, x20, x19, LSR #26 \n\t" + "AND x16, x16, #0x3ffffff \n\t" + "LSR x2, x20, #26 \n\t" + "AND x19, x19, #0x3ffffff \n\t" + "MADD x16, x2, x15, x16 \n\t" + "ADD x26, x26, x17, LSR #26 \n\t" + "AND x17, x17, #0x3ffffff \n\t" + "AND x20, x20, #0x3ffffff \n\t" + "ADD x19, x19, x26, LSR #26 \n\t" + "AND x4, x26, #0x3ffffff \n\t" + "ADD x3, x17, x16, LSR #26 \n\t" + "AND x2, x16, #0x3ffffff \n\t" + "ADD x6, x20, x19, LSR #26 \n\t" + "AND x5, x19, #0x3ffffff \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE] \n\t" + "CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t" + "BHS L_poly1305_16_64_loop_%= \n\t" + /* Store h */ + "ORR x2, x2, x3, LSL #32 \n\t" + "ORR x4, x4, x5, LSL #32 \n\t" + "STP x2, x4, %[ctx_h] \n\t" + "STR w6, %[ctx_h_4] \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_16_64_done_%=: \n\t" + : [ctx_h] "+m" (ctx->h[0]), + [ctx_h_4] "+m" (ctx->h[4]), + [bytes] "+r" (bytes), + [m] "+r" (m) + : [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE), + [ctx_r] "m" (ctx->r[0]), + [ctx_r_4] "m" (ctx->r[4]), + [finished] "r" ((word64)ctx->finished) + : "memory", "cc", + "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w15", + "w21", "w22", "w23", "w24", "w25", "x2", "x3", "x4", "x5", "x6", + "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x19", "x20", + "x21", "x22", "x23", "x24", "x25", "x26" + ); +} + +void poly1305_blocks(Poly1305* ctx, const unsigned char *m, + size_t bytes) +{ + __asm__ __volatile__ ( + /* If less than 4 blocks to process then use regular method */ + "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "BLO L_poly1305_64_done_%= \n\t" + "MOV x9, #0x3ffffff \n\t" + /* Load h */ + "LDP x20, x22, [%[h]] \n\t" + "MOV v27.D[0], x9 \n\t" + "LDR w24, [%[h], #16] \n\t" + "MOV v27.D[1], x9 \n\t" + "LSR x21, x20, #32 \n\t" + "DUP v29.4S, v27.S[0] \n\t" + "LSR x23, x22, #32 \n\t" + "MOV x9, #5 \n\t" + "AND x20, x20, #0x3ffffff \n\t" + "MOV v28.D[0], x9 \n\t" + "AND x22, x22, #0x3ffffff \n\t" + /* Zero accumulator registers */ + "MOVI v15.2D, #0x0 \n\t" + "MOVI v16.2D, #0x0 \n\t" + "MOVI v17.2D, #0x0 \n\t" + "MOVI v18.2D, #0x0 \n\t" + "MOVI v19.2D, #0x0 \n\t" + /* Set hibit */ + "CMP %[finished], #0 \n\t" + "CSET x9, EQ \n\t" + "LSL x9, x9, #24 \n\t" + "MOV v26.D[0], x9 \n\t" + "MOV v26.D[1], x9 \n\t" + "DUP v30.4S, v26.S[0] \n\t" + "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t" + "BLO L_poly1305_64_start_block_size_64_%= \n\t" + /* Load r^2 to NEON v0, v1, v2, v3, v4 */ + "LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t" + "LD1 { v4.S }[2], [%[r_2]] \n\t" + "SUB %[r_2], %[r_2], #16 \n\t" + /* Load r^4 to NEON v0, v1, v2, v3, v4 */ + "LD4 { v0.S-v3.S }[0], [%[r_4]], #16 \n\t" + "LD1 { v4.S }[0], [%[r_4]] \n\t" + "SUB %[r_4], %[r_4], #16 \n\t" + "MOV v0.S[1], v0.S[0] \n\t" + "MOV v0.S[3], v0.S[2] \n\t" + "MOV v1.S[1], v1.S[0] \n\t" + "MOV v1.S[3], v1.S[2] \n\t" + "MOV v2.S[1], v2.S[0] \n\t" + "MOV v2.S[3], v2.S[2] \n\t" + "MOV v3.S[1], v3.S[0] \n\t" + "MOV v3.S[3], v3.S[2] \n\t" + "MOV v4.S[1], v4.S[0] \n\t" + "MOV v4.S[3], v4.S[2] \n\t" + /* Store [r^4, r^2] * 5 */ + "MUL v5.4S, v0.4S, v28.S[0] \n\t" + "MUL v6.4S, v1.4S, v28.S[0] \n\t" + "MUL v7.4S, v2.4S, v28.S[0] \n\t" + "MUL v8.4S, v3.4S, v28.S[0] \n\t" + "MUL v9.4S, v4.4S, v28.S[0] \n\t" + /* Copy r^4 to ARM */ + "MOV w25, v0.S[0] \n\t" + "MOV w26, v1.S[0] \n\t" + "MOV w27, v2.S[0] \n\t" + "MOV w28, v3.S[0] \n\t" + "MOV w30, v4.S[0] \n\t" + /* Copy 5*r^4 to ARM */ + "MOV w15, v5.S[0] \n\t" + "MOV w16, v6.S[0] \n\t" + "MOV w17, v7.S[0] \n\t" + "MOV w8, 
v8.S[0] \n\t" + "MOV w19, v9.S[0] \n\t" + /* Load m */ + /* Load four message blocks to NEON v10, v11, v12, v13, v14 */ + "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "USHR v14.4S, v13.4S, #8 \n\t" + "ORR v14.16B, v14.16B, v30.16B \n\t" + "SHL v13.4S, v13.4S, #18 \n\t" + "SRI v13.4S, v12.4S, #14 \n\t" + "SHL v12.4S, v12.4S, #12 \n\t" + "SRI v12.4S, v11.4S, #20 \n\t" + "SHL v11.4S, v11.4S, #6 \n\t" + "SRI v11.4S, v10.4S, #26 \n\t" + "AND v10.16B, v10.16B, v29.16B \n\t" + "AND v11.16B, v11.16B, v29.16B \n\t" + "AND v12.16B, v12.16B, v29.16B \n\t" + "AND v13.16B, v13.16B, v29.16B \n\t" + "AND v14.16B, v14.16B, v29.16B \n\t" + /* Four message blocks loaded */ + /* Add messages to accumulator */ + "ADD v15.2S, v15.2S, v10.2S \n\t" + "ADD v16.2S, v16.2S, v11.2S \n\t" + "ADD v17.2S, v17.2S, v12.2S \n\t" + "ADD v18.2S, v18.2S, v13.2S \n\t" + "ADD v19.2S, v19.2S, v14.2S \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_loop_128_%=: \n\t" + /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */ + /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */ + /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */ + /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */ + /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */ + "UMULL v21.2D, v15.2S, v0.2S \n\t" + /* Compute h*r^2 */ + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */ + "MUL x9, x20, x25 \n\t" + "UMULL v22.2D, v15.2S, v1.2S \n\t" + "MUL x10, x20, x26 \n\t" + "UMULL v23.2D, v15.2S, v2.2S \n\t" + "MUL x11, x20, x27 \n\t" + "UMULL v24.2D, v15.2S, v3.2S \n\t" + "MUL x12, x20, x28 \n\t" + "UMULL v25.2D, v15.2S, v4.2S \n\t" + "MUL x13, x20, x30 \n\t" + "UMLAL v21.2D, v16.2S, v9.2S \n\t" + "MADD x9, x21, x19, x9 \n\t" + "UMLAL v22.2D, v16.2S, v0.2S \n\t" + "MADD x10, x21, x25, x10 \n\t" + "UMLAL v23.2D, v16.2S, v1.2S \n\t" + "MADD x11, x21, x26, x11 \n\t" + "UMLAL v24.2D, v16.2S, v2.2S \n\t" + "MADD x12, x21, x27, x12 \n\t" + "UMLAL v25.2D, v16.2S, v3.2S \n\t" + "MADD x13, x21, x28, x13 \n\t" + "UMLAL v21.2D, v17.2S, v8.2S \n\t" + "MADD x9, x22, x8, x9 \n\t" + "UMLAL v22.2D, v17.2S, v9.2S \n\t" + "MADD x10, x22, x19, x10 \n\t" + "UMLAL v23.2D, v17.2S, v0.2S \n\t" + "MADD x11, x22, x25, x11 \n\t" + "UMLAL v24.2D, v17.2S, v1.2S \n\t" + "MADD x12, x22, x26, x12 \n\t" + "UMLAL v25.2D, v17.2S, v2.2S \n\t" + "MADD x13, x22, x27, x13 \n\t" + "UMLAL v21.2D, v18.2S, v7.2S \n\t" + "MADD x9, x23, x17, x9 \n\t" + "UMLAL v22.2D, v18.2S, v8.2S \n\t" + "MADD x10, x23, x8, x10 \n\t" + "UMLAL v23.2D, v18.2S, v9.2S \n\t" + "MADD x11, x23, x19, x11 \n\t" + "UMLAL v24.2D, v18.2S, v0.2S \n\t" + "MADD x12, x23, x25, x12 \n\t" + "UMLAL v25.2D, v18.2S, v1.2S \n\t" + "MADD x13, x23, x26, x13 \n\t" + "UMLAL v21.2D, v19.2S, v6.2S \n\t" + "MADD x9, x24, x16, x9 \n\t" + "UMLAL v22.2D, v19.2S, v7.2S \n\t" + "MADD x10, x24, x17, x10 \n\t" + "UMLAL v23.2D, v19.2S, v8.2S \n\t" + "MADD x11, x24, x8, x11 \n\t" + "UMLAL v24.2D, v19.2S, v9.2S \n\t" + "MADD x12, x24, x19, x12 \n\t" + "UMLAL v25.2D, v19.2S, v0.2S \n\t" + "MADD x13, x24, x25, x13 \n\t" + /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */ + /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */ + /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */ + /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */ + /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */ + "UMLAL2 
v21.2D, v10.4S, v0.4S \n\t" + /* Reduce h % P */ + "MOV x14, #5 \n\t" + "UMLAL2 v22.2D, v10.4S, v1.4S \n\t" + "ADD x10, x10, x9, LSR #26 \n\t" + "UMLAL2 v23.2D, v10.4S, v2.4S \n\t" + "ADD x13, x13, x12, LSR #26 \n\t" + "UMLAL2 v24.2D, v10.4S, v3.4S \n\t" + "AND x9, x9, #0x3ffffff \n\t" + "UMLAL2 v25.2D, v10.4S, v4.4S \n\t" + "LSR x20, x13, #26 \n\t" + "UMLAL2 v21.2D, v11.4S, v9.4S \n\t" + "AND x12, x12, #0x3ffffff \n\t" + "UMLAL2 v22.2D, v11.4S, v0.4S \n\t" + "MADD x9, x20, x14, x9 \n\t" + "UMLAL2 v23.2D, v11.4S, v1.4S \n\t" + "ADD x11, x11, x10, LSR #26 \n\t" + "UMLAL2 v24.2D, v11.4S, v2.4S \n\t" + "AND x10, x10, #0x3ffffff \n\t" + "UMLAL2 v25.2D, v11.4S, v3.4S \n\t" + "AND x13, x13, #0x3ffffff \n\t" + "UMLAL2 v21.2D, v12.4S, v8.4S \n\t" + "ADD x12, x12, x11, LSR #26 \n\t" + "UMLAL2 v22.2D, v12.4S, v9.4S \n\t" + "AND x22, x11, #0x3ffffff \n\t" + "UMLAL2 v23.2D, v12.4S, v0.4S \n\t" + "ADD x21, x10, x9, LSR #26 \n\t" + "UMLAL2 v24.2D, v12.4S, v1.4S \n\t" + "AND x20, x9, #0x3ffffff \n\t" + "UMLAL2 v25.2D, v12.4S, v2.4S \n\t" + "ADD x24, x13, x12, LSR #26 \n\t" + "UMLAL2 v21.2D, v13.4S, v7.4S \n\t" + "AND x23, x12, #0x3ffffff \n\t" + "UMLAL2 v22.2D, v13.4S, v8.4S \n\t" + "UMLAL2 v23.2D, v13.4S, v9.4S \n\t" + "UMLAL2 v24.2D, v13.4S, v0.4S \n\t" + "UMLAL2 v25.2D, v13.4S, v1.4S \n\t" + "UMLAL2 v21.2D, v14.4S, v6.4S \n\t" + "UMLAL2 v22.2D, v14.4S, v7.4S \n\t" + "UMLAL2 v23.2D, v14.4S, v8.4S \n\t" + "UMLAL2 v24.2D, v14.4S, v9.4S \n\t" + "UMLAL2 v25.2D, v14.4S, v0.4S \n\t" + /* If less than six message blocks left then leave loop */ + "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t" + "BLS L_poly1305_64_loop_128_final_%= \n\t" + /* Load m */ + /* Load four message blocks to NEON v10, v11, v12, v13, v14 */ + "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "USHR v14.4S, v13.4S, #8 \n\t" + "ORR v14.16B, v14.16B, v30.16B \n\t" + "SHL v13.4S, v13.4S, #18 \n\t" + "SRI v13.4S, v12.4S, #14 \n\t" + "SHL v12.4S, v12.4S, #12 \n\t" + "SRI v12.4S, v11.4S, #20 \n\t" + "SHL v11.4S, v11.4S, #6 \n\t" + "SRI v11.4S, v10.4S, #26 \n\t" + "AND v10.16B, v10.16B, v29.16B \n\t" + "AND v11.16B, v11.16B, v29.16B \n\t" + "AND v12.16B, v12.16B, v29.16B \n\t" + "AND v13.16B, v13.16B, v29.16B \n\t" + "AND v14.16B, v14.16B, v29.16B \n\t" + /* Four message blocks loaded */ + /* Add new message block to accumulator */ + "UADDW v21.2D, v21.2D, v10.2S \n\t" + "UADDW v22.2D, v22.2D, v11.2S \n\t" + "UADDW v23.2D, v23.2D, v12.2S \n\t" + "UADDW v24.2D, v24.2D, v13.2S \n\t" + "UADDW v25.2D, v25.2D, v14.2S \n\t" + /* Reduce radix 26 NEON */ + /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */ + /* with h3 -> h4 -> h0 -> h1 */ + "USRA v22.2D, v21.2D, #26 \n\t" + "AND v21.16B, v21.16B, v27.16B \n\t" + "USRA v25.2D, v24.2D, #26 \n\t" + "AND v24.16B, v24.16B, v27.16B \n\t" + "USHR v15.2D, v25.2D, #26 \n\t" + "USRA v23.2D, v22.2D, #26 \n\t" + /* Simulate multiplying by 5 using adding and shifting */ + "SHL v18.2D, v15.2D, #2 \n\t" + "AND v16.16B, v22.16B, v27.16B \n\t" + "ADD v18.2D, v18.2D, v15.2D \n\t" + "AND v19.16B, v25.16B, v27.16B \n\t" + "ADD v21.2D, v21.2D, v18.2D \n\t" + "USRA v24.2D, v23.2D, #26 \n\t" + "AND v17.16B, v23.16B, v27.16B \n\t" + "USRA v16.2D, v21.2D, #26 \n\t" + "AND v15.16B, v21.16B, v27.16B \n\t" + "USRA v19.2D, v24.2D, #26 \n\t" + "AND v18.16B, v24.16B, v27.16B \n\t" + /* Copy values to lower halves of result registers */ + "MOV v15.S[1], v15.S[2] \n\t" + "MOV v16.S[1], v16.S[2] \n\t" + "MOV v17.S[1], v17.S[2] \n\t" + "MOV v18.S[1], v18.S[2] \n\t" + "MOV v19.S[1], v19.S[2] 
\n\t" + "B L_poly1305_64_loop_128_%= \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_loop_128_final_%=: \n\t" + /* Load m */ + /* Load two message blocks to NEON v10, v11, v12, v13, v14 */ + "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" + /* Copy r^2 to lower half of registers */ + "MOV v0.D[0], v0.D[1] \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "MOV v5.D[0], v5.D[1] \n\t" + "USHR v14.2D, v11.2D, #40 \n\t" + "MOV v1.D[0], v1.D[1] \n\t" + "ORR v14.16B, v14.16B, v26.16B \n\t" + "MOV v6.D[0], v6.D[1] \n\t" + "USHR v13.2D, v11.2D, #14 \n\t" + "MOV v2.D[0], v2.D[1] \n\t" + "AND v13.16B, v13.16B, v27.16B \n\t" + "MOV v7.D[0], v7.D[1] \n\t" + "SHL v12.2D, v11.2D, #12 \n\t" + "MOV v3.D[0], v3.D[1] \n\t" + "SRI v12.2D, v10.2D, #52 \n\t" + "MOV v8.D[0], v8.D[1] \n\t" + "AND v12.16B, v12.16B, v27.16B \n\t" + "MOV v4.D[0], v4.D[1] \n\t" + "USHR v11.2D, v10.2D, #26 \n\t" + "MOV v9.D[0], v9.D[1] \n\t" + "AND v11.16B, v11.16B, v27.16B \n\t" + /* Copy r^2 to ARM */ + "MOV w25, v0.S[2] \n\t" + "AND v10.16B, v10.16B, v27.16B \n\t" + "MOV w26, v1.S[2] \n\t" + /* Two message blocks loaded */ + /* Add last messages */ + "ADD v21.2D, v21.2D, v10.2D \n\t" + "MOV w27, v2.S[2] \n\t" + "ADD v22.2D, v22.2D, v11.2D \n\t" + "MOV w28, v3.S[2] \n\t" + "ADD v23.2D, v23.2D, v12.2D \n\t" + "MOV w30, v4.S[2] \n\t" + "ADD v24.2D, v24.2D, v13.2D \n\t" + /* Copy 5*r^2 to ARM */ + "MOV w15, v5.S[2] \n\t" + "ADD v25.2D, v25.2D, v14.2D \n\t" + "MOV w16, v6.S[2] \n\t" + /* Reduce message to be ready for next multiplication */ + /* Reduce radix 26 NEON */ + /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */ + /* with h3 -> h4 -> h0 -> h1 */ + "USRA v22.2D, v21.2D, #26 \n\t" + "MOV w17, v7.S[2] \n\t" + "AND v21.16B, v21.16B, v27.16B \n\t" + "MOV w8, v8.S[2] \n\t" + "USRA v25.2D, v24.2D, #26 \n\t" + "MOV w19, v9.S[2] \n\t" + "AND v24.16B, v24.16B, v27.16B \n\t" + "USHR v15.2D, v25.2D, #26 \n\t" + "USRA v23.2D, v22.2D, #26 \n\t" + /* Simulate multiplying by 5 using adding and shifting */ + "SHL v18.2D, v15.2D, #2 \n\t" + "AND v16.16B, v22.16B, v27.16B \n\t" + "ADD v18.2D, v18.2D, v15.2D \n\t" + "AND v19.16B, v25.16B, v27.16B \n\t" + "ADD v21.2D, v21.2D, v18.2D \n\t" + "USRA v24.2D, v23.2D, #26 \n\t" + "AND v17.16B, v23.16B, v27.16B \n\t" + "USRA v16.2D, v21.2D, #26 \n\t" + "AND v15.16B, v21.16B, v27.16B \n\t" + "USRA v19.2D, v24.2D, #26 \n\t" + "AND v18.16B, v24.16B, v27.16B \n\t" + /* Copy values to lower halves of result registers */ + "MOV v15.S[1], v15.S[2] \n\t" + "MOV v16.S[1], v16.S[2] \n\t" + "MOV v17.S[1], v17.S[2] \n\t" + "MOV v18.S[1], v18.S[2] \n\t" + "MOV v19.S[1], v19.S[2] \n\t" + /* If less than 2 blocks left go straight to final multiplication. 
*/ + "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "BLO L_poly1305_64_last_mult_%= \n\t" + /* Else go to one loop of L_poly1305_64_loop_64 */ + "B L_poly1305_64_loop_64_%= \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_start_block_size_64_%=: \n\t" + /* Load r^2 to NEON v0, v1, v2, v3, v4 */ + "LD4R { v0.2S-v3.2S }, [%[r_2]], #16 \n\t" + "LD1R { v4.2S }, [%[r_2]] \n\t" + "SUB %[r_2], %[r_2], #16 \n\t" + /* Store r^2 * 5 */ + "MUL v5.4S, v0.4S, v28.S[0] \n\t" + "MUL v6.4S, v1.4S, v28.S[0] \n\t" + "MUL v7.4S, v2.4S, v28.S[0] \n\t" + "MUL v8.4S, v3.4S, v28.S[0] \n\t" + "MUL v9.4S, v4.4S, v28.S[0] \n\t" + /* Copy r^2 to ARM */ + "MOV w25, v0.S[0] \n\t" + "MOV w26, v1.S[0] \n\t" + "MOV w27, v2.S[0] \n\t" + "MOV w28, v3.S[0] \n\t" + "MOV w30, v4.S[0] \n\t" + /* Copy 5*r^2 to ARM */ + "MOV w15, v5.S[0] \n\t" + "MOV w16, v6.S[0] \n\t" + "MOV w17, v7.S[0] \n\t" + "MOV w8, v8.S[0] \n\t" + "MOV w19, v9.S[0] \n\t" + /* Load m */ + /* Load two message blocks to NEON v10, v11, v12, v13, v14 */ + "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "USHR v14.2D, v11.2D, #40 \n\t" + "ORR v14.16B, v14.16B, v26.16B \n\t" + "USHR v13.2D, v11.2D, #14 \n\t" + "AND v13.16B, v13.16B, v27.16B \n\t" + "SHL v12.2D, v11.2D, #12 \n\t" + "SRI v12.2D, v10.2D, #52 \n\t" + "AND v12.16B, v12.16B, v27.16B \n\t" + "USHR v11.2D, v10.2D, #26 \n\t" + "AND v11.16B, v11.16B, v27.16B \n\t" + "AND v10.16B, v10.16B, v27.16B \n\t" + "MOV v10.S[1], v10.S[2] \n\t" + "MOV v11.S[1], v11.S[2] \n\t" + "MOV v12.S[1], v12.S[2] \n\t" + "MOV v13.S[1], v13.S[2] \n\t" + "MOV v14.S[1], v14.S[2] \n\t" + /* Two message blocks loaded */ + /* Add messages to accumulator */ + "ADD v15.2S, v15.2S, v10.2S \n\t" + "ADD v16.2S, v16.2S, v11.2S \n\t" + "ADD v17.2S, v17.2S, v12.2S \n\t" + "ADD v18.2S, v18.2S, v13.2S \n\t" + "ADD v19.2S, v19.2S, v14.2S \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_loop_64_%=: \n\t" + /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */ + /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */ + /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */ + /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */ + /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */ + "UMULL v21.2D, v15.2S, v0.2S \n\t" + /* Compute h*r^2 */ + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */ + "MUL x9, x20, x25 \n\t" + "UMULL v22.2D, v15.2S, v1.2S \n\t" + "MUL x10, x20, x26 \n\t" + "UMULL v23.2D, v15.2S, v2.2S \n\t" + "MUL x11, x20, x27 \n\t" + "UMULL v24.2D, v15.2S, v3.2S \n\t" + "MUL x12, x20, x28 \n\t" + "UMULL v25.2D, v15.2S, v4.2S \n\t" + "MUL x13, x20, x30 \n\t" + "UMLAL v21.2D, v16.2S, v9.2S \n\t" + "MADD x9, x21, x19, x9 \n\t" + "UMLAL v22.2D, v16.2S, v0.2S \n\t" + "MADD x10, x21, x25, x10 \n\t" + "UMLAL v23.2D, v16.2S, v1.2S \n\t" + "MADD x11, x21, x26, x11 \n\t" + "UMLAL v24.2D, v16.2S, v2.2S \n\t" + "MADD x12, x21, x27, x12 \n\t" + "UMLAL v25.2D, v16.2S, v3.2S \n\t" + "MADD x13, x21, x28, x13 \n\t" + "UMLAL v21.2D, v17.2S, v8.2S \n\t" + "MADD x9, x22, x8, x9 \n\t" + "UMLAL v22.2D, v17.2S, v9.2S \n\t" + "MADD x10, x22, x19, x10 \n\t" + "UMLAL v23.2D, v17.2S, v0.2S \n\t" + "MADD x11, x22, x25, x11 \n\t" + "UMLAL v24.2D, v17.2S, v1.2S \n\t" + "MADD x12, x22, x26, x12 \n\t" + "UMLAL v25.2D, v17.2S, v2.2S \n\t" + "MADD x13, x22, x27, x13 \n\t" + "UMLAL v21.2D, v18.2S, 
v7.2S \n\t" + "MADD x9, x23, x17, x9 \n\t" + "UMLAL v22.2D, v18.2S, v8.2S \n\t" + "MADD x10, x23, x8, x10 \n\t" + "UMLAL v23.2D, v18.2S, v9.2S \n\t" + "MADD x11, x23, x19, x11 \n\t" + "UMLAL v24.2D, v18.2S, v0.2S \n\t" + "MADD x12, x23, x25, x12 \n\t" + "UMLAL v25.2D, v18.2S, v1.2S \n\t" + "MADD x13, x23, x26, x13 \n\t" + "UMLAL v21.2D, v19.2S, v6.2S \n\t" + "MADD x9, x24, x16, x9 \n\t" + "UMLAL v22.2D, v19.2S, v7.2S \n\t" + "MADD x10, x24, x17, x10 \n\t" + "UMLAL v23.2D, v19.2S, v8.2S \n\t" + "MADD x11, x24, x8, x11 \n\t" + "UMLAL v24.2D, v19.2S, v9.2S \n\t" + "MADD x12, x24, x19, x12 \n\t" + "UMLAL v25.2D, v19.2S, v0.2S \n\t" + "MADD x13, x24, x25, x13 \n\t" + /* Load m */ + /* Load two message blocks to NEON v10, v11, v12, v13, v14 */ + "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" + /* Reduce h % P */ + "MOV x14, #5 \n\t" + "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "ADD x10, x10, x9, LSR #26 \n\t" + "USHR v14.2D, v11.2D, #40 \n\t" + "ADD x13, x13, x12, LSR #26 \n\t" + "ORR v14.16B, v14.16B, v26.16B \n\t" + "AND x9, x9, #0x3ffffff \n\t" + "USHR v13.2D, v11.2D, #14 \n\t" + "LSR x20, x13, #26 \n\t" + "AND v13.16B, v13.16B, v27.16B \n\t" + "AND x12, x12, #0x3ffffff \n\t" + "SHL v12.2D, v11.2D, #12 \n\t" + "MADD x9, x20, x14, x9 \n\t" + "SRI v12.2D, v10.2D, #52 \n\t" + "ADD x11, x11, x10, LSR #26 \n\t" + "AND v12.16B, v12.16B, v27.16B \n\t" + "AND x10, x10, #0x3ffffff \n\t" + "USHR v11.2D, v10.2D, #26 \n\t" + "AND x13, x13, #0x3ffffff \n\t" + "AND v11.16B, v11.16B, v27.16B \n\t" + "ADD x12, x12, x11, LSR #26 \n\t" + "AND v10.16B, v10.16B, v27.16B \n\t" + "AND x22, x11, #0x3ffffff \n\t" + /* Two message blocks loaded */ + "ADD v21.2D, v21.2D, v10.2D \n\t" + "ADD x21, x10, x9, LSR #26 \n\t" + "ADD v22.2D, v22.2D, v11.2D \n\t" + "AND x20, x9, #0x3ffffff \n\t" + "ADD v23.2D, v23.2D, v12.2D \n\t" + "ADD x24, x13, x12, LSR #26 \n\t" + "ADD v24.2D, v24.2D, v13.2D \n\t" + "AND x23, x12, #0x3ffffff \n\t" + "ADD v25.2D, v25.2D, v14.2D \n\t" + /* Reduce radix 26 NEON */ + /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */ + /* with h3 -> h4 -> h0 -> h1 */ + "USRA v22.2D, v21.2D, #26 \n\t" + "AND v21.16B, v21.16B, v27.16B \n\t" + "USRA v25.2D, v24.2D, #26 \n\t" + "AND v24.16B, v24.16B, v27.16B \n\t" + "USHR v15.2D, v25.2D, #26 \n\t" + "USRA v23.2D, v22.2D, #26 \n\t" + /* Simulate multiplying by 5 using adding and shifting */ + "SHL v18.2D, v15.2D, #2 \n\t" + "AND v16.16B, v22.16B, v27.16B \n\t" + "ADD v18.2D, v18.2D, v15.2D \n\t" + "AND v19.16B, v25.16B, v27.16B \n\t" + "ADD v21.2D, v21.2D, v18.2D \n\t" + "USRA v24.2D, v23.2D, #26 \n\t" + "AND v17.16B, v23.16B, v27.16B \n\t" + "USRA v16.2D, v21.2D, #26 \n\t" + "AND v15.16B, v21.16B, v27.16B \n\t" + "USRA v19.2D, v24.2D, #26 \n\t" + "AND v18.16B, v24.16B, v27.16B \n\t" + /* Copy values to lower halves of result registers */ + "MOV v15.S[1], v15.S[2] \n\t" + "MOV v16.S[1], v16.S[2] \n\t" + "MOV v17.S[1], v17.S[2] \n\t" + "MOV v18.S[1], v18.S[2] \n\t" + "MOV v19.S[1], v19.S[2] \n\t" + /* If at least two message blocks left then loop_64 */ + "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "BHS L_poly1305_64_loop_64_%= \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_last_mult_%=: \n\t" + /* Load r */ + "LD4 { v0.S-v3.S }[1], [%[r]], #16 \n\t" + /* Compute h*r^2 */ + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + 
h4 * r0 */ + "MUL x9, x20, x25 \n\t" + "LD1 { v4.S }[1], [%[r]] \n\t" + "MUL x10, x20, x26 \n\t" + "SUB %[r], %[r], #16 \n\t" + "MUL x11, x20, x27 \n\t" + /* Store [r^2, r] * 5 */ + "MUL v5.2S, v0.2S, v28.S[0] \n\t" + "MUL x12, x20, x28 \n\t" + "MUL v6.2S, v1.2S, v28.S[0] \n\t" + "MUL x13, x20, x30 \n\t" + "MUL v7.2S, v2.2S, v28.S[0] \n\t" + "MADD x9, x21, x19, x9 \n\t" + "MUL v8.2S, v3.2S, v28.S[0] \n\t" + "MADD x10, x21, x25, x10 \n\t" + "MUL v9.2S, v4.2S, v28.S[0] \n\t" + "MADD x11, x21, x26, x11 \n\t" + /* Final multiply by [r^2, r] */ + /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */ + /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */ + /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */ + /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */ + /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */ + "UMULL v21.2D, v15.2S, v0.2S \n\t" + "MADD x12, x21, x27, x12 \n\t" + "UMULL v22.2D, v15.2S, v1.2S \n\t" + "MADD x13, x21, x28, x13 \n\t" + "UMULL v23.2D, v15.2S, v2.2S \n\t" + "MADD x9, x22, x8, x9 \n\t" + "UMULL v24.2D, v15.2S, v3.2S \n\t" + "MADD x10, x22, x19, x10 \n\t" + "UMULL v25.2D, v15.2S, v4.2S \n\t" + "MADD x11, x22, x25, x11 \n\t" + "UMLAL v21.2D, v16.2S, v9.2S \n\t" + "MADD x12, x22, x26, x12 \n\t" + "UMLAL v22.2D, v16.2S, v0.2S \n\t" + "MADD x13, x22, x27, x13 \n\t" + "UMLAL v23.2D, v16.2S, v1.2S \n\t" + "MADD x9, x23, x17, x9 \n\t" + "UMLAL v24.2D, v16.2S, v2.2S \n\t" + "MADD x10, x23, x8, x10 \n\t" + "UMLAL v25.2D, v16.2S, v3.2S \n\t" + "MADD x11, x23, x19, x11 \n\t" + "UMLAL v21.2D, v17.2S, v8.2S \n\t" + "MADD x12, x23, x25, x12 \n\t" + "UMLAL v22.2D, v17.2S, v9.2S \n\t" + "MADD x13, x23, x26, x13 \n\t" + "UMLAL v23.2D, v17.2S, v0.2S \n\t" + "MADD x9, x24, x16, x9 \n\t" + "UMLAL v24.2D, v17.2S, v1.2S \n\t" + "MADD x10, x24, x17, x10 \n\t" + "UMLAL v25.2D, v17.2S, v2.2S \n\t" + "MADD x11, x24, x8, x11 \n\t" + "UMLAL v21.2D, v18.2S, v7.2S \n\t" + "MADD x12, x24, x19, x12 \n\t" + "UMLAL v22.2D, v18.2S, v8.2S \n\t" + "MADD x13, x24, x25, x13 \n\t" + "UMLAL v23.2D, v18.2S, v9.2S \n\t" + /* Reduce h % P */ + "MOV x14, #5 \n\t" + "UMLAL v24.2D, v18.2S, v0.2S \n\t" + "ADD x10, x10, x9, LSR #26 \n\t" + "UMLAL v25.2D, v18.2S, v1.2S \n\t" + "ADD x13, x13, x12, LSR #26 \n\t" + "UMLAL v21.2D, v19.2S, v6.2S \n\t" + "AND x9, x9, #0x3ffffff \n\t" + "UMLAL v22.2D, v19.2S, v7.2S \n\t" + "LSR x20, x13, #26 \n\t" + "UMLAL v23.2D, v19.2S, v8.2S \n\t" + "AND x12, x12, #0x3ffffff \n\t" + "UMLAL v24.2D, v19.2S, v9.2S \n\t" + "MADD x9, x20, x14, x9 \n\t" + "UMLAL v25.2D, v19.2S, v0.2S \n\t" + "ADD x11, x11, x10, LSR #26 \n\t" + /* Add even and odd elements */ + "ADDP d21, v21.2D \n\t" + "AND x10, x10, #0x3ffffff \n\t" + "ADDP d22, v22.2D \n\t" + "AND x13, x13, #0x3ffffff \n\t" + "ADDP d23, v23.2D \n\t" + "ADD x12, x12, x11, LSR #26 \n\t" + "ADDP d24, v24.2D \n\t" + "AND x22, x11, #0x3ffffff \n\t" + "ADDP d25, v25.2D \n\t" + "ADD x21, x10, x9, LSR #26 \n\t" + "AND x20, x9, #0x3ffffff \n\t" + "ADD x24, x13, x12, LSR #26 \n\t" + "AND x23, x12, #0x3ffffff \n\t" + /* Load h to NEON */ + "MOV v5.D[0], x20 \n\t" + "MOV v6.D[0], x21 \n\t" + "MOV v7.D[0], x22 \n\t" + "MOV v8.D[0], x23 \n\t" + "MOV v9.D[0], x24 \n\t" + /* Add ctx->h to current accumulator */ + "ADD v21.2D, v21.2D, v5.2D \n\t" + "ADD v22.2D, v22.2D, v6.2D \n\t" + "ADD v23.2D, v23.2D, v7.2D \n\t" + "ADD v24.2D, v24.2D, v8.2D \n\t" + "ADD v25.2D, v25.2D, v9.2D \n\t" + /* Reduce h (h % P) */ + /* Reduce radix 26 NEON */ + /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */ + /* with h3 -> h4 -> h0 -> h1 */ + "USRA v22.2D, v21.2D, #26 \n\t" + "AND v21.16B, v21.16B, 
v27.16B \n\t" + "USRA v25.2D, v24.2D, #26 \n\t" + "AND v24.16B, v24.16B, v27.16B \n\t" + "USHR v5.2D, v25.2D, #26 \n\t" + "USRA v23.2D, v22.2D, #26 \n\t" + /* Simulate multiplying by 5 using adding and shifting */ + "SHL v8.2D, v5.2D, #2 \n\t" + "AND v6.16B, v22.16B, v27.16B \n\t" + "ADD v8.2D, v8.2D, v5.2D \n\t" + "AND v9.16B, v25.16B, v27.16B \n\t" + "ADD v21.2D, v21.2D, v8.2D \n\t" + "USRA v24.2D, v23.2D, #26 \n\t" + "AND v7.16B, v23.16B, v27.16B \n\t" + "USRA v6.2D, v21.2D, #26 \n\t" + "AND v5.16B, v21.16B, v27.16B \n\t" + "USRA v9.2D, v24.2D, #26 \n\t" + "AND v8.16B, v24.16B, v27.16B \n\t" + /* Copy values to lower halves of result registers */ + /* Store h */ + "ST4 { v5.S-v8.S }[0], [%[h]], #16 \n\t" + "ST1 { v9.S }[0], [%[h]] \n\t" + "SUB %[h], %[h], #16 \n\t" + "\n" + ".align 2 \n\t" + "L_poly1305_64_done_%=: \n\t" + : [bytes] "+r" (bytes), + [m] "+r" (m), + [ctx] "+m" (ctx) + : [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE), + [h] "r" (ctx->h), + [r] "r" (ctx->r), + [r_2] "r" (ctx->r_2), + [r_4] "r" (ctx->r_4), + [finished] "r" ((word64)ctx->finished) + : "memory", "cc", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", + "w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", + "w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", + "x28", "x30" + ); + poly1305_blocks_16(ctx, m, bytes); +} + +void poly1305_block(Poly1305* ctx, const unsigned char *m) +{ + poly1305_blocks_16(ctx, m, POLY1305_BLOCK_SIZE); +} + +#if defined(POLY130564) +static word64 clamp[] = { + 0x0ffffffc0fffffff, + 0x0ffffffc0ffffffc, +}; +#endif /* POLY130564 */ + + +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + if (key == NULL) + return BAD_FUNC_ARG; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + printf("\n"); +#endif + + if (keySz != 32 || ctx == NULL) + return BAD_FUNC_ARG; + + __asm__ __volatile__ ( + /* Load key material */ + "LDP x8, x9, [%[key]] \n\t" + "LDP x10, x11, [%[key], #16] \n\t" + /* Load clamp */ + "LDP x12, x13, [%[clamp]] \n\t" + /* Apply clamp */ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + "AND x8, x8, x12 \n\t" + "AND x9, x9, x13 \n\t" + "MOV x19, xzr \n\t" + "MOV x20, xzr \n\t" + "MOV x21, xzr \n\t" + "MOV x22, xzr \n\t" + "MOV x23, xzr \n\t" + "BFI x19, x8, #0, #26 \n\t" + "LSR x8, x8, #26 \n\t" + "BFI x20, x8, #0, #26 \n\t" + "LSR x8, x8, #26 \n\t" + "BFI x21, x8, #0, #12 \n\t" + "BFI x21, x9, #12, #14 \n\t" + "LSR x9, x9, #14 \n\t" + "BFI x22, x9, #0, #26 \n\t" + "LSR x9, x9, #26 \n\t" + "BFI x23, x9, #0, #24 \n\t" + /* Compute r^2 */ + /* r*5 */ + "MOV x8, #5 \n\t" + "MUL x24, x20, x8 \n\t" + "MUL x25, x21, x8 \n\t" + "MUL x26, x22, x8 \n\t" + "MUL x27, x23, x8 \n\t" + /* d = r*r */ + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */ + "MUL x14, x19, x19 \n\t" + "MUL x15, x19, x20 \n\t" + "MUL x16, x19, x21 \n\t" + "MUL x17, x19, x22 \n\t" + "MUL x7, x19, x23 
\n\t" + "MADD x14, x20, x27, x14 \n\t" + "MADD x15, x20, x19, x15 \n\t" + "MADD x16, x20, x20, x16 \n\t" + "MADD x17, x20, x21, x17 \n\t" + "MADD x7, x20, x22, x7 \n\t" + "MADD x14, x21, x26, x14 \n\t" + "MADD x15, x21, x27, x15 \n\t" + "MADD x16, x21, x19, x16 \n\t" + "MADD x17, x21, x20, x17 \n\t" + "MADD x7, x21, x21, x7 \n\t" + "MADD x14, x22, x25, x14 \n\t" + "MADD x15, x22, x26, x15 \n\t" + "MADD x16, x22, x27, x16 \n\t" + "MADD x17, x22, x19, x17 \n\t" + "MADD x7, x22, x20, x7 \n\t" + "MADD x14, x23, x24, x14 \n\t" + "MADD x15, x23, x25, x15 \n\t" + "MADD x16, x23, x26, x16 \n\t" + "MADD x17, x23, x27, x17 \n\t" + "MADD x7, x23, x19, x7 \n\t" + /* r_2 = r^2 % P */ + "ADD x15, x15, x14, LSR #26 \n\t" + "ADD x7, x7, x17, LSR #26 \n\t" + "AND x14, x14, #0x3ffffff \n\t" + "LSR x9, x7, #26 \n\t" + "AND x17, x17, #0x3ffffff \n\t" + "MADD x14, x9, x8, x14 \n\t" + "ADD x16, x16, x15, LSR #26 \n\t" + "AND x15, x15, #0x3ffffff \n\t" + "AND x7, x7, #0x3ffffff \n\t" + "ADD x17, x17, x16, LSR #26 \n\t" + "AND x16, x16, #0x3ffffff \n\t" + "ADD x15, x15, x14, LSR #26 \n\t" + "AND x14, x14, #0x3ffffff \n\t" + "ADD x7, x7, x17, LSR #26 \n\t" + "AND x17, x17, #0x3ffffff \n\t" + /* Store r */ + "ORR x19, x19, x20, LSL #32 \n\t" + "ORR x21, x21, x22, LSL #32 \n\t" + "STP x19, x21, [%[ctx_r]] \n\t" + "STR w23, [%[ctx_r], #16] \n\t" + "MOV x8, #5 \n\t" + "MUL x24, x15, x8 \n\t" + "MUL x25, x16, x8 \n\t" + "MUL x26, x17, x8 \n\t" + "MUL x27, x7, x8 \n\t" + /* Compute r^4 */ + /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */ + /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */ + /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */ + /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */ + /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */ + "MUL x19, x14, x14 \n\t" + "MUL x20, x14, x15 \n\t" + "MUL x21, x14, x16 \n\t" + "MUL x22, x14, x17 \n\t" + "MUL x23, x14, x7 \n\t" + "MADD x19, x15, x27, x19 \n\t" + "MADD x20, x15, x14, x20 \n\t" + "MADD x21, x15, x15, x21 \n\t" + "MADD x22, x15, x16, x22 \n\t" + "MADD x23, x15, x17, x23 \n\t" + "MADD x19, x16, x26, x19 \n\t" + "MADD x20, x16, x27, x20 \n\t" + "MADD x21, x16, x14, x21 \n\t" + "MADD x22, x16, x15, x22 \n\t" + "MADD x23, x16, x16, x23 \n\t" + "MADD x19, x17, x25, x19 \n\t" + "MADD x20, x17, x26, x20 \n\t" + "MADD x21, x17, x27, x21 \n\t" + "MADD x22, x17, x14, x22 \n\t" + "MADD x23, x17, x15, x23 \n\t" + "MADD x19, x7, x24, x19 \n\t" + "MADD x20, x7, x25, x20 \n\t" + "MADD x21, x7, x26, x21 \n\t" + "MADD x22, x7, x27, x22 \n\t" + "MADD x23, x7, x14, x23 \n\t" + /* r^4 % P */ + "ADD x20, x20, x19, LSR #26 \n\t" + "ADD x23, x23, x22, LSR #26 \n\t" + "AND x19, x19, #0x3ffffff \n\t" + "LSR x9, x23, #26 \n\t" + "AND x22, x22, #0x3ffffff \n\t" + "MADD x19, x9, x8, x19 \n\t" + "ADD x21, x21, x20, LSR #26 \n\t" + "AND x20, x20, #0x3ffffff \n\t" + "AND x23, x23, #0x3ffffff \n\t" + "ADD x22, x22, x21, LSR #26 \n\t" + "AND x21, x21, #0x3ffffff \n\t" + "ADD x20, x20, x19, LSR #26 \n\t" + "AND x19, x19, #0x3ffffff \n\t" + "ADD x23, x23, x22, LSR #26 \n\t" + "AND x22, x22, #0x3ffffff \n\t" + /* Store r^2 */ + "ORR x14, x14, x15, LSL #32 \n\t" + "ORR x16, x16, x17, LSL #32 \n\t" + "STP x14, x16, [%[ctx_r_2]] \n\t" + "STR w7, [%[ctx_r_2], #16] \n\t" + /* Store r^4 */ + "ORR x19, x19, x20, LSL #32 \n\t" + "ORR x21, x21, x22, LSL #32 \n\t" + "STP x19, x21, [%[ctx_r_4]] \n\t" + "STR w23, [%[ctx_r_4], #16] \n\t" + /* h (accumulator) = 0 */ + "STP xzr, xzr, [%[ctx_h_0]] \n\t" + "STR wzr, [%[ctx_h_0], #16] \n\t" + /* Save pad for later */ 
+ "STP x10, x11, [%[ctx_pad]] \n\t" + /* Zero leftover */ + "STR xzr, [%[ctx_leftover]] \n\t" + /* Zero finished */ + "STRB wzr, [%[ctx_finished]] \n\t" + : + : [clamp] "r" (clamp), + [key] "r" (key), + [ctx_r] "r" (ctx->r), + [ctx_r_2] "r" (ctx->r_2), + [ctx_r_4] "r" (ctx->r_4), + [ctx_h_0] "r" (ctx->h), + [ctx_pad] "r" (ctx->pad), + [ctx_leftover] "r" (&ctx->leftover), + [ctx_finished] "r" (&ctx->finished) + : "memory", "cc", + "w7", "w14", "w15", "w16", "w17", "w19", "w20", "w21", "w22", "w23", + "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); + + return 0; +} + + +int wc_Poly1305Final(Poly1305* ctx, byte* mac) +{ + + if (ctx == NULL) + return BAD_FUNC_ARG; + + /* process the remaining block */ + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; i++) + ctx->buffer[i] = 0; + ctx->finished = 1; + poly1305_block(ctx, ctx->buffer); + } + + __asm__ __volatile__ ( + /* Load raw h and zero h registers */ + "LDP x2, x3, %[h_addr] \n\t" + "MOV x5, xzr \n\t" + "LDR w4, %[h_4_addr] \n\t" + "MOV x6, xzr \n\t" + "LDP x16, x17, %[pad_addr] \n\t" + /* Base 26 -> Base 64 */ + "MOV w5, w2 \n\t" + "LSR x2, x2, #32 \n\t" + "ORR x5, x5, x2, LSL #26 \n\t" + "ORR x5, x5, x3, LSL #52 \n\t" + "LSR w6, w3, #12 \n\t" + "LSR x3, x3, #32 \n\t" + "ORR x6, x6, x3, LSL #14 \n\t" + "ORR x6, x6, x4, LSL #40 \n\t" + "LSR x7, x4, #24 \n\t" + /* Check if h is larger than p */ + "ADDS x2, x5, #5 \n\t" + "ADCS x3, x6, xzr \n\t" + "ADC x4, x7, xzr \n\t" + /* Check if h+5 is larger than 2^130 */ + "CMP x4, #3 \n\t" + "CSEL x5, x2, x5, HI \n\t" + "CSEL x6, x3, x6, HI \n\t" + "ADDS x5, x5, x16 \n\t" + "ADC x6, x6, x17 \n\t" + "STP x5, x6, [%[mac]] \n\t" + : [mac] "+r" (mac) + : [pad_addr] "m" (ctx->pad), + [h_addr] "m" (ctx->h), + [h_4_addr] "m" (ctx->h[4]) + : "memory", "cc", + "w2", "w3", "w4", "w5", "w6", "w7", "x2", "x3", "x4", "x5", + "x6", "x7", "x16", "x17" + ); + + /* zero out the state */ + ctx->h[0] = 0; + ctx->h[1] = 0; + ctx->h[2] = 0; + ctx->h[3] = 0; + ctx->h[4] = 0; + ctx->r[0] = 0; + ctx->r[1] = 0; + ctx->r[2] = 0; + ctx->r[3] = 0; + ctx->r[4] = 0; + ctx->r_2[0] = 0; + ctx->r_2[1] = 0; + ctx->r_2[2] = 0; + ctx->r_2[3] = 0; + ctx->r_2[4] = 0; + ctx->r_4[0] = 0; + ctx->r_4[1] = 0; + ctx->r_4[2] = 0; + ctx->r_4[3] = 0; + ctx->r_4[4] = 0; + ctx->pad[0] = 0; + ctx->pad[1] = 0; + ctx->pad[2] = 0; + ctx->pad[3] = 0; + + return 0; +} + +#endif /* HAVE_POLY1305 */ +#endif /* WOLFSSL_ARMASM */ +#endif /* __aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha256.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha256.c new file mode 100644 index 0000000..7f214d4 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha256.c @@ -0,0 +1,1508 @@ +/* armv8-sha256.c + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#if !defined(NO_SHA256) || defined(WOLFSSL_SHA224) + +#include <wolfssl/wolfcrypt/sha256.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/error-crypt.h> + +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + + +static const ALIGN32 word32 K[64] = { + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, + 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, + 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, + 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, + 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, + 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, + 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, + 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, + 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, + 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L +}; + + +static int InitSha256(wc_Sha256* sha256) +{ + int ret = 0; + + if (sha256 == NULL) { + return BAD_FUNC_ARG; + } + + sha256->digest[0] = 0x6A09E667L; + sha256->digest[1] = 0xBB67AE85L; + sha256->digest[2] = 0x3C6EF372L; + sha256->digest[3] = 0xA54FF53AL; + sha256->digest[4] = 0x510E527FL; + sha256->digest[5] = 0x9B05688CL; + sha256->digest[6] = 0x1F83D9ABL; + sha256->digest[7] = 0x5BE0CD19L; + + sha256->buffLen = 0; + sha256->loLen = 0; + sha256->hiLen = 0; + + return ret; +} + +static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) +{ + word32 tmp = sha256->loLen; + if ((sha256->loLen += len) < tmp) + sha256->hiLen++; /* carry low to high */ +} + + +#ifdef __aarch64__ + +/* ARMv8 hardware acceleration */ +static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + word32 add; + word32 numBlocks; + + /* only perform actions if a buffer is passed in */ + if (len > 0) { + /* fill leftover buffer with data */ + add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add); + sha256->buffLen += add; + data += add; + len -= add; + + /* number of blocks in a row to complete */ + numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE; + + if (numBlocks > 0) { + word32* k = (word32*)K; + + /* get leftover amount after blocks */ + add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE; + __asm__ volatile ( + "#load leftover data\n" + "LD1 {v0.2d-v3.2d}, %[buffer] \n" + + "#load current digest\n" + "LD1 {v12.2d-v13.2d}, %[digest] \n" + "MOV w8, %w[blocks] \n" + "REV32 v0.16b, v0.16b \n" + "REV32 v1.16b, v1.16b \n" + "REV32 v2.16b, v2.16b \n" + "REV32 v3.16b, v3.16b \n" + + "#load K values in \n" + "LD1 {v16.4s-v19.4s}, [%[k]], #64 \n" + "LD1 {v20.4s-v23.4s}, [%[k]], #64 \n" + "MOV v14.16b, v12.16b \n" /* store digest for add at the end */ + "MOV v15.16b, v13.16b \n" + "LD1 {v24.4s-v27.4s}, [%[k]], #64 \n" + "LD1 {v28.4s-v31.4s}, [%[k]], #64 \n" + + /* 
beginning of SHA256 block operation */ + "1:\n" + /* Round 1 */ + "MOV v4.16b, v0.16b \n" + "ADD v0.4s, v0.4s, v16.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 2 */ + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v1.4s, v17.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 3 */ + "SHA256SU0 v1.4s, v2.4s \n" + "ADD v0.4s, v2.4s, v18.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 4 */ + "SHA256SU0 v2.4s, v3.4s \n" + "ADD v0.4s, v3.4s, v19.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v2.4s, v4.4s, v1.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 5 */ + "SHA256SU0 v3.4s, v4.4s \n" + "ADD v0.4s, v4.4s, v20.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 6 */ + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v1.4s, v21.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 7 */ + "SHA256SU0 v1.4s, v2.4s \n" + "ADD v0.4s, v2.4s, v22.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 8 */ + "SHA256SU0 v2.4s, v3.4s \n" + "ADD v0.4s, v3.4s, v23.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v2.4s, v4.4s, v1.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 9 */ + "SHA256SU0 v3.4s, v4.4s \n" + "ADD v0.4s, v4.4s, v24.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 10 */ + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v1.4s, v25.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 11 */ + "SHA256SU0 v1.4s, v2.4s \n" + "ADD v0.4s, v2.4s, v26.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 12 */ + "SHA256SU0 v2.4s, v3.4s \n" + "ADD v0.4s, v3.4s, v27.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v2.4s, v4.4s, v1.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 13 */ + "SHA256SU0 v3.4s, v4.4s \n" + "ADD v0.4s, v4.4s, v28.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 14 */ + "ADD v0.4s, v1.4s, v29.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 15 */ + "ADD v0.4s, v2.4s, v30.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 16 */ + "ADD v0.4s, v3.4s, v31.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + "#Add working vars back into digest state \n" + "SUB w8, w8, #1 \n" + "ADD v12.4s, v12.4s, v14.4s \n" + "ADD v13.4s, v13.4s, v15.4s \n" + + "#check if more blocks should be done\n" + "CBZ w8, 2f \n" + + "#load in message and schedule updates \n" + "LD1 {v0.2d-v3.2d}, [%[dataIn]], #64 \n" + "MOV v14.16b, v12.16b \n" + "MOV v15.16b, v13.16b \n" + "REV32 v0.16b, v0.16b \n" + "REV32 v1.16b, v1.16b \n" + "REV32 
v2.16b, v2.16b \n" + "REV32 v3.16b, v3.16b \n" + "B 1b \n" /* do another block */ + + "2:\n" + "STP q12, q13, %[out] \n" + + : [out] "=m" (sha256->digest), "=m" (sha256->buffer), "=r" (numBlocks), + "=r" (data), "=r" (k) + : [k] "4" (k), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer), + [blocks] "2" (numBlocks), [dataIn] "3" (data) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "w8" + ); + + AddLength(sha256, WC_SHA256_BLOCK_SIZE * numBlocks); + + /* copy over any remaining data leftover */ + XMEMCPY(sha256->buffer, data, add); + sha256->buffLen = add; + } + } + + /* account for possibility of not used if len = 0 */ + (void)add; + (void)numBlocks; + + return 0; +} + + +static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) +{ + byte* local; + + local = (byte*)sha256->buffer; + AddLength(sha256, sha256->buffLen); /* before adding pads */ + + local[sha256->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha256->buffLen > WC_SHA256_PAD_SIZE) { + + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen; + __asm__ volatile ( + "LD1 {v4.2d-v7.2d}, %[buffer] \n" + "MOV v0.16b, v4.16b \n" + "MOV v1.16b, v5.16b \n" + "REV32 v0.16b, v0.16b \n" + "REV32 v1.16b, v1.16b \n" + "MOV v2.16b, v6.16b \n" + "MOV v3.16b, v7.16b \n" + "REV32 v2.16b, v2.16b \n" + "REV32 v3.16b, v3.16b \n" + "MOV v4.16b, v0.16b \n" + "MOV v5.16b, v1.16b \n" + "LD1 {v20.2d-v21.2d}, %[digest] \n" + + "#SHA256 operation on updated message \n" + "MOV v16.16b, v20.16b \n" + "MOV v17.16b, v21.16b \n" + + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v0.4s, v22.4s \n" + "MOV v6.16b, v2.16b \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q16, q17, v0.4s \n" + "SHA256H2 q17, q18, v0.4s \n" + + "SHA256SU0 v5.4s, v2.4s \n" + "ADD v1.4s, v1.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v7.16b, v3.16b \n" + "SHA256SU1 v5.4s, v3.4s, v4.4s \n" + "SHA256H q16, q17, v1.4s \n" + "SHA256H2 q17, q18, v1.4s \n" + + "SHA256SU0 v6.4s, v3.4s \n" + "ADD v2.4s, v2.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v8.16b, v4.16b \n" + "SHA256SU1 v6.4s, v4.4s, v5.4s \n" + "SHA256H q16, q17, v2.4s \n" + "SHA256H2 q17, q18, v2.4s \n" + + "SHA256SU0 v7.4s, v4.4s \n" + "ADD v3.4s, v3.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v9.16b, v5.16b \n" + "SHA256SU1 v7.4s, v5.4s, v6.4s \n" + "SHA256H q16, q17, v3.4s \n" + "SHA256H2 q17, q18, v3.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v8.4s, v5.4s \n" + "ADD v4.4s, v4.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v10.16b, v6.16b \n" + "SHA256SU1 v8.4s, v6.4s, v7.4s \n" + "SHA256H q16, q17, v4.4s \n" + "SHA256H2 q17, q18, v4.4s \n" + + "SHA256SU0 v9.4s, v6.4s \n" + "ADD v5.4s, v5.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v11.16b, v7.16b \n" + "SHA256SU1 v9.4s, v7.4s, v8.4s \n" + "SHA256H q16, q17, v5.4s \n" + "SHA256H2 q17, q18, v5.4s \n" + + "SHA256SU0 v10.4s, v7.4s \n" + "ADD v6.4s, v6.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v12.16b, v8.16b \n" + "SHA256SU1 v10.4s, v8.4s, v9.4s \n" + "SHA256H q16, q17, v6.4s \n" + "SHA256H2 q17, q18, v6.4s \n" + + "SHA256SU0 v11.4s, v8.4s \n" + "ADD v7.4s, v7.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v13.16b, v9.16b \n" + "SHA256SU1 v11.4s, 
v9.4s, v10.4s \n" + "SHA256H q16, q17, v7.4s \n" + "SHA256H2 q17, q18, v7.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v12.4s, v9.4s \n" + "ADD v8.4s, v8.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v14.16b, v10.16b \n" + "SHA256SU1 v12.4s, v10.4s, v11.4s \n" + "SHA256H q16, q17, v8.4s \n" + "SHA256H2 q17, q18, v8.4s \n" + + "SHA256SU0 v13.4s, v10.4s \n" + "ADD v9.4s, v9.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v15.16b, v11.16b \n" + "SHA256SU1 v13.4s, v11.4s, v12.4s \n" + "SHA256H q16, q17, v9.4s \n" + "SHA256H2 q17, q18, v9.4s \n" + + "SHA256SU0 v14.4s, v11.4s \n" + "ADD v10.4s, v10.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v14.4s, v12.4s, v13.4s \n" + "SHA256H q16, q17, v10.4s \n" + "SHA256H2 q17, q18, v10.4s \n" + + "SHA256SU0 v15.4s, v12.4s \n" + "ADD v11.4s, v11.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v15.4s, v13.4s, v14.4s \n" + "SHA256H q16, q17, v11.4s \n" + "SHA256H2 q17, q18, v11.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]] \n" + "ADD v12.4s, v12.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v12.4s \n" + "SHA256H2 q17, q18, v12.4s \n" + + "ADD v13.4s, v13.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v13.4s \n" + "SHA256H2 q17, q18, v13.4s \n" + + "ADD v14.4s, v14.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v14.4s \n" + "SHA256H2 q17, q18, v14.4s \n" + + "ADD v15.4s, v15.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v15.4s \n" + "SHA256H2 q17, q18, v15.4s \n" + + "#Add working vars back into digest state \n" + "ADD v16.4s, v16.4s, v20.4s \n" + "ADD v17.4s, v17.4s, v21.4s \n" + "STP q16, q17, %[out] \n" + + : [out] "=m" (sha256->digest) + : [k] "r" (K), [digest] "m" (sha256->digest), + [buffer] "m" (sha256->buffer) + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11" + , "v12", "v13", "v14", "v15", "v16", "v17", "v18" + , "v19", "v20", "v21", "v22", "v23", "v24", "v25" + ); + + sha256->buffLen = 0; + } + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen); + + /* put lengths in bits */ + sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + + (sha256->hiLen << 3); + sha256->loLen = sha256->loLen << 3; + + /* store lengths */ + #if defined(LITTLE_ENDIAN_ORDER) + __asm__ volatile ( + "LD1 {v0.2d-v3.2d}, %[in] \n" + "REV32 v0.16b, v0.16b \n" + "REV32 v1.16b, v1.16b \n" + "REV32 v2.16b, v2.16b \n" + "REV32 v3.16b, v3.16b \n" + "ST1 {v0.2d-v3.2d}, %[out] \n" + : [out] "=m" (sha256->buffer) + : [in] "m" (sha256->buffer) + : "cc", "memory", "v0", "v1", "v2", "v3" + ); + #endif + /* ! length ordering dependent on digest endian type ! 
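+        The two XMEMCPY calls below append the 64-bit message bit count:
+        hiLen lands at byte offset WC_SHA256_PAD_SIZE (56) and loLen at 60,
+        so the length occupies the last eight bytes of the block. On
+        little-endian targets the block was already byte-reversed by the asm
+        above, so the lengths are copied in host order and the final
+        transform below loads the buffer without a further REV32.
+        Illustrative sketch only: local[56..59] <- hiLen, local[60..63] <- loLen.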
*/ + XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); + XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, + sizeof(word32)); + + __asm__ volatile ( + "#load in message and schedule updates \n" + "LD1 {v4.2d-v7.2d}, %[buffer] \n" + "MOV v0.16b, v4.16b \n" + "MOV v1.16b, v5.16b \n" + "MOV v2.16b, v6.16b \n" + "MOV v3.16b, v7.16b \n" + "LD1 {v20.2d-v21.2d}, %[digest] \n" + + "MOV v16.16b, v20.16b \n" + "MOV v17.16b, v21.16b \n" + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v0.4s, v22.4s \n" + "MOV v6.16b, v2.16b \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q16, q17, v0.4s \n" + "SHA256H2 q17, q18, v0.4s \n" + + "SHA256SU0 v5.4s, v2.4s \n" + "ADD v1.4s, v1.4s, v23.4s \n" + "MOV v7.16b, v3.16b \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v5.4s, v3.4s, v4.4s \n" + "SHA256H q16, q17, v1.4s \n" + "SHA256H2 q17, q18, v1.4s \n" + + "SHA256SU0 v6.4s, v3.4s \n" + "ADD v2.4s, v2.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v8.16b, v4.16b \n" + "SHA256SU1 v6.4s, v4.4s, v5.4s \n" + "SHA256H q16, q17, v2.4s \n" + "SHA256H2 q17, q18, v2.4s \n" + + "SHA256SU0 v7.4s, v4.4s \n" + "ADD v3.4s, v3.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v9.16b, v5.16b \n" + "SHA256SU1 v7.4s, v5.4s, v6.4s \n" + "SHA256H q16, q17, v3.4s \n" + "SHA256H2 q17, q18, v3.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v8.4s, v5.4s \n" + "ADD v4.4s, v4.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v10.16b, v6.16b \n" + "SHA256SU1 v8.4s, v6.4s, v7.4s \n" + "SHA256H q16, q17, v4.4s \n" + "SHA256H2 q17, q18, v4.4s \n" + + "SHA256SU0 v9.4s, v6.4s \n" + "ADD v5.4s, v5.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v11.16b, v7.16b \n" + "SHA256SU1 v9.4s, v7.4s, v8.4s \n" + "SHA256H q16, q17, v5.4s \n" + "SHA256H2 q17, q18, v5.4s \n" + + "SHA256SU0 v10.4s, v7.4s \n" + "ADD v6.4s, v6.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v12.16b, v8.16b \n" + "SHA256SU1 v10.4s, v8.4s, v9.4s \n" + "SHA256H q16, q17, v6.4s \n" + "SHA256H2 q17, q18, v6.4s \n" + + "SHA256SU0 v11.4s, v8.4s \n" + "ADD v7.4s, v7.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v13.16b, v9.16b \n" + "SHA256SU1 v11.4s, v9.4s, v10.4s \n" + "SHA256H q16, q17, v7.4s \n" + "SHA256H2 q17, q18, v7.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n" + "SHA256SU0 v12.4s, v9.4s \n" + "ADD v8.4s, v8.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v14.16b, v10.16b \n" + "SHA256SU1 v12.4s, v10.4s, v11.4s \n" + "SHA256H q16, q17, v8.4s \n" + "SHA256H2 q17, q18, v8.4s \n" + + "SHA256SU0 v13.4s, v10.4s \n" + "ADD v9.4s, v9.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "MOV v15.16b, v11.16b \n" + "SHA256SU1 v13.4s, v11.4s, v12.4s \n" + "SHA256H q16, q17, v9.4s \n" + "SHA256H2 q17, q18, v9.4s \n" + + "SHA256SU0 v14.4s, v11.4s \n" + "ADD v10.4s, v10.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v14.4s, v12.4s, v13.4s \n" + "SHA256H q16, q17, v10.4s \n" + "SHA256H2 q17, q18, v10.4s \n" + + "SHA256SU0 v15.4s, v12.4s \n" + "ADD v11.4s, v11.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256SU1 v15.4s, v13.4s, v14.4s \n" + "SHA256H q16, q17, v11.4s \n" + "SHA256H2 q17, q18, v11.4s \n" + + "LD1 {v22.16b-v25.16b}, [%[k]] \n" + "ADD v12.4s, v12.4s, v22.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v12.4s \n" + "SHA256H2 q17, q18, v12.4s \n" + + "ADD v13.4s, v13.4s, v23.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v13.4s \n" + "SHA256H2 q17, q18, v13.4s \n" + + "ADD v14.4s, v14.4s, v24.4s \n" + "MOV v18.16b, v16.16b \n" 
+ "SHA256H q16, q17, v14.4s \n" + "SHA256H2 q17, q18, v14.4s \n" + + "ADD v15.4s, v15.4s, v25.4s \n" + "MOV v18.16b, v16.16b \n" + "SHA256H q16, q17, v15.4s \n" + "SHA256H2 q17, q18, v15.4s \n" + + "#Add working vars back into digest state \n" + "ADD v16.4s, v16.4s, v20.4s \n" + "ADD v17.4s, v17.4s, v21.4s \n" + + "#Store value as hash output \n" + #if defined(LITTLE_ENDIAN_ORDER) + "REV32 v16.16b, v16.16b \n" + #endif + "ST1 {v16.16b}, [%[hashOut]], #16 \n" + #if defined(LITTLE_ENDIAN_ORDER) + "REV32 v17.16b, v17.16b \n" + #endif + "ST1 {v17.16b}, [%[hashOut]] \n" + : [hashOut] "=r" (hash) + : [k] "r" (K), [digest] "m" (sha256->digest), + [buffer] "m" (sha256->buffer), + "0" (hash) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25" + ); + + return 0; +} + +#else /* not using 64 bit */ + +/* ARMv8 hardware acceleration Aarch32 */ +static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + word32 add; + word32 numBlocks; + + /* only perform actions if a buffer is passed in */ + if (len > 0) { + /* fill leftover buffer with data */ + add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add); + sha256->buffLen += add; + data += add; + len -= add; + + /* number of blocks in a row to complete */ + numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE; + + if (numBlocks > 0) { + word32* bufPt = sha256->buffer; + word32* digPt = sha256->digest; + /* get leftover amount after blocks */ + add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE; + __asm__ volatile ( + "#load leftover data\n" + "VLDM %[buffer]!, {q0-q3} \n" + + "#load current digest\n" + "VLDM %[digest], {q12-q13} \n" + "MOV r8, %[blocks] \n" + "VREV32.8 q0, q0 \n" + "VREV32.8 q1, q1 \n" + "VREV32.8 q2, q2 \n" + "VREV32.8 q3, q3 \n" + "VLDM %[k]! ,{q5-q8} \n" + "VLDM %[k]! ,{q9}\n" + + "VMOV.32 q14, q12 \n" /* store digest for add at the end */ + "VMOV.32 q15, q13 \n" + + /* beginning of SHA256 block operation */ + "1:\n" + + /* Round 1 */ + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 2 */ + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q6 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 3 */ + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q7 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 4 */ + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q8 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 5 */ + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q9 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 6 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 7 */ + "VLD1.32 {q10}, [%[k]]! 
\n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 8 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 9 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 10 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 11 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 12 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 13 */ + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 14 */ + "VLD1.32 {q10}, [%[k]]! \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 15 */ + "VLD1.32 {q10}, [%[k]]! \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 16 */ + "VLD1.32 {q10}, [%[k]] \n" + "SUB r8, r8, #1 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + "#Add working vars back into digest state \n" + "VADD.i32 q12, q12, q14 \n" + "VADD.i32 q13, q13, q15 \n" + + "#check if more blocks should be done\n" + "CMP r8, #0 \n" + "BEQ 2f \n" + + "#load in message and schedule updates \n" + "VLD1.32 {q0}, [%[dataIn]]! \n" + "VLD1.32 {q1}, [%[dataIn]]! \n" + "VLD1.32 {q2}, [%[dataIn]]! \n" + "VLD1.32 {q3}, [%[dataIn]]! 
\n" + + /* reset K pointer */ + "SUB %[k], %[k], #160 \n" + "VREV32.8 q0, q0 \n" + "VREV32.8 q1, q1 \n" + "VREV32.8 q2, q2 \n" + "VREV32.8 q3, q3 \n" + "VMOV.32 q14, q12 \n" + "VMOV.32 q15, q13 \n" + "B 1b \n" /* do another block */ + + "2:\n" + "VST1.32 {q12, q13}, [%[out]] \n" + + : [out] "=r" (digPt), "=r" (bufPt), "=r" (numBlocks), + "=r" (data) + : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt), + [blocks] "2" (numBlocks), [dataIn] "3" (data) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "r8" + ); + + AddLength(sha256, WC_SHA256_BLOCK_SIZE * numBlocks); + + /* copy over any remaining data leftover */ + XMEMCPY(sha256->buffer, data, add); + sha256->buffLen = add; + } + } + + /* account for possibility of not used if len = 0 */ + (void)add; + (void)numBlocks; + + return 0; +} + + +static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) +{ + byte* local; + + if (sha256 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + local = (byte*)sha256->buffer; + AddLength(sha256, sha256->buffLen); /* before adding pads */ + + local[sha256->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha256->buffLen > WC_SHA256_PAD_SIZE) { + word32* bufPt = sha256->buffer; + word32* digPt = sha256->digest; + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen; + __asm__ volatile ( + "#load leftover data\n" + "VLDM %[buffer]!, {q0-q3} \n" + + "#load current digest\n" + "VLDM %[digest], {q12-q13} \n" + "VREV32.8 q0, q0 \n" + "VREV32.8 q1, q1 \n" + "VREV32.8 q2, q2 \n" + "VREV32.8 q3, q3 \n" + + "#load K values in \n" + "VMOV.32 q14, q12 \n" /* store digest for add at the end */ + "VMOV.32 q15, q13 \n" + + /* beginning of SHA256 block operation */ + /* Round 1 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 2 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 3 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 4 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 5 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 6 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 7 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 8 */ + "VLD1.32 {q5}, [%[k]]! 
\n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 9 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 10 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 11 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 12 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 13 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 14 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 15 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 16 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + "#Add working vars back into digest state \n" + "VADD.i32 q12, q12, q14 \n" + "VADD.i32 q13, q13, q15 \n" + + /* reset K pointer */ + "SUB %[k], %[k], #256 \n" + "VST1.32 {q12, q13}, [%[out]] \n" + + : [out] "=r" (digPt), "=r" (bufPt) + : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15" + ); + + sha256->buffLen = 0; + } + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen); + + /* put lengths in bits */ + sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + + (sha256->hiLen << 3); + sha256->loLen = sha256->loLen << 3; + + /* store lengths */ + #if defined(LITTLE_ENDIAN_ORDER) + { + word32* bufPt = sha256->buffer; + __asm__ volatile ( + "VLD1.32 {q0}, [%[in]] \n" + "VREV32.8 q0, q0 \n" + "VST1.32 {q0}, [%[out]]!\n" + "VLD1.32 {q1}, [%[in]] \n" + "VREV32.8 q1, q1 \n" + "VST1.32 {q1}, [%[out]]!\n" + "VLD1.32 {q2}, [%[in]] \n" + "VREV32.8 q2, q2 \n" + "VST1.32 {q2}, [%[out]]!\n" + "VLD1.32 {q3}, [%[in]] \n" + "VREV32.8 q3, q3 \n" + "VST1.32 {q3}, [%[out]] \n" + : [out] "=r" (bufPt) + : [in] "0" (bufPt) + : "cc", "memory", "q0", "q1", "q2", "q3" + ); + } + #endif + /* ! length ordering dependent on digest endian type ! 
*/ + XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); + XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, + sizeof(word32)); + + bufPt = sha256->buffer; + word32* digPt = sha256->digest; + __asm__ volatile ( + "#load leftover data\n" + "VLDM %[buffer]!, {q0-q3} \n" + + "#load current digest\n" + "VLDM %[digest], {q12-q13} \n" + + "VMOV.32 q14, q12 \n" /* store digest for add at the end */ + "VMOV.32 q15, q13 \n" + + /* beginning of SHA256 block operation */ + /* Round 1 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 2 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 3 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 4 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 5 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 6 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 7 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 8 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 9 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 10 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 11 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 12 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 13 */ + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 14 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 15 */ + "VLD1.32 {q5}, [%[k]]! 
\n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + /* Round 16 */ + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256H.32 q12, q13, q0 \n" + "SHA256H2.32 q13, q11, q0 \n" + + "#Add working vars back into digest state \n" + "VADD.i32 q12, q12, q14 \n" + "VADD.i32 q13, q13, q15 \n" + + "#Store value as hash output \n" + #if defined(LITTLE_ENDIAN_ORDER) + "VREV32.8 q12, q12 \n" + #endif + "VST1.32 {q12}, [%[hashOut]]! \n" + #if defined(LITTLE_ENDIAN_ORDER) + "VREV32.8 q13, q13 \n" + #endif + "VST1.32 {q13}, [%[hashOut]] \n" + + : [out] "=r" (digPt), "=r" (bufPt), + [hashOut] "=r" (hash) + : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt), + "2" (hash) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15" + ); + + return 0; +} + +#endif /* __aarch64__ */ + + +#ifndef NO_SHA256 + +int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) +{ + if (sha256 == NULL) + return BAD_FUNC_ARG; + + sha256->heap = heap; + (void)devId; + + return InitSha256(sha256); +} + +int wc_InitSha256(wc_Sha256* sha256) +{ + return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID); +} + +void wc_Sha256Free(wc_Sha256* sha256) +{ + (void)sha256; +} + +int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + if (sha256 == NULL || (data == NULL && len != 0)) { + return BAD_FUNC_ARG; + } + + return Sha256Update(sha256, data, len); +} + +int wc_Sha256Final(wc_Sha256* sha256, byte* hash) +{ + int ret; + + if (sha256 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = Sha256Final(sha256, hash); + if (ret != 0) + return ret; + + return InitSha256(sha256); /* reset state */ +} + +int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash) +{ + int ret; + wc_Sha256 tmpSha256; + + if (sha256 == NULL || hash == NULL) + return BAD_FUNC_ARG; + + ret = wc_Sha256Copy(sha256, &tmpSha256); + if (ret == 0) { + ret = wc_Sha256Final(&tmpSha256, hash); + } + return ret; +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha256SetFlags(wc_Sha256* sha256, word32 flags) +{ + if (sha256) { + sha256->flags = flags; + } + return 0; +} +int wc_Sha256GetFlags(wc_Sha256* sha256, word32* flags) +{ + if (sha256 && flags) { + *flags = sha256->flags; + } + return 0; +} +#endif + +int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst) +{ + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha256)); + + return ret; +} + +#endif /* !NO_SHA256 */ + + +#ifdef WOLFSSL_SHA224 + static int InitSha224(wc_Sha224* sha224) + { + + int ret = 0; + + if (sha224 == NULL) { + return BAD_FUNC_ARG; + } + + sha224->digest[0] = 0xc1059ed8; + sha224->digest[1] = 0x367cd507; + sha224->digest[2] = 0x3070dd17; + sha224->digest[3] = 0xf70e5939; + sha224->digest[4] = 0xffc00b31; + sha224->digest[5] = 0x68581511; + sha224->digest[6] = 0x64f98fa7; + sha224->digest[7] = 0xbefa4fa4; + + sha224->buffLen = 0; + sha224->loLen = 0; + sha224->hiLen = 0; + + return ret; + } + + int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId) + { + if (sha224 == NULL) + return BAD_FUNC_ARG; + + sha224->heap = heap; + (void)devId; + + return InitSha224(sha224); + } + + int wc_InitSha224(wc_Sha224* sha224) + { + return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID); + } + + int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len) + { + int ret; + + if (sha224 == NULL || (data == NULL && len > 0)) { + return 
BAD_FUNC_ARG; + } + + ret = Sha256Update((wc_Sha256 *)sha224, data, len); + + return ret; + } + + int wc_Sha224Final(wc_Sha224* sha224, byte* hash) + { + int ret; + word32 hashTmp[WC_SHA256_DIGEST_SIZE/sizeof(word32)]; + + if (sha224 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = Sha256Final((wc_Sha256*)sha224, (byte*)hashTmp); + if (ret != 0) + return ret; + + XMEMCPY(hash, hashTmp, WC_SHA224_DIGEST_SIZE); + + return InitSha224(sha224); /* reset state */ + } + + void wc_Sha224Free(wc_Sha224* sha224) + { + if (sha224 == NULL) + return; + } + + int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash) + { + int ret; + wc_Sha224 tmpSha224; + + if (sha224 == NULL || hash == NULL) + return BAD_FUNC_ARG; + + ret = wc_Sha224Copy(sha224, &tmpSha224); + if (ret == 0) { + ret = wc_Sha224Final(&tmpSha224, hash); + } + return ret; + } + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + int wc_Sha224SetFlags(wc_Sha224* sha224, word32 flags) + { + if (sha224) { + sha224->flags = flags; + } + return 0; + } + int wc_Sha224GetFlags(wc_Sha224* sha224, word32* flags) + { + if (sha224 && flags) { + *flags = sha224->flags; + } + return 0; + } +#endif + + int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst) + { + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha224)); + + return ret; + } + +#endif /* WOLFSSL_SHA224 */ + +#endif /* !NO_SHA256 || WOLFSSL_SHA224 */ +#endif /* WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S new file mode 100644 index 0000000..a35bccb --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -0,0 +1,1046 @@ +/* armv8-sha512-asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S + */ +#ifdef __aarch64__ + .text + .section .rodata + .type L_SHA512_transform_neon_len_k, %object + .size L_SHA512_transform_neon_len_k, 640 + .align 3 +L_SHA512_transform_neon_len_k: + .xword 0x428a2f98d728ae22 + .xword 0x7137449123ef65cd + .xword 0xb5c0fbcfec4d3b2f + .xword 0xe9b5dba58189dbbc + .xword 0x3956c25bf348b538 + .xword 0x59f111f1b605d019 + .xword 0x923f82a4af194f9b + .xword 0xab1c5ed5da6d8118 + .xword 0xd807aa98a3030242 + .xword 0x12835b0145706fbe + .xword 0x243185be4ee4b28c + .xword 0x550c7dc3d5ffb4e2 + .xword 0x72be5d74f27b896f + .xword 0x80deb1fe3b1696b1 + .xword 0x9bdc06a725c71235 + .xword 0xc19bf174cf692694 + .xword 0xe49b69c19ef14ad2 + .xword 0xefbe4786384f25e3 + .xword 0xfc19dc68b8cd5b5 + .xword 0x240ca1cc77ac9c65 + .xword 0x2de92c6f592b0275 + .xword 0x4a7484aa6ea6e483 + .xword 0x5cb0a9dcbd41fbd4 + .xword 0x76f988da831153b5 + .xword 0x983e5152ee66dfab + .xword 0xa831c66d2db43210 + .xword 0xb00327c898fb213f + .xword 0xbf597fc7beef0ee4 + .xword 0xc6e00bf33da88fc2 + .xword 0xd5a79147930aa725 + .xword 0x6ca6351e003826f + .xword 0x142929670a0e6e70 + .xword 0x27b70a8546d22ffc + .xword 0x2e1b21385c26c926 + .xword 0x4d2c6dfc5ac42aed + .xword 0x53380d139d95b3df + .xword 0x650a73548baf63de + .xword 0x766a0abb3c77b2a8 + .xword 0x81c2c92e47edaee6 + .xword 0x92722c851482353b + .xword 0xa2bfe8a14cf10364 + .xword 0xa81a664bbc423001 + .xword 0xc24b8b70d0f89791 + .xword 0xc76c51a30654be30 + .xword 0xd192e819d6ef5218 + .xword 0xd69906245565a910 + .xword 0xf40e35855771202a + .xword 0x106aa07032bbd1b8 + .xword 0x19a4c116b8d2d0c8 + .xword 0x1e376c085141ab53 + .xword 0x2748774cdf8eeb99 + .xword 0x34b0bcb5e19b48a8 + .xword 0x391c0cb3c5c95a63 + .xword 0x4ed8aa4ae3418acb + .xword 0x5b9cca4f7763e373 + .xword 0x682e6ff3d6b2b8a3 + .xword 0x748f82ee5defb2fc + .xword 0x78a5636f43172f60 + .xword 0x84c87814a1f0ab72 + .xword 0x8cc702081a6439ec + .xword 0x90befffa23631e28 + .xword 0xa4506cebde82bde9 + .xword 0xbef9a3f7b2c67915 + .xword 0xc67178f2e372532b + .xword 0xca273eceea26619c + .xword 0xd186b8c721c0c207 + .xword 0xeada7dd6cde0eb1e + .xword 0xf57d4f7fee6ed178 + .xword 0x6f067aa72176fba + .xword 0xa637dc5a2c898a6 + .xword 0x113f9804bef90dae + .xword 0x1b710b35131c471b + .xword 0x28db77f523047d84 + .xword 0x32caab7b40c72493 + .xword 0x3c9ebe0a15c9bebc + .xword 0x431d67c49c100d4c + .xword 0x4cc5d4becb3e42b6 + .xword 0x597f299cfc657e2a + .xword 0x5fcb6fab3ad6faec + .xword 0x6c44198c4a475817 + .text + .section .rodata + .type L_SHA512_transform_neon_len_ror8, %object + .size L_SHA512_transform_neon_len_ror8, 16 + .align 4 +L_SHA512_transform_neon_len_ror8: + .xword 0x7060504030201, 0x80f0e0d0c0b0a09 + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + stp x29, x30, [sp, #-128]! 
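+        # Save x17, x19-x27 and d8-d11 to the frame before they are used as working registers below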
+ add x29, sp, #0 + str x17, [x29, #16] + str x19, [x29, #24] + stp x20, x21, [x29, #32] + stp x22, x23, [x29, #48] + stp x24, x25, [x29, #64] + stp x26, x27, [x29, #80] + stp d8, d9, [x29, #96] + stp d10, d11, [x29, #112] + adr x3, L_SHA512_transform_neon_len_k + adr x27, L_SHA512_transform_neon_len_ror8 + ld1 {v11.16b}, [x27] + # Load digest into working vars + ldp x4, x5, [x0] + ldp x6, x7, [x0, #16] + ldp x8, x9, [x0, #32] + ldp x10, x11, [x0, #48] + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load W + # Copy digest to add in at end + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40 + mov x19, x4 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40 + mov x20, x5 + rev64 v0.16b, v0.16b + mov x21, x6 + rev64 v1.16b, v1.16b + mov x22, x7 + rev64 v2.16b, v2.16b + mov x23, x8 + rev64 v3.16b, v3.16b + mov x24, x9 + rev64 v4.16b, v4.16b + mov x25, x10 + rev64 v5.16b, v5.16b + mov x26, x11 + rev64 v6.16b, v6.16b + rev64 v7.16b, v7.16b + # Pre-calc: b ^ c + eor x16, x5, x6 + mov x27, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + mov x13, v0.d[0] + ldr x15, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x14, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x13 + eor x12, x12, x10 + add x11, x11, x15 + eor x16, x16, x5 + add x11, x11, x12 + add x14, x14, x16 + add x7, x7, x11 + add x11, x11, x14 + # Round 1 + mov x13, v0.d[1] + ldr x15, [x3], #8 + ext v10.16b, v0.16b, v1.16b, #8 + ror x12, x7, #14 + shl v8.2d, v7.2d, #45 + ror x14, x11, #28 + sri v8.2d, v7.2d, #19 + eor x12, x12, x7, ror 18 + shl v9.2d, v7.2d, #3 + eor x14, x14, x11, ror 34 + sri v9.2d, v7.2d, #61 + eor x12, x12, x7, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x11, ror 39 + ushr v8.2d, v7.2d, #6 + add x10, x10, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x11, x4 + add v0.2d, v0.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v4.16b, v5.16b, #8 + and x17, x16, x17 + add v0.2d, v0.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b + add x10, x10, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 + add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v0.2d, v0.2d, v9.2d + add x6, x6, x10 + add x10, x10, x14 + # Round 2 + mov x13, v1.d[0] + ldr x15, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x14, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x13 + eor x12, x12, x8 + add x9, x9, x15 + eor x16, x16, x11 + add x9, x9, x12 + add x14, x14, x16 + add x5, x5, x9 + add x9, x9, x14 + # Round 3 + mov x13, v1.d[1] + ldr x15, [x3], #8 + ext v10.16b, v1.16b, v2.16b, #8 + ror x12, x5, #14 + shl v8.2d, v0.2d, #45 + ror x14, x9, #28 + sri v8.2d, v0.2d, #19 + eor x12, x12, x5, ror 18 + shl v9.2d, v0.2d, #3 + eor x14, x14, x9, ror 34 + sri v9.2d, v0.2d, #61 + eor x12, x12, x5, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x9, ror 39 + ushr v8.2d, v0.2d, #6 + add x8, x8, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x9, x10 + add v1.2d, v1.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v5.16b, v6.16b, #8 + and x17, x16, x17 + add v1.2d, v1.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x13 + sri v8.2d, v10.2d, #1 + eor 
x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b + add x8, x8, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 + add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v1.2d, v1.2d, v9.2d + add x4, x4, x8 + add x8, x8, x14 + # Round 4 + mov x13, v2.d[0] + ldr x15, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x14, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, x13 + eor x12, x12, x6 + add x7, x7, x15 + eor x16, x16, x9 + add x7, x7, x12 + add x14, x14, x16 + add x11, x11, x7 + add x7, x7, x14 + # Round 5 + mov x13, v2.d[1] + ldr x15, [x3], #8 + ext v10.16b, v2.16b, v3.16b, #8 + ror x12, x11, #14 + shl v8.2d, v1.2d, #45 + ror x14, x7, #28 + sri v8.2d, v1.2d, #19 + eor x12, x12, x11, ror 18 + shl v9.2d, v1.2d, #3 + eor x14, x14, x7, ror 34 + sri v9.2d, v1.2d, #61 + eor x12, x12, x11, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x7, ror 39 + ushr v8.2d, v1.2d, #6 + add x6, x6, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x7, x8 + add v2.2d, v2.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v6.16b, v7.16b, #8 + and x17, x16, x17 + add v2.2d, v2.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b + add x6, x6, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 + add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v2.2d, v2.2d, v9.2d + add x10, x10, x6 + add x6, x6, x14 + # Round 6 + mov x13, v3.d[0] + ldr x15, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x14, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x13 + eor x12, x12, x4 + add x5, x5, x15 + eor x16, x16, x7 + add x5, x5, x12 + add x14, x14, x16 + add x9, x9, x5 + add x5, x5, x14 + # Round 7 + mov x13, v3.d[1] + ldr x15, [x3], #8 + ext v10.16b, v3.16b, v4.16b, #8 + ror x12, x9, #14 + shl v8.2d, v2.2d, #45 + ror x14, x5, #28 + sri v8.2d, v2.2d, #19 + eor x12, x12, x9, ror 18 + shl v9.2d, v2.2d, #3 + eor x14, x14, x5, ror 34 + sri v9.2d, v2.2d, #61 + eor x12, x12, x9, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x5, ror 39 + ushr v8.2d, v2.2d, #6 + add x4, x4, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x5, x6 + add v3.2d, v3.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v7.16b, v0.16b, #8 + and x17, x16, x17 + add v3.2d, v3.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b + add x4, x4, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 + add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v3.2d, v3.2d, v9.2d + add x8, x8, x4 + add x4, x4, x14 + # Round 8 + mov x13, v4.d[0] + ldr x15, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x14, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x13 + eor x12, x12, x10 + add x11, x11, x15 + eor x16, x16, x5 + add x11, x11, x12 + add x14, x14, x16 + add x7, x7, x11 + add x11, x11, x14 + # Round 9 + mov x13, v4.d[1] + ldr x15, [x3], #8 + ext v10.16b, v4.16b, v5.16b, #8 + ror 
x12, x7, #14 + shl v8.2d, v3.2d, #45 + ror x14, x11, #28 + sri v8.2d, v3.2d, #19 + eor x12, x12, x7, ror 18 + shl v9.2d, v3.2d, #3 + eor x14, x14, x11, ror 34 + sri v9.2d, v3.2d, #61 + eor x12, x12, x7, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x11, ror 39 + ushr v8.2d, v3.2d, #6 + add x10, x10, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x11, x4 + add v4.2d, v4.2d, v9.2d + eor x12, x8, x9 + ext v9.16b, v0.16b, v1.16b, #8 + and x17, x16, x17 + add v4.2d, v4.2d, v9.2d + and x12, x12, x7 + shl v8.2d, v10.2d, #63 + add x10, x10, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x9 + tbl v9.16b, {v10.16b}, v11.16b + add x10, x10, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x4 + ushr v10.2d, v10.2d, #7 + add x10, x10, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v4.2d, v4.2d, v9.2d + add x6, x6, x10 + add x10, x10, x14 + # Round 10 + mov x13, v5.d[0] + ldr x15, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x14, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x13 + eor x12, x12, x8 + add x9, x9, x15 + eor x16, x16, x11 + add x9, x9, x12 + add x14, x14, x16 + add x5, x5, x9 + add x9, x9, x14 + # Round 11 + mov x13, v5.d[1] + ldr x15, [x3], #8 + ext v10.16b, v5.16b, v6.16b, #8 + ror x12, x5, #14 + shl v8.2d, v4.2d, #45 + ror x14, x9, #28 + sri v8.2d, v4.2d, #19 + eor x12, x12, x5, ror 18 + shl v9.2d, v4.2d, #3 + eor x14, x14, x9, ror 34 + sri v9.2d, v4.2d, #61 + eor x12, x12, x5, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x9, ror 39 + ushr v8.2d, v4.2d, #6 + add x8, x8, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x9, x10 + add v5.2d, v5.2d, v9.2d + eor x12, x6, x7 + ext v9.16b, v1.16b, v2.16b, #8 + and x17, x16, x17 + add v5.2d, v5.2d, v9.2d + and x12, x12, x5 + shl v8.2d, v10.2d, #63 + add x8, x8, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x7 + tbl v9.16b, {v10.16b}, v11.16b + add x8, x8, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x10 + ushr v10.2d, v10.2d, #7 + add x8, x8, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v5.2d, v5.2d, v9.2d + add x4, x4, x8 + add x8, x8, x14 + # Round 12 + mov x13, v6.d[0] + ldr x15, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x14, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, x13 + eor x12, x12, x6 + add x7, x7, x15 + eor x16, x16, x9 + add x7, x7, x12 + add x14, x14, x16 + add x11, x11, x7 + add x7, x7, x14 + # Round 13 + mov x13, v6.d[1] + ldr x15, [x3], #8 + ext v10.16b, v6.16b, v7.16b, #8 + ror x12, x11, #14 + shl v8.2d, v5.2d, #45 + ror x14, x7, #28 + sri v8.2d, v5.2d, #19 + eor x12, x12, x11, ror 18 + shl v9.2d, v5.2d, #3 + eor x14, x14, x7, ror 34 + sri v9.2d, v5.2d, #61 + eor x12, x12, x11, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x7, ror 39 + ushr v8.2d, v5.2d, #6 + add x6, x6, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x7, x8 + add v6.2d, v6.2d, v9.2d + eor x12, x4, x5 + ext v9.16b, v2.16b, v3.16b, #8 + and x17, x16, x17 + add v6.2d, v6.2d, v9.2d + and x12, x12, x11 + shl v8.2d, v10.2d, #63 + add x6, x6, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x5 + tbl v9.16b, {v10.16b}, v11.16b + add x6, x6, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x8 + ushr v10.2d, v10.2d, #7 + add x6, x6, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v6.2d, v6.2d, 
v9.2d + add x10, x10, x6 + add x6, x6, x14 + # Round 14 + mov x13, v7.d[0] + ldr x15, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x14, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x13 + eor x12, x12, x4 + add x5, x5, x15 + eor x16, x16, x7 + add x5, x5, x12 + add x14, x14, x16 + add x9, x9, x5 + add x5, x5, x14 + # Round 15 + mov x13, v7.d[1] + ldr x15, [x3], #8 + ext v10.16b, v7.16b, v0.16b, #8 + ror x12, x9, #14 + shl v8.2d, v6.2d, #45 + ror x14, x5, #28 + sri v8.2d, v6.2d, #19 + eor x12, x12, x9, ror 18 + shl v9.2d, v6.2d, #3 + eor x14, x14, x5, ror 34 + sri v9.2d, v6.2d, #61 + eor x12, x12, x9, ror 41 + eor v9.16b, v9.16b, v8.16b + eor x14, x14, x5, ror 39 + ushr v8.2d, v6.2d, #6 + add x4, x4, x12 + eor v9.16b, v9.16b, v8.16b + eor x16, x5, x6 + add v7.2d, v7.2d, v9.2d + eor x12, x10, x11 + ext v9.16b, v3.16b, v4.16b, #8 + and x17, x16, x17 + add v7.2d, v7.2d, v9.2d + and x12, x12, x9 + shl v8.2d, v10.2d, #63 + add x4, x4, x13 + sri v8.2d, v10.2d, #1 + eor x12, x12, x11 + tbl v9.16b, {v10.16b}, v11.16b + add x4, x4, x15 + eor v9.16b, v9.16b, v8.16b + eor x17, x17, x6 + ushr v10.2d, v10.2d, #7 + add x4, x4, x12 + eor v9.16b, v9.16b, v10.16b + add x14, x14, x17 + add v7.2d, v7.2d, v9.2d + add x8, x8, x4 + add x4, x4, x14 + subs x27, x27, #1 + bne L_sha512_len_neon_start + # Round 0 + mov x13, v0.d[0] + ldr x15, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x14, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x13 + eor x12, x12, x10 + add x11, x11, x15 + eor x16, x16, x5 + add x11, x11, x12 + add x14, x14, x16 + add x7, x7, x11 + add x11, x11, x14 + # Round 1 + mov x13, v0.d[1] + ldr x15, [x3], #8 + ror x12, x7, #14 + ror x14, x11, #28 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x14, x14, x11, ror 39 + add x10, x10, x12 + eor x16, x11, x4 + eor x12, x8, x9 + and x17, x16, x17 + and x12, x12, x7 + add x10, x10, x13 + eor x12, x12, x9 + add x10, x10, x15 + eor x17, x17, x4 + add x10, x10, x12 + add x14, x14, x17 + add x6, x6, x10 + add x10, x10, x14 + # Round 2 + mov x13, v1.d[0] + ldr x15, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x14, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x13 + eor x12, x12, x8 + add x9, x9, x15 + eor x16, x16, x11 + add x9, x9, x12 + add x14, x14, x16 + add x5, x5, x9 + add x9, x9, x14 + # Round 3 + mov x13, v1.d[1] + ldr x15, [x3], #8 + ror x12, x5, #14 + ror x14, x9, #28 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x14, x14, x9, ror 39 + add x8, x8, x12 + eor x16, x9, x10 + eor x12, x6, x7 + and x17, x16, x17 + and x12, x12, x5 + add x8, x8, x13 + eor x12, x12, x7 + add x8, x8, x15 + eor x17, x17, x10 + add x8, x8, x12 + add x14, x14, x17 + add x4, x4, x8 + add x8, x8, x14 + # Round 4 + mov x13, v2.d[0] + ldr x15, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x14, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, 
x13 + eor x12, x12, x6 + add x7, x7, x15 + eor x16, x16, x9 + add x7, x7, x12 + add x14, x14, x16 + add x11, x11, x7 + add x7, x7, x14 + # Round 5 + mov x13, v2.d[1] + ldr x15, [x3], #8 + ror x12, x11, #14 + ror x14, x7, #28 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x14, x14, x7, ror 39 + add x6, x6, x12 + eor x16, x7, x8 + eor x12, x4, x5 + and x17, x16, x17 + and x12, x12, x11 + add x6, x6, x13 + eor x12, x12, x5 + add x6, x6, x15 + eor x17, x17, x8 + add x6, x6, x12 + add x14, x14, x17 + add x10, x10, x6 + add x6, x6, x14 + # Round 6 + mov x13, v3.d[0] + ldr x15, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x14, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x13 + eor x12, x12, x4 + add x5, x5, x15 + eor x16, x16, x7 + add x5, x5, x12 + add x14, x14, x16 + add x9, x9, x5 + add x5, x5, x14 + # Round 7 + mov x13, v3.d[1] + ldr x15, [x3], #8 + ror x12, x9, #14 + ror x14, x5, #28 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x14, x14, x5, ror 39 + add x4, x4, x12 + eor x16, x5, x6 + eor x12, x10, x11 + and x17, x16, x17 + and x12, x12, x9 + add x4, x4, x13 + eor x12, x12, x11 + add x4, x4, x15 + eor x17, x17, x6 + add x4, x4, x12 + add x14, x14, x17 + add x8, x8, x4 + add x4, x4, x14 + # Round 8 + mov x13, v4.d[0] + ldr x15, [x3], #8 + ror x12, x8, #14 + ror x14, x4, #28 + eor x12, x12, x8, ror 18 + eor x14, x14, x4, ror 34 + eor x12, x12, x8, ror 41 + eor x14, x14, x4, ror 39 + add x11, x11, x12 + eor x17, x4, x5 + eor x12, x9, x10 + and x16, x17, x16 + and x12, x12, x8 + add x11, x11, x13 + eor x12, x12, x10 + add x11, x11, x15 + eor x16, x16, x5 + add x11, x11, x12 + add x14, x14, x16 + add x7, x7, x11 + add x11, x11, x14 + # Round 9 + mov x13, v4.d[1] + ldr x15, [x3], #8 + ror x12, x7, #14 + ror x14, x11, #28 + eor x12, x12, x7, ror 18 + eor x14, x14, x11, ror 34 + eor x12, x12, x7, ror 41 + eor x14, x14, x11, ror 39 + add x10, x10, x12 + eor x16, x11, x4 + eor x12, x8, x9 + and x17, x16, x17 + and x12, x12, x7 + add x10, x10, x13 + eor x12, x12, x9 + add x10, x10, x15 + eor x17, x17, x4 + add x10, x10, x12 + add x14, x14, x17 + add x6, x6, x10 + add x10, x10, x14 + # Round 10 + mov x13, v5.d[0] + ldr x15, [x3], #8 + ror x12, x6, #14 + ror x14, x10, #28 + eor x12, x12, x6, ror 18 + eor x14, x14, x10, ror 34 + eor x12, x12, x6, ror 41 + eor x14, x14, x10, ror 39 + add x9, x9, x12 + eor x17, x10, x11 + eor x12, x7, x8 + and x16, x17, x16 + and x12, x12, x6 + add x9, x9, x13 + eor x12, x12, x8 + add x9, x9, x15 + eor x16, x16, x11 + add x9, x9, x12 + add x14, x14, x16 + add x5, x5, x9 + add x9, x9, x14 + # Round 11 + mov x13, v5.d[1] + ldr x15, [x3], #8 + ror x12, x5, #14 + ror x14, x9, #28 + eor x12, x12, x5, ror 18 + eor x14, x14, x9, ror 34 + eor x12, x12, x5, ror 41 + eor x14, x14, x9, ror 39 + add x8, x8, x12 + eor x16, x9, x10 + eor x12, x6, x7 + and x17, x16, x17 + and x12, x12, x5 + add x8, x8, x13 + eor x12, x12, x7 + add x8, x8, x15 + eor x17, x17, x10 + add x8, x8, x12 + add x14, x14, x17 + add x4, x4, x8 + add x8, x8, x14 + # Round 12 + mov x13, v6.d[0] + ldr x15, [x3], #8 + ror x12, x4, #14 + ror x14, x8, #28 + eor x12, x12, x4, ror 18 + eor x14, x14, x8, ror 34 + eor x12, x12, x4, ror 41 + eor x14, x14, x8, ror 39 + add x7, x7, x12 + eor x17, x8, x9 + eor x12, x5, x6 + and x16, x17, x16 + and x12, x12, x4 + add x7, x7, x13 
+ eor x12, x12, x6 + add x7, x7, x15 + eor x16, x16, x9 + add x7, x7, x12 + add x14, x14, x16 + add x11, x11, x7 + add x7, x7, x14 + # Round 13 + mov x13, v6.d[1] + ldr x15, [x3], #8 + ror x12, x11, #14 + ror x14, x7, #28 + eor x12, x12, x11, ror 18 + eor x14, x14, x7, ror 34 + eor x12, x12, x11, ror 41 + eor x14, x14, x7, ror 39 + add x6, x6, x12 + eor x16, x7, x8 + eor x12, x4, x5 + and x17, x16, x17 + and x12, x12, x11 + add x6, x6, x13 + eor x12, x12, x5 + add x6, x6, x15 + eor x17, x17, x8 + add x6, x6, x12 + add x14, x14, x17 + add x10, x10, x6 + add x6, x6, x14 + # Round 14 + mov x13, v7.d[0] + ldr x15, [x3], #8 + ror x12, x10, #14 + ror x14, x6, #28 + eor x12, x12, x10, ror 18 + eor x14, x14, x6, ror 34 + eor x12, x12, x10, ror 41 + eor x14, x14, x6, ror 39 + add x5, x5, x12 + eor x17, x6, x7 + eor x12, x11, x4 + and x16, x17, x16 + and x12, x12, x10 + add x5, x5, x13 + eor x12, x12, x4 + add x5, x5, x15 + eor x16, x16, x7 + add x5, x5, x12 + add x14, x14, x16 + add x9, x9, x5 + add x5, x5, x14 + # Round 15 + mov x13, v7.d[1] + ldr x15, [x3], #8 + ror x12, x9, #14 + ror x14, x5, #28 + eor x12, x12, x9, ror 18 + eor x14, x14, x5, ror 34 + eor x12, x12, x9, ror 41 + eor x14, x14, x5, ror 39 + add x4, x4, x12 + eor x16, x5, x6 + eor x12, x10, x11 + and x17, x16, x17 + and x12, x12, x9 + add x4, x4, x13 + eor x12, x12, x11 + add x4, x4, x15 + eor x17, x17, x6 + add x4, x4, x12 + add x14, x14, x17 + add x8, x8, x4 + add x4, x4, x14 + add x11, x11, x26 + add x10, x10, x25 + add x9, x9, x24 + add x8, x8, x23 + add x7, x7, x22 + add x6, x6, x21 + add x5, x5, x20 + add x4, x4, x19 + adr x3, L_SHA512_transform_neon_len_k + subs w2, w2, #0x80 + bne L_sha512_len_neon_begin + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + ldr x17, [x29, #16] + ldr x19, [x29, #24] + ldp x20, x21, [x29, #32] + ldp x22, x23, [x29, #48] + ldp x24, x25, [x29, #64] + ldp x26, x27, [x29, #80] + ldp d8, d9, [x29, #96] + ldp d10, d11, [x29, #112] + ldp x29, x30, [sp], #0x80 + ret + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* __aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c new file mode 100644 index 0000000..d323598 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -0,0 +1,1041 @@ +/* armv8-sha512-asm + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c + */ +#ifdef __aarch64__ +#include <stdint.h> + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#include <wolfssl/wolfcrypt/sha512.h> + +static const uint64_t L_SHA512_transform_neon_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +static const uint64_t L_SHA512_transform_neon_len_ror8[] = { + 0x7060504030201UL, + 0x80f0e0d0c0b0a09UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-16]!\n\t" + "add x29, sp, #0\n\t" + "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" + "adr x27, %[L_SHA512_transform_neon_len_ror8]\n\t" + "ld1 {v11.16b}, [x27]\n\t" + /* Load digest into working vars */ + "ldp x4, x5, [%x[sha512]]\n\t" + "ldp x6, x7, [%x[sha512], #16]\n\t" + "ldp x8, x9, [%x[sha512], #32]\n\t" + "ldp x10, x11, [%x[sha512], #48]\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load W */ + /* Copy digest to add in at end */ + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" + "mov x19, x4\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" + "mov x20, x5\n\t" + "rev64 v0.16b, v0.16b\n\t" + "mov x21, x6\n\t" + "rev64 v1.16b, v1.16b\n\t" + "mov x22, x7\n\t" + "rev64 
v2.16b, v2.16b\n\t" + "mov x23, x8\n\t" + "rev64 v3.16b, v3.16b\n\t" + "mov x24, x9\n\t" + "rev64 v4.16b, v4.16b\n\t" + "mov x25, x10\n\t" + "rev64 v5.16b, v5.16b\n\t" + "mov x26, x11\n\t" + "rev64 v6.16b, v6.16b\n\t" + "rev64 v7.16b, v7.16b\n\t" + /* Pre-calc: b ^ c */ + "eor x16, x5, x6\n\t" + "mov x27, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "mov x13, v0.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x14, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x13\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x15\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x14, x14, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x14\n\t" + /* Round 1 */ + "mov x13, v0.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v0.16b, v1.16b, #8\n\t" + "ror x12, x7, #14\n\t" + "shl v8.2d, v7.2d, #45\n\t" + "ror x14, x11, #28\n\t" + "sri v8.2d, v7.2d, #19\n\t" + "eor x12, x12, x7, ror 18\n\t" + "shl v9.2d, v7.2d, #3\n\t" + "eor x14, x14, x11, ror 34\n\t" + "sri v9.2d, v7.2d, #61\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x11, ror 39\n\t" + "ushr v8.2d, v7.2d, #6\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x11, x4\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v4.16b, v5.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x10, x10, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v0.2d, v0.2d, v9.2d\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x14\n\t" + /* Round 2 */ + "mov x13, v1.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x14, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x13\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x15\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x14, x14, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x14\n\t" + /* Round 3 */ + "mov x13, v1.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v1.16b, v2.16b, #8\n\t" + "ror x12, x5, #14\n\t" + "shl v8.2d, v0.2d, #45\n\t" + "ror x14, x9, #28\n\t" + "sri v8.2d, v0.2d, #19\n\t" + "eor x12, x12, x5, ror 18\n\t" + "shl v9.2d, v0.2d, #3\n\t" + "eor x14, x14, x9, ror 34\n\t" + "sri v9.2d, v0.2d, #61\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x9, ror 39\n\t" + "ushr v8.2d, v0.2d, #6\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x9, x10\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v5.16b, v6.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, 
x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x8, x8, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x10\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v1.2d, v1.2d, v9.2d\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x14\n\t" + /* Round 4 */ + "mov x13, v2.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x14, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x13\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x15\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x14, x14, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x14\n\t" + /* Round 5 */ + "mov x13, v2.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v2.16b, v3.16b, #8\n\t" + "ror x12, x11, #14\n\t" + "shl v8.2d, v1.2d, #45\n\t" + "ror x14, x7, #28\n\t" + "sri v8.2d, v1.2d, #19\n\t" + "eor x12, x12, x11, ror 18\n\t" + "shl v9.2d, v1.2d, #3\n\t" + "eor x14, x14, x7, ror 34\n\t" + "sri v9.2d, v1.2d, #61\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x7, ror 39\n\t" + "ushr v8.2d, v1.2d, #6\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x7, x8\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v6.16b, v7.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x6, x6, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v2.2d, v2.2d, v9.2d\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x14\n\t" + /* Round 6 */ + "mov x13, v3.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x14, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x13\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x15\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x14, x14, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x14\n\t" + /* Round 7 */ + "mov x13, v3.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v3.16b, v4.16b, #8\n\t" + "ror x12, x9, #14\n\t" + "shl v8.2d, v2.2d, #45\n\t" + "ror x14, x5, #28\n\t" + "sri v8.2d, v2.2d, #19\n\t" + "eor x12, x12, x9, ror 18\n\t" + "shl v9.2d, v2.2d, #3\n\t" + "eor x14, x14, x5, ror 34\n\t" + "sri v9.2d, v2.2d, #61\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x5, ror 39\n\t" + "ushr v8.2d, v2.2d, #6\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x5, x6\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v7.16b, v0.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x4, x4, x15\n\t" + 
"eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v3.2d, v3.2d, v9.2d\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x14\n\t" + /* Round 8 */ + "mov x13, v4.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x14, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x13\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x15\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x14, x14, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x14\n\t" + /* Round 9 */ + "mov x13, v4.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v4.16b, v5.16b, #8\n\t" + "ror x12, x7, #14\n\t" + "shl v8.2d, v3.2d, #45\n\t" + "ror x14, x11, #28\n\t" + "sri v8.2d, v3.2d, #19\n\t" + "eor x12, x12, x7, ror 18\n\t" + "shl v9.2d, v3.2d, #3\n\t" + "eor x14, x14, x11, ror 34\n\t" + "sri v9.2d, v3.2d, #61\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x11, ror 39\n\t" + "ushr v8.2d, v3.2d, #6\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x11, x4\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "eor x12, x8, x9\n\t" + "ext v9.16b, v0.16b, v1.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "and x12, x12, x7\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x10, x10, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x9\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x10, x10, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x4\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x10, x10, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v4.2d, v4.2d, v9.2d\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x14\n\t" + /* Round 10 */ + "mov x13, v5.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x14, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x13\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x15\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x14, x14, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x14\n\t" + /* Round 11 */ + "mov x13, v5.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v5.16b, v6.16b, #8\n\t" + "ror x12, x5, #14\n\t" + "shl v8.2d, v4.2d, #45\n\t" + "ror x14, x9, #28\n\t" + "sri v8.2d, v4.2d, #19\n\t" + "eor x12, x12, x5, ror 18\n\t" + "shl v9.2d, v4.2d, #3\n\t" + "eor x14, x14, x9, ror 34\n\t" + "sri v9.2d, v4.2d, #61\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x9, ror 39\n\t" + "ushr v8.2d, v4.2d, #6\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x9, x10\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "eor x12, x6, x7\n\t" + "ext v9.16b, v1.16b, v2.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "and x12, x12, x5\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x8, x8, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x7\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x8, x8, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, 
x10\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x8, x8, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v5.2d, v5.2d, v9.2d\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x14\n\t" + /* Round 12 */ + "mov x13, v6.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x14, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x13\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x15\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x14, x14, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x14\n\t" + /* Round 13 */ + "mov x13, v6.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v6.16b, v7.16b, #8\n\t" + "ror x12, x11, #14\n\t" + "shl v8.2d, v5.2d, #45\n\t" + "ror x14, x7, #28\n\t" + "sri v8.2d, v5.2d, #19\n\t" + "eor x12, x12, x11, ror 18\n\t" + "shl v9.2d, v5.2d, #3\n\t" + "eor x14, x14, x7, ror 34\n\t" + "sri v9.2d, v5.2d, #61\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x7, ror 39\n\t" + "ushr v8.2d, v5.2d, #6\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x7, x8\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "eor x12, x4, x5\n\t" + "ext v9.16b, v2.16b, v3.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "and x12, x12, x11\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x6, x6, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x5\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x6, x6, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x8\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x6, x6, x12\n\t" + "eor v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v6.2d, v6.2d, v9.2d\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x14\n\t" + /* Round 14 */ + "mov x13, v7.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x14, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x13\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x15\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x14, x14, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x14\n\t" + /* Round 15 */ + "mov x13, v7.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ext v10.16b, v7.16b, v0.16b, #8\n\t" + "ror x12, x9, #14\n\t" + "shl v8.2d, v6.2d, #45\n\t" + "ror x14, x5, #28\n\t" + "sri v8.2d, v6.2d, #19\n\t" + "eor x12, x12, x9, ror 18\n\t" + "shl v9.2d, v6.2d, #3\n\t" + "eor x14, x14, x5, ror 34\n\t" + "sri v9.2d, v6.2d, #61\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x14, x14, x5, ror 39\n\t" + "ushr v8.2d, v6.2d, #6\n\t" + "add x4, x4, x12\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x16, x5, x6\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "eor x12, x10, x11\n\t" + "ext v9.16b, v3.16b, v4.16b, #8\n\t" + "and x17, x16, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "and x12, x12, x9\n\t" + "shl v8.2d, v10.2d, #63\n\t" + "add x4, x4, x13\n\t" + "sri v8.2d, v10.2d, #1\n\t" + "eor x12, x12, x11\n\t" + "tbl v9.16b, {v10.16b}, v11.16b\n\t" + "add x4, x4, x15\n\t" + "eor v9.16b, v9.16b, v8.16b\n\t" + "eor x17, x17, x6\n\t" + "ushr v10.2d, v10.2d, #7\n\t" + "add x4, x4, x12\n\t" + "eor 
v9.16b, v9.16b, v10.16b\n\t" + "add x14, x14, x17\n\t" + "add v7.2d, v7.2d, v9.2d\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x14\n\t" + "subs x27, x27, #1\n\t" + "bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 */ + "mov x13, v0.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x14, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x13\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x15\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x14, x14, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x14\n\t" + /* Round 1 */ + "mov x13, v0.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x7, #14\n\t" + "ror x14, x11, #28\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x14, x14, x11, ror 39\n\t" + "add x10, x10, x12\n\t" + "eor x16, x11, x4\n\t" + "eor x12, x8, x9\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x7\n\t" + "add x10, x10, x13\n\t" + "eor x12, x12, x9\n\t" + "add x10, x10, x15\n\t" + "eor x17, x17, x4\n\t" + "add x10, x10, x12\n\t" + "add x14, x14, x17\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x14\n\t" + /* Round 2 */ + "mov x13, v1.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x14, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x13\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x15\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x14, x14, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x14\n\t" + /* Round 3 */ + "mov x13, v1.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x5, #14\n\t" + "ror x14, x9, #28\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x14, x14, x9, ror 39\n\t" + "add x8, x8, x12\n\t" + "eor x16, x9, x10\n\t" + "eor x12, x6, x7\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x5\n\t" + "add x8, x8, x13\n\t" + "eor x12, x12, x7\n\t" + "add x8, x8, x15\n\t" + "eor x17, x17, x10\n\t" + "add x8, x8, x12\n\t" + "add x14, x14, x17\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x14\n\t" + /* Round 4 */ + "mov x13, v2.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x14, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x13\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x15\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x14, x14, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x14\n\t" + /* Round 5 */ + "mov x13, v2.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x11, #14\n\t" + "ror x14, x7, #28\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x14, x14, x7, ror 39\n\t" + "add x6, x6, x12\n\t" + "eor x16, x7, x8\n\t" + "eor x12, x4, x5\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x11\n\t" + "add x6, x6, x13\n\t" + "eor x12, x12, x5\n\t" + "add x6, x6, x15\n\t" + "eor x17, x17, x8\n\t" + "add 
x6, x6, x12\n\t" + "add x14, x14, x17\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x14\n\t" + /* Round 6 */ + "mov x13, v3.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x14, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x13\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x15\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x14, x14, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x14\n\t" + /* Round 7 */ + "mov x13, v3.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x9, #14\n\t" + "ror x14, x5, #28\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x14, x14, x5, ror 39\n\t" + "add x4, x4, x12\n\t" + "eor x16, x5, x6\n\t" + "eor x12, x10, x11\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x9\n\t" + "add x4, x4, x13\n\t" + "eor x12, x12, x11\n\t" + "add x4, x4, x15\n\t" + "eor x17, x17, x6\n\t" + "add x4, x4, x12\n\t" + "add x14, x14, x17\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x14\n\t" + /* Round 8 */ + "mov x13, v4.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x8, #14\n\t" + "ror x14, x4, #28\n\t" + "eor x12, x12, x8, ror 18\n\t" + "eor x14, x14, x4, ror 34\n\t" + "eor x12, x12, x8, ror 41\n\t" + "eor x14, x14, x4, ror 39\n\t" + "add x11, x11, x12\n\t" + "eor x17, x4, x5\n\t" + "eor x12, x9, x10\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x8\n\t" + "add x11, x11, x13\n\t" + "eor x12, x12, x10\n\t" + "add x11, x11, x15\n\t" + "eor x16, x16, x5\n\t" + "add x11, x11, x12\n\t" + "add x14, x14, x16\n\t" + "add x7, x7, x11\n\t" + "add x11, x11, x14\n\t" + /* Round 9 */ + "mov x13, v4.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x7, #14\n\t" + "ror x14, x11, #28\n\t" + "eor x12, x12, x7, ror 18\n\t" + "eor x14, x14, x11, ror 34\n\t" + "eor x12, x12, x7, ror 41\n\t" + "eor x14, x14, x11, ror 39\n\t" + "add x10, x10, x12\n\t" + "eor x16, x11, x4\n\t" + "eor x12, x8, x9\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x7\n\t" + "add x10, x10, x13\n\t" + "eor x12, x12, x9\n\t" + "add x10, x10, x15\n\t" + "eor x17, x17, x4\n\t" + "add x10, x10, x12\n\t" + "add x14, x14, x17\n\t" + "add x6, x6, x10\n\t" + "add x10, x10, x14\n\t" + /* Round 10 */ + "mov x13, v5.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x6, #14\n\t" + "ror x14, x10, #28\n\t" + "eor x12, x12, x6, ror 18\n\t" + "eor x14, x14, x10, ror 34\n\t" + "eor x12, x12, x6, ror 41\n\t" + "eor x14, x14, x10, ror 39\n\t" + "add x9, x9, x12\n\t" + "eor x17, x10, x11\n\t" + "eor x12, x7, x8\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x6\n\t" + "add x9, x9, x13\n\t" + "eor x12, x12, x8\n\t" + "add x9, x9, x15\n\t" + "eor x16, x16, x11\n\t" + "add x9, x9, x12\n\t" + "add x14, x14, x16\n\t" + "add x5, x5, x9\n\t" + "add x9, x9, x14\n\t" + /* Round 11 */ + "mov x13, v5.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x5, #14\n\t" + "ror x14, x9, #28\n\t" + "eor x12, x12, x5, ror 18\n\t" + "eor x14, x14, x9, ror 34\n\t" + "eor x12, x12, x5, ror 41\n\t" + "eor x14, x14, x9, ror 39\n\t" + "add x8, x8, x12\n\t" + "eor x16, x9, x10\n\t" + "eor x12, x6, x7\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x5\n\t" + "add x8, x8, x13\n\t" + "eor x12, x12, x7\n\t" + "add x8, x8, x15\n\t" + "eor x17, x17, x10\n\t" + "add x8, x8, x12\n\t" + "add x14, x14, x17\n\t" + "add x4, x4, x8\n\t" + "add x8, x8, x14\n\t" + /* Round 12 
*/ + "mov x13, v6.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x4, #14\n\t" + "ror x14, x8, #28\n\t" + "eor x12, x12, x4, ror 18\n\t" + "eor x14, x14, x8, ror 34\n\t" + "eor x12, x12, x4, ror 41\n\t" + "eor x14, x14, x8, ror 39\n\t" + "add x7, x7, x12\n\t" + "eor x17, x8, x9\n\t" + "eor x12, x5, x6\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x4\n\t" + "add x7, x7, x13\n\t" + "eor x12, x12, x6\n\t" + "add x7, x7, x15\n\t" + "eor x16, x16, x9\n\t" + "add x7, x7, x12\n\t" + "add x14, x14, x16\n\t" + "add x11, x11, x7\n\t" + "add x7, x7, x14\n\t" + /* Round 13 */ + "mov x13, v6.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x11, #14\n\t" + "ror x14, x7, #28\n\t" + "eor x12, x12, x11, ror 18\n\t" + "eor x14, x14, x7, ror 34\n\t" + "eor x12, x12, x11, ror 41\n\t" + "eor x14, x14, x7, ror 39\n\t" + "add x6, x6, x12\n\t" + "eor x16, x7, x8\n\t" + "eor x12, x4, x5\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x11\n\t" + "add x6, x6, x13\n\t" + "eor x12, x12, x5\n\t" + "add x6, x6, x15\n\t" + "eor x17, x17, x8\n\t" + "add x6, x6, x12\n\t" + "add x14, x14, x17\n\t" + "add x10, x10, x6\n\t" + "add x6, x6, x14\n\t" + /* Round 14 */ + "mov x13, v7.d[0]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x10, #14\n\t" + "ror x14, x6, #28\n\t" + "eor x12, x12, x10, ror 18\n\t" + "eor x14, x14, x6, ror 34\n\t" + "eor x12, x12, x10, ror 41\n\t" + "eor x14, x14, x6, ror 39\n\t" + "add x5, x5, x12\n\t" + "eor x17, x6, x7\n\t" + "eor x12, x11, x4\n\t" + "and x16, x17, x16\n\t" + "and x12, x12, x10\n\t" + "add x5, x5, x13\n\t" + "eor x12, x12, x4\n\t" + "add x5, x5, x15\n\t" + "eor x16, x16, x7\n\t" + "add x5, x5, x12\n\t" + "add x14, x14, x16\n\t" + "add x9, x9, x5\n\t" + "add x5, x5, x14\n\t" + /* Round 15 */ + "mov x13, v7.d[1]\n\t" + "ldr x15, [x3], #8\n\t" + "ror x12, x9, #14\n\t" + "ror x14, x5, #28\n\t" + "eor x12, x12, x9, ror 18\n\t" + "eor x14, x14, x5, ror 34\n\t" + "eor x12, x12, x9, ror 41\n\t" + "eor x14, x14, x5, ror 39\n\t" + "add x4, x4, x12\n\t" + "eor x16, x5, x6\n\t" + "eor x12, x10, x11\n\t" + "and x17, x16, x17\n\t" + "and x12, x12, x9\n\t" + "add x4, x4, x13\n\t" + "eor x12, x12, x11\n\t" + "add x4, x4, x15\n\t" + "eor x17, x17, x6\n\t" + "add x4, x4, x12\n\t" + "add x14, x14, x17\n\t" + "add x8, x8, x4\n\t" + "add x4, x4, x14\n\t" + "add x11, x11, x26\n\t" + "add x10, x10, x25\n\t" + "add x9, x9, x24\n\t" + "add x8, x8, x23\n\t" + "add x7, x7, x22\n\t" + "add x6, x6, x21\n\t" + "add x5, x5, x20\n\t" + "add x4, x4, x19\n\t" + "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" + "subs %w[len], %w[len], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + "stp x4, x5, [%x[sha512]]\n\t" + "stp x6, x7, [%x[sha512], #16]\n\t" + "stp x8, x9, [%x[sha512], #32]\n\t" + "stp x10, x11, [%x[sha512], #48]\n\t" + "ldp x29, x30, [sp], #16\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +} + +#endif /* WOLFSSL_ARMASM */ +#endif /* __aarch64__ */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512.c new file mode 100644 index 0000000..e909c7c --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-sha512.c @@ -0,0 +1,715 
@@ +/* sha512.c + * + * Copyright (C) 2006-2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384) + +#include <wolfssl/wolfcrypt/sha512.h> +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/cpuid.h> +#include <wolfssl/wolfcrypt/hash.h> + +#include <wolfssl/wolfcrypt/logging.h> + +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + +#ifdef WOLFSSL_SHA512 + +static int InitSha512(wc_Sha512* sha512) +{ + if (sha512 == NULL) + return BAD_FUNC_ARG; + + sha512->digest[0] = W64LIT(0x6a09e667f3bcc908); + sha512->digest[1] = W64LIT(0xbb67ae8584caa73b); + sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b); + sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1); + sha512->digest[4] = W64LIT(0x510e527fade682d1); + sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f); + sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b); + sha512->digest[7] = W64LIT(0x5be0cd19137e2179); + + sha512->buffLen = 0; + sha512->loLen = 0; + sha512->hiLen = 0; +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + sha512->flags = 0; +#endif + + return 0; +} + +#endif /* WOLFSSL_SHA512 */ + +#ifdef WOLFSSL_SHA512 + +int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) +{ + int ret = 0; + + if (sha512 == NULL) + return BAD_FUNC_ARG; + + sha512->heap = heap; + + ret = InitSha512(sha512); + if (ret != 0) + return ret; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + sha512->W = NULL; +#endif + + (void)devId; + + return ret; +} + +#endif /* WOLFSSL_SHA512 */ + +#ifndef WOLFSSL_ARMASM +static const word64 K512[80] = { + W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), + W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), + W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), + W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), + W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), + W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), + W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), + W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), + W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), + W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), + W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), + W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), + W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), + W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), + W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), + W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), + W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), + W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), + 
W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), + W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), + W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), + W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), + W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), + W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), + W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), + W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), + W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), + W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), + W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), + W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), + W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), + W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), + W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), + W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), + W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), + W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), + W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), + W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), + W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), + W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) +}; + +#ifdef LITTLE_ENDIAN_ORDER +#define blk0(i) (W[i] = ByteReverseWord64(DATA[i])) +#else +#define blk0(i) (W[i] = DATA[i]) +#endif + +#define blk2(i) ( \ + W[ i ] += \ + s1(W[(i- 2) & 15])+ \ + W[(i- 7) & 15] + \ + s0(W[(i-15) & 15]) \ + ) + +#define Ch(x,y,z) (z ^ ((z ^ y) & x)) +#define Maj(x,y,z) (y ^ ((y ^ z) & (x ^ y))) + +#define a(i) T[(0-i) & 7] +#define b(i) T[(1-i) & 7] +#define c(i) T[(2-i) & 7] +#define d(i) T[(3-i) & 7] +#define e(i) T[(4-i) & 7] +#define f(i) T[(5-i) & 7] +#define g(i) T[(6-i) & 7] +#define h(i) T[(7-i) & 7] + +#define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39)) +#define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41)) +#define s0(x) (rotrFixed64(x, 1) ^ rotrFixed64(x, 8) ^ (x>>7)) +#define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6)) + +#define R0(i) \ + h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk0(i); \ + d(i) += h(i); \ + h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) +#define R(i) \ + h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk2(i); \ + d(i) += h(i); \ + h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) + +#define DATA sha512->buffer +static void Transform_Sha512(wc_Sha512* sha512) +{ + const word64* K = K512; + word32 j; + word64 T[8]; + word64 W[16]; + + /* Copy digest to working vars */ + T[0] = sha512->digest[0]; + T[1] = sha512->digest[1]; + T[2] = sha512->digest[2]; + T[3] = sha512->digest[3]; + T[4] = sha512->digest[4]; + T[5] = sha512->digest[5]; + T[6] = sha512->digest[6]; + T[7] = sha512->digest[7]; + + /* 80 operations, partially loop unrolled */ + j = 0; + R0( 0); R0( 1); R0( 2); R0( 3); + R0( 4); R0( 5); R0( 6); R0( 7); + R0( 8); R0( 9); R0(10); R0(11); + R0(12); R0(13); R0(14); R0(15); + for (j = 16; j < 80; j += 16) { + R( 0); R( 1); R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + + /* Add the working vars back into digest */ + sha512->digest[0] += T[0]; + sha512->digest[1] += T[1]; + sha512->digest[2] += T[2]; + sha512->digest[3] += T[3]; + sha512->digest[4] += T[4]; + sha512->digest[5] += T[5]; + sha512->digest[6] += T[6]; + sha512->digest[7] += T[7]; + + return 0; +} +#undef DATA + +#define DATA ((word64*)data) +static void Transform_Sha512_Len(wc_Sha512* sha512, const byte* 
data, word32 len) +{ + const word64* K = K512; + word32 j; + word64 T[8]; + word64 TO[8]; + word64 W[16]; + + /* Copy digest to working vars */ + T[0] = sha512->digest[0]; + T[1] = sha512->digest[1]; + T[2] = sha512->digest[2]; + T[3] = sha512->digest[3]; + T[4] = sha512->digest[4]; + T[5] = sha512->digest[5]; + T[6] = sha512->digest[6]; + T[7] = sha512->digest[7]; + + do { + TO[0] = T[0]; + TO[1] = T[1]; + TO[2] = T[2]; + TO[3] = T[3]; + TO[4] = T[4]; + TO[5] = T[5]; + TO[6] = T[6]; + TO[7] = T[7]; + + /* 80 operations, partially loop unrolled */ + j = 0; + R0( 0); R0( 1); R0( 2); R0( 3); + R0( 4); R0( 5); R0( 6); R0( 7); + R0( 8); R0( 9); R0(10); R0(11); + R0(12); R0(13); R0(14); R0(15); + for (j = 16; j < 80; j += 16) { + R( 0); R( 1); R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + + T[0] += TO[0]; + T[1] += TO[1]; + T[2] += TO[2]; + T[3] += TO[3]; + T[4] += TO[4]; + T[5] += TO[5]; + T[6] += TO[6]; + T[7] += TO[7]; + + data += 128; + len -= 128; + } + while (len > 0); + + /* Add the working vars back into digest */ + sha512->digest[0] = T[0]; + sha512->digest[1] = T[1]; + sha512->digest[2] = T[2]; + sha512->digest[3] = T[3]; + sha512->digest[4] = T[4]; + sha512->digest[5] = T[5]; + sha512->digest[6] = T[6]; + sha512->digest[7] = T[7]; + + return 0; +} +#undef DATA +#endif + + +static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) +{ + word64 tmp = sha512->loLen; + if ( (sha512->loLen += len) < tmp) + sha512->hiLen++; /* carry low to high */ +} + +static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + int ret = 0; + /* do block size increments */ + byte* local = (byte*)sha512->buffer; + word32 blocksLen; + + /* check that internal buffLen is valid */ + if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE) + return BUFFER_E; + + AddLength(sha512, len); + + if (sha512->buffLen > 0) { + word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); + if (add > 0) { + XMEMCPY(&local[sha512->buffLen], data, add); + + sha512->buffLen += add; + data += add; + len -= add; + } + + if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); +#else + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + sha512->buffLen = 0; + } + } + + blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); + if (blocksLen > 0) { + /* Byte reversal performed in function if required. 
*/ + Transform_Sha512_Len(sha512, data, blocksLen); + data += blocksLen; + len -= blocksLen; + } + + if (len > 0) { + XMEMCPY(local, data, len); + sha512->buffLen = len; + } + + return ret; +} + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + if (sha512 == NULL || (data == NULL && len > 0)) { + return BAD_FUNC_ARG; + } + + return Sha512Update(sha512, data, len); +} + +#endif /* WOLFSSL_SHA512 */ + +static WC_INLINE int Sha512Final(wc_Sha512* sha512) +{ + byte* local = (byte*)sha512->buffer; + + if (sha512 == NULL) { + return BAD_FUNC_ARG; + } + + local[sha512->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha512->buffLen > WC_SHA512_PAD_SIZE) { + XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - + sha512->buffLen); + sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen; +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); +#else + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + + sha512->buffLen = 0; + } + XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen); + + /* put lengths in bits */ + sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) + + (sha512->hiLen << 3); + sha512->loLen = sha512->loLen << 3; + + /* store lengths */ + /* ! length ordering dependent on digest endian type ! */ + + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; + + ByteReverseWords64( + &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); +#else + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE); +#endif + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); +#endif + + return 0; +} + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) +{ +#ifdef LITTLE_ENDIAN_ORDER + word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; +#endif + + if (sha512 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64((word64*)digest, (word64*)sha512->digest, + WC_SHA512_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE); +#else + XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); +#endif + + return 0; +} + +int wc_Sha512Final(wc_Sha512* sha512, byte* hash) +{ + int ret; + + if (sha512 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = Sha512Final(sha512); + if (ret != 0) + return ret; + + XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); + + return InitSha512(sha512); /* reset state */ +} + +int wc_InitSha512(wc_Sha512* sha512) +{ + return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID); +} + +void wc_Sha512Free(wc_Sha512* sha512) +{ + if (sha512 == NULL) + return; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + if (sha512->W != NULL) { + XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + sha512->W = NULL; + } +#endif +} + +#endif /* WOLFSSL_SHA512 */ + +/* -------------------------------------------------------------------------- */ +/* SHA384 */ +/* -------------------------------------------------------------------------- */ +#ifdef WOLFSSL_SHA384 + +static int InitSha384(wc_Sha384* sha384) +{ + if (sha384 == NULL) { + return BAD_FUNC_ARG; + } + + sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8); + sha384->digest[1] = 
W64LIT(0x629a292a367cd507); + sha384->digest[2] = W64LIT(0x9159015a3070dd17); + sha384->digest[3] = W64LIT(0x152fecd8f70e5939); + sha384->digest[4] = W64LIT(0x67332667ffc00b31); + sha384->digest[5] = W64LIT(0x8eb44a8768581511); + sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7); + sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4); + + sha384->buffLen = 0; + sha384->loLen = 0; + sha384->hiLen = 0; +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + sha384->flags = 0; +#endif + + return 0; +} + +int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len) +{ + if (sha384 == NULL || (data == NULL && len > 0)) { + return BAD_FUNC_ARG; + } + + return Sha512Update((wc_Sha512*)sha384, data, len); +} + + +int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash) +{ +#ifdef LITTLE_ENDIAN_ORDER + word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)]; +#endif + + if (sha384 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64((word64*)digest, (word64*)sha384->digest, + WC_SHA384_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE); +#else + XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); +#endif + + return 0; +} + +int wc_Sha384Final(wc_Sha384* sha384, byte* hash) +{ + int ret; + + if (sha384 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = Sha512Final((wc_Sha512*)sha384); + if (ret != 0) + return ret; + + XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); + + return InitSha384(sha384); /* reset state */ +} + +int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId) +{ + int ret; + + if (sha384 == NULL) { + return BAD_FUNC_ARG; + } + + sha384->heap = heap; + ret = InitSha384(sha384); + if (ret != 0) + return ret; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + sha384->W = NULL; +#endif + + (void)devId; + + return ret; +} + +int wc_InitSha384(wc_Sha384* sha384) +{ + return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID); +} + +void wc_Sha384Free(wc_Sha384* sha384) +{ + if (sha384 == NULL) + return; + +#ifdef WOLFSSL_SMALL_STACK_CACHE + if (sha384->W != NULL) { + XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + sha384->W = NULL; + } +#endif +} + +#endif /* WOLFSSL_SHA384 */ + +#ifdef WOLFSSL_SHA512 + +int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash) +{ + int ret; + wc_Sha512 tmpSha512; + + if (sha512 == NULL || hash == NULL) + return BAD_FUNC_ARG; + + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + ret = wc_Sha512Final(&tmpSha512, hash); + wc_Sha512Free(&tmpSha512); + } + return ret; +} + +int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst) +{ + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha512)); +#ifdef WOLFSSL_SMALL_STACK_CACHE + dst->W = NULL; +#endif + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + dst->flags |= WC_HASH_FLAG_ISCOPY; +#endif + + return ret; +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha512SetFlags(wc_Sha512* sha512, word32 flags) +{ + if (sha512) { + sha512->flags = flags; + } + return 0; +} +int wc_Sha512GetFlags(wc_Sha512* sha512, word32* flags) +{ + if (sha512 && flags) { + *flags = sha512->flags; + } + return 0; +} +#endif + +#endif /* WOLFSSL_SHA512 */ + +#ifdef WOLFSSL_SHA384 + +int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash) +{ + int ret; + wc_Sha384 tmpSha384; + + if (sha384 == NULL || hash == NULL) + return BAD_FUNC_ARG; + ret = wc_Sha384Copy(sha384, &tmpSha384); + if (ret == 0) { + ret = wc_Sha384Final(&tmpSha384, hash); + wc_Sha384Free(&tmpSha384); + } + return 
ret; +} +int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst) +{ + int ret = 0; + + if (src == NULL || dst == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(dst, src, sizeof(wc_Sha384)); +#ifdef WOLFSSL_SMALL_STACK_CACHE + dst->W = NULL; +#endif + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + dst->flags |= WC_HASH_FLAG_ISCOPY; +#endif + + return ret; +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha384SetFlags(wc_Sha384* sha384, word32 flags) +{ + if (sha384) { + sha384->flags = flags; + } + return 0; +} +int wc_Sha384GetFlags(wc_Sha384* sha384, word32* flags) +{ + if (sha384 && flags) { + *flags = sha384->flags; + } + return 0; +} +#endif + +#endif /* WOLFSSL_SHA384 */ + +#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */ +#endif /* WOLFSSL_ARMASM */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/cryptoCell.c b/client/wolfssl/wolfcrypt/src/port/arm/cryptoCell.c new file mode 100644 index 0000000..c3bd2d9 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/cryptoCell.c @@ -0,0 +1,309 @@ +/* cryptoCell.c + * + * Copyright (C) 2020 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +/* This source is included in wc_port.c */ +/* WOLFSSL_CRYPTOCELL_C is defined by wc_port.c in case compile tries to + include this .c directly */ +#ifdef WOLFSSL_CRYPTOCELL_C + +#ifdef WOLFSSL_CRYPTOCELL + +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/ecc.h> +#include <wolfssl/wolfcrypt/port/arm/cryptoCell.h> + +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + +/* Global Variables (extern) */ +CRYS_RND_State_t wc_rndState; +CRYS_RND_WorkBuff_t wc_rndWorkBuff; +SaSiRndGenerateVectWorkFunc_t wc_rndGenVectFunc = CRYS_RND_GenerateVector; + +static word32 cc310_enableCount = 0; + +static void cc310_enable(void) +{ + cc310_enableCount++; + + /* Enable the CC310 HW/IQ once*/ + + NRF_CRYPTOCELL->ENABLE = 1; + NVIC_EnableIRQ(CRYPTOCELL_IRQn); +} + +static void cc310_disable(void) +{ + cc310_enableCount--; + + /* Disable HW/IRQ if no more users */ + if (cc310_enableCount == 0) { + NRF_CRYPTOCELL->ENABLE = 0; + NVIC_DisableIRQ(CRYPTOCELL_IRQn); + } +} + +int cc310_Init(void) +{ + int ret = 0; + static int initialized = 0; + + if (!initialized) { + /* Enable the CC310 HW. 
*/ + cc310_enable(); + + /*Initialize the CC310 run-time library*/ + ret = SaSi_LibInit(); + + if (ret != SA_SILIB_RET_OK) { + WOLFSSL_MSG("Error SaSi_LibInit"); + return ret; + } + + /* RNG CryptoCell CC310 */ + ret = CRYS_RndInit(&wc_rndState, &wc_rndWorkBuff); + if (ret != CRYS_OK) { + WOLFSSL_MSG("Error CRYS_RndInit"); + return ret; + } + initialized = 1; + } + return ret; +} + +void cc310_Free(void) +{ + CRYSError_t crys_result; + + SaSi_LibFini(); + + crys_result = CRYS_RND_UnInstantiation(&wc_rndState); + + if (crys_result != CRYS_OK) { + WOLFSSL_MSG("Error RYS_RND_UnInstantiation"); + } + cc310_disable(); +} + +int cc310_random_generate(byte* output, word32 size) +{ + CRYSError_t crys_result; + + crys_result = CRYS_RND_GenerateVector(&wc_rndState, size, output); + + return (crys_result == CRYS_OK) ? 0 : -1; +} +#ifdef HAVE_ECC +CRYS_ECPKI_DomainID_t cc310_mapCurve(int curve_id) +{ + switch(curve_id) + { + case ECC_CURVE_DEF: return CRYS_ECPKI_DomainID_secp256r1; /* default */ + case ECC_SECP160K1: return CRYS_ECPKI_DomainID_secp160k1; + case ECC_SECP160R1: return CRYS_ECPKI_DomainID_secp160r1; + case ECC_SECP160R2: return CRYS_ECPKI_DomainID_secp160r2; + case ECC_SECP192K1: return CRYS_ECPKI_DomainID_secp192k1; + case ECC_SECP192R1: return CRYS_ECPKI_DomainID_secp192r1; + case ECC_SECP224K1: return CRYS_ECPKI_DomainID_secp224k1; + case ECC_SECP224R1: return CRYS_ECPKI_DomainID_secp224r1; + case ECC_SECP256K1: return CRYS_ECPKI_DomainID_secp256k1; + case ECC_SECP256R1: return CRYS_ECPKI_DomainID_secp256r1; + case ECC_SECP384R1: return CRYS_ECPKI_DomainID_secp384r1; + case ECC_SECP521R1: return CRYS_ECPKI_DomainID_secp521r1; + default: WOLFSSL_MSG("Curve not identified"); + return CRYS_ECPKI_DomainID_Builded; + } +} +#endif /* HAVE_ECC */ + +#ifndef NO_RSA +CRYS_RSA_HASH_OpMode_t cc310_hashModeRSA(enum wc_HashType hash_type, int isHashed) +{ + switch(hash_type) + { + case WC_HASH_TYPE_MD5: + #ifndef NO_MD5 + return isHashed? CRYS_RSA_After_MD5_mode : CRYS_RSA_HASH_MD5_mode; + #endif + case WC_HASH_TYPE_SHA: + #ifndef NO_SHA + return isHashed? CRYS_RSA_After_SHA1_mode : CRYS_RSA_HASH_SHA1_mode; + #endif + case WC_HASH_TYPE_SHA224: + #ifdef WOLFSSL_SHA224 + return isHashed? CRYS_RSA_After_SHA224_mode : CRYS_RSA_HASH_SHA224_mode; + #endif + case WC_HASH_TYPE_SHA256: + #ifndef NO_SHA256 + return isHashed? CRYS_RSA_After_SHA256_mode : CRYS_RSA_HASH_SHA256_mode; + #endif + case WC_HASH_TYPE_SHA384: + #ifdef WOLFSSL_SHA384 + return isHashed? CRYS_RSA_After_SHA384_mode : CRYS_RSA_HASH_SHA384_mode; + #endif + case WC_HASH_TYPE_SHA512: + #ifdef WOLFSSL_SHA512 + return isHashed? CRYS_RSA_After_SHA512_mode : CRYS_RSA_HASH_SHA512_mode; + #endif + case WC_HASH_TYPE_NONE: + /* default to SHA256 */ + return isHashed? 
CRYS_RSA_After_SHA256_mode : CRYS_RSA_HASH_SHA256_mode; + default: + return CRYS_RSA_After_HASH_NOT_KNOWN_mode; + } +} +#endif /* !NO_RSA */ + +#ifdef HAVE_ECC +CRYS_ECPKI_HASH_OpMode_t cc310_hashModeECC(int hash_size) +{ + CRYS_ECPKI_HASH_OpMode_t hash_mode; + switch (hash_size) + { + case 20: + hash_mode = CRYS_ECPKI_AFTER_HASH_SHA1_mode; + break; + case 28: + hash_mode = CRYS_ECPKI_AFTER_HASH_SHA224_mode; + break; + case 32: + hash_mode = CRYS_ECPKI_AFTER_HASH_SHA256_mode; + break; + case 48: + hash_mode = CRYS_ECPKI_AFTER_HASH_SHA384_mode; + break; + case 64: + hash_mode = CRYS_ECPKI_AFTER_HASH_SHA512_mode; + break; + default: + hash_mode = CRYS_ECPKI_HASH_OpModeLast; + break; + } + return hash_mode; +} +#endif /* HAVE_ECC */ +#endif /* WOLFSSL_CRYPTOCELL*/ + +#if !defined(NO_CRYPT_BENCHMARK) && defined(WOLFSSL_nRF5x_SDK_15_2) + +static int mRtcSec = 0; +static const nrfx_rtc_t rtc = NRFX_RTC_INSTANCE(0); + +static void rtc_handler(nrfx_rtc_int_type_t int_type) +{ + if (int_type == NRFX_RTC_INT_COMPARE0) { + mRtcSec++; + nrfx_rtc_counter_clear(&rtc); + nrfx_rtc_int_enable(&rtc, RTC_CHANNEL_INT_MASK(0)); +#ifdef BSP_LED_1 + nrf_gpio_pin_toggle(BSP_LED_1); +#endif + } + else if (int_type == NRF_DRV_RTC_INT_TICK) { +#ifdef BSP_LED_0 + nrf_gpio_pin_toggle(BSP_LED_0); +#endif + } +} + +static void rtc_config(void) +{ + uint32_t err_code; + nrfx_rtc_config_t config = NRFX_RTC_DEFAULT_CONFIG; + + /* configure gpio for pin toggling. */ + bsp_board_init(BSP_INIT_LEDS); + + /* start the internal LFCLK XTAL oscillator.*/ + err_code = nrf_drv_clock_init(); + APP_ERROR_CHECK(err_code); + nrf_drv_clock_lfclk_request(NULL); + + /* Initialize RTC instance */ + err_code = nrfx_rtc_init(&rtc, &config, rtc_handler); + APP_ERROR_CHECK(err_code); + + /* Enable tick event */ + nrfx_rtc_tick_enable(&rtc, false); + + /* Set compare channel to trigger interrupt after 1 seconds */ + err_code = nrfx_rtc_cc_set(&rtc, 0, RTC_INPUT_FREQ, true); + APP_ERROR_CHECK(err_code); + + /* Power on RTC instance */ + nrfx_rtc_enable(&rtc); +} + +static int rtc_get_ms(void) +{ + /* Prescaler is 12-bit for COUNTER: frequency = (32768/(PRESCALER+1)) */ + int frequency = (RTC_INPUT_FREQ / (rtc_prescaler_get(rtc.p_reg) + 1)); + uint32_t counter = nrfx_rtc_counter_get(&rtc); + + /* Convert with rounding frequency to milliseconds */ + return ((counter * 1000) + (frequency / 2) ) / frequency; +} + +double current_time(int reset) +{ + double time; + static int initialized = 0; + + if (!initialized) { + rtc_config(); + initialized = 1; + } + time = mRtcSec; + time += (double)rtc_get_ms() / 1000; + + return time; +} + +int nrf_random_generate(byte* output, word32 size) +{ + uint32_t err_code; + static int initialized = 0; + + /* RNG must be initialized once */ + if (!initialized) { + err_code = nrf_drv_rng_init(NULL); + if (err_code != NRF_SUCCESS) { + return -1; + } + initialized = 1; + } + nrf_drv_rng_block_rand(output, size); + return 0; +} +#endif /* !NO_CRYPT_BENCHMARK && WOLFSSL_nRF5x_SDK_15_2 */ + +#endif /* WOLFSSL_CRYPTOCELL_C */ diff --git a/client/wolfssl/wolfcrypt/src/port/arm/cryptoCellHash.c b/client/wolfssl/wolfcrypt/src/port/arm/cryptoCellHash.c new file mode 100644 index 0000000..bc729f7 --- /dev/null +++ b/client/wolfssl/wolfcrypt/src/port/arm/cryptoCellHash.c @@ -0,0 +1,134 @@ +/* cryptoCellHash.c + * + * Copyright (C) 2020 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +#include <wolfssl/wolfcrypt/settings.h> + +/* This source is included in wc_port.c */ +/* WOLFSSL_CRYPTOCELL_HASH_C is defined by wc_port.c in case compile tries + to include this .c directly */ +#ifdef WOLFSSL_CRYPTOCELL_HASH_C +#if !defined(NO_SHA256) && defined(WOLFSSL_CRYPTOCELL) + +#include <wolfssl/wolfcrypt/error-crypt.h> +#include <wolfssl/wolfcrypt/logging.h> +#include <wolfssl/wolfcrypt/sha256.h> +#include <wolfssl/wolfcrypt/port/arm/cryptoCell.h> + +#ifdef NO_INLINE + #include <wolfssl/wolfcrypt/misc.h> +#else + #define WOLFSSL_MISC_INCLUDED + #include <wolfcrypt/src/misc.c> +#endif + +int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) +{ + CRYSError_t ret = 0; + + (void)heap; + (void)devId; + + if (sha256 == NULL) + return BAD_FUNC_ARG; + + XMEMSET(sha256->digest, 0, sizeof(sha256->digest)); + + /* initializes the HASH context and machine to the supported mode.*/ + ret = CRYS_HASH_Init(&sha256->ctx, CRYS_HASH_SHA256_mode); + + if (ret != SA_SILIB_RET_OK){ + WOLFSSL_MSG("Error CRYS_HASH_Init failed"); + } + + return ret; +} + +int wc_InitSha256(Sha256* sha256) +{ + return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID); +} + +int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + CRYSError_t ret = 0; + size_t length; + size_t remaining = len; + byte const * p_cur = data; + + if (sha256 == NULL || (data == NULL && len > 0)) { + return BAD_FUNC_ARG; + } + + if (data == NULL && len == 0) { + /* valid, but do nothing */ + return 0; + } + + /* If the input is larger than CC310_MAX_LENGTH_DMA, split into smaller */ + do { + length = (remaining > CC310_MAX_LENGTH_DMA) ? + CC310_MAX_LENGTH_DMA : remaining; + + ret = CRYS_HASH_Update(&sha256->ctx, (uint8_t *)p_cur, length); + + remaining -= length; + p_cur += length; + + } while (ret == CRYS_OK && remaining > 0); + + return ret; +} + +int wc_Sha256Final(wc_Sha256* sha256, byte* hash) +{ + CRYSError_t ret = 0; + CRYS_HASH_Result_t hashResult; + + if (sha256 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ret = CRYS_HASH_Finish(&sha256->ctx, hashResult); + + if (ret != SA_SILIB_RET_OK){ + WOLFSSL_MSG("Error CRYS_HASH_Finish failed"); + return ret; + } + XMEMCPY(sha256->digest, hashResult, WC_SHA256_DIGEST_SIZE); + + XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + + /* reset state */ + return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID); +} + +void wc_Sha256Free(wc_Sha256* sha256) +{ + if (sha256 == NULL) + return; +} + +#endif /* !NO_SHA256 && WOLFSSL_CRYPTOCELL */ +#endif /* WOLFSSL_CRYPTOCELL_HASH_C */ |
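Illustrative usage note (not part of the patch): the ports added above all implement the same wolfCrypt init/update/final/free hash contract, so a caller of the SHA-512 functions defined in armv8-sha512.c follows the pattern sketched below. hash_buffer_sha512 is a made-up helper name for this example and error handling is kept minimal.

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/sha512.h>

    /* Hash an in-memory buffer with the wc_Sha512 API defined in this patch.
     * Returns 0 on success or a wolfCrypt error code such as BAD_FUNC_ARG. */
    int hash_buffer_sha512(const byte* data, word32 len,
                           byte out[WC_SHA512_DIGEST_SIZE])
    {
        wc_Sha512 sha;
        int ret = wc_InitSha512(&sha);              /* load the SHA-512 IV, clear counters */
        if (ret == 0)
            ret = wc_Sha512Update(&sha, data, len); /* buffer partials, transform full blocks */
        if (ret == 0)
            ret = wc_Sha512Final(&sha, out);        /* pad, append bit length, reset state */
        wc_Sha512Free(&sha);                        /* frees the W cache when
                                                       WOLFSSL_SMALL_STACK_CACHE is defined */
        return ret;
    }

The SHA-384 and CryptoCell SHA-256 wrappers in the other files follow the same calling sequence with their respective wc_Sha384* / wc_Sha256* entry points.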