aboutsummaryrefslogtreecommitdiff
path: root/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
diff options
context:
space:
mode:
Diffstat (limited to 'client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c')
-rw-r--r--client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c4783
1 files changed, 4783 insertions, 0 deletions
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
new file mode 100644
index 0000000..c502a39
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
@@ -0,0 +1,4783 @@
+/* armv8-32-sha512-asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
+ */
+
+#ifndef __aarch64__
+#include <stdint.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#ifdef WOLFSSL_ARMASM_NO_NEON
+static const uint64_t L_SHA512_transform_len_k[] = { /* SHA-512 round constants K[0..79] (FIPS 180-4, 4.2.3) */
+ 0x428a2f98d728ae22ULL, /* K[0]; Transform_Sha512_Len points r3 at this table and reads entries with ldrd */
+ 0x7137449123ef65cdULL, /* K[1]; ULL suffix: unsigned long is only 32 bits on 32-bit ARM */
+ 0xb5c0fbcfec4d3b2fULL, /* K[2] */
+ 0xe9b5dba58189dbbcULL, /* K[3] */
+ 0x3956c25bf348b538ULL, /* K[4] */
+ 0x59f111f1b605d019ULL, /* K[5] */
+ 0x923f82a4af194f9bULL, /* K[6] */
+ 0xab1c5ed5da6d8118ULL, /* K[7] */
+ 0xd807aa98a3030242ULL, /* K[8] */
+ 0x12835b0145706fbeULL, /* K[9] */
+ 0x243185be4ee4b28cULL, /* K[10] */
+ 0x550c7dc3d5ffb4e2ULL, /* K[11] */
+ 0x72be5d74f27b896fULL, /* K[12] */
+ 0x80deb1fe3b1696b1ULL, /* K[13] */
+ 0x9bdc06a725c71235ULL, /* K[14] */
+ 0xc19bf174cf692694ULL, /* K[15] */
+ 0xe49b69c19ef14ad2ULL, /* K[16] */
+ 0xefbe4786384f25e3ULL, /* K[17] */
+ 0x0fc19dc68b8cd5b5ULL, /* K[18]; leading zero nibble written out, value unchanged */
+ 0x240ca1cc77ac9c65ULL, /* K[19] */
+ 0x2de92c6f592b0275ULL, /* K[20] */
+ 0x4a7484aa6ea6e483ULL, /* K[21] */
+ 0x5cb0a9dcbd41fbd4ULL, /* K[22] */
+ 0x76f988da831153b5ULL, /* K[23] */
+ 0x983e5152ee66dfabULL, /* K[24] */
+ 0xa831c66d2db43210ULL, /* K[25] */
+ 0xb00327c898fb213fULL, /* K[26] */
+ 0xbf597fc7beef0ee4ULL, /* K[27] */
+ 0xc6e00bf33da88fc2ULL, /* K[28] */
+ 0xd5a79147930aa725ULL, /* K[29] */
+ 0x06ca6351e003826fULL, /* K[30]; leading zero nibble written out, value unchanged */
+ 0x142929670a0e6e70ULL, /* K[31] */
+ 0x27b70a8546d22ffcULL, /* K[32] */
+ 0x2e1b21385c26c926ULL, /* K[33] */
+ 0x4d2c6dfc5ac42aedULL, /* K[34] */
+ 0x53380d139d95b3dfULL, /* K[35] */
+ 0x650a73548baf63deULL, /* K[36] */
+ 0x766a0abb3c77b2a8ULL, /* K[37] */
+ 0x81c2c92e47edaee6ULL, /* K[38] */
+ 0x92722c851482353bULL, /* K[39] */
+ 0xa2bfe8a14cf10364ULL, /* K[40] */
+ 0xa81a664bbc423001ULL, /* K[41] */
+ 0xc24b8b70d0f89791ULL, /* K[42] */
+ 0xc76c51a30654be30ULL, /* K[43] */
+ 0xd192e819d6ef5218ULL, /* K[44] */
+ 0xd69906245565a910ULL, /* K[45] */
+ 0xf40e35855771202aULL, /* K[46] */
+ 0x106aa07032bbd1b8ULL, /* K[47] */
+ 0x19a4c116b8d2d0c8ULL, /* K[48] */
+ 0x1e376c085141ab53ULL, /* K[49] */
+ 0x2748774cdf8eeb99ULL, /* K[50] */
+ 0x34b0bcb5e19b48a8ULL, /* K[51] */
+ 0x391c0cb3c5c95a63ULL, /* K[52] */
+ 0x4ed8aa4ae3418acbULL, /* K[53] */
+ 0x5b9cca4f7763e373ULL, /* K[54] */
+ 0x682e6ff3d6b2b8a3ULL, /* K[55] */
+ 0x748f82ee5defb2fcULL, /* K[56] */
+ 0x78a5636f43172f60ULL, /* K[57] */
+ 0x84c87814a1f0ab72ULL, /* K[58] */
+ 0x8cc702081a6439ecULL, /* K[59] */
+ 0x90befffa23631e28ULL, /* K[60] */
+ 0xa4506cebde82bde9ULL, /* K[61] */
+ 0xbef9a3f7b2c67915ULL, /* K[62] */
+ 0xc67178f2e372532bULL, /* K[63] */
+ 0xca273eceea26619cULL, /* K[64] */
+ 0xd186b8c721c0c207ULL, /* K[65] */
+ 0xeada7dd6cde0eb1eULL, /* K[66] */
+ 0xf57d4f7fee6ed178ULL, /* K[67] */
+ 0x06f067aa72176fbaULL, /* K[68]; leading zero nibble written out, value unchanged */
+ 0x0a637dc5a2c898a6ULL, /* K[69]; leading zero nibble written out, value unchanged */
+ 0x113f9804bef90daeULL, /* K[70] */
+ 0x1b710b35131c471bULL, /* K[71] */
+ 0x28db77f523047d84ULL, /* K[72] */
+ 0x32caab7b40c72493ULL, /* K[73] */
+ 0x3c9ebe0a15c9bebcULL, /* K[74] */
+ 0x431d67c49c100d4cULL, /* K[75] */
+ 0x4cc5d4becb3e42b6ULL, /* K[76] */
+ 0x597f299cfc657e2aULL, /* K[77] */
+ 0x5fcb6fab3ad6faecULL, /* K[78] */
+ 0x6c44198c4a475817ULL, /* K[79] */
+};
+
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0xc0\n\t"
+ "mov r3, %[L_SHA512_transform_len_k]\n\t"
+ /* Copy digest to add in at end */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r8, r9, [%[sha512], #24]\n\t"
+ "strd r12, lr, [sp, #128]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "strd r6, r7, [sp, #144]\n\t"
+ "strd r8, r9, [sp, #152]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r8, r9, [%[sha512], #56]\n\t"
+ "strd r12, lr, [sp, #160]\n\t"
+ "strd r4, r5, [sp, #168]\n\t"
+ "strd r6, r7, [sp, #176]\n\t"
+ "strd r8, r9, [sp, #184]\n\t"
+ /* Start of loop processing a block */
+ "\n"
+ "L_sha512_len_neon_begin_%=: \n\t"
+ /* Load, Reverse and Store W */
+ "ldrd r12, lr, [%[data]]\n\t"
+ "ldrd r4, r5, [%[data], #8]\n\t"
+ "ldrd r6, r7, [%[data], #16]\n\t"
+ "ldrd r8, r9, [%[data], #24]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp]\n\t"
+ "str r12, [sp, #4]\n\t"
+ "str r5, [sp, #8]\n\t"
+ "str r4, [sp, #12]\n\t"
+ "str r7, [sp, #16]\n\t"
+ "str r6, [sp, #20]\n\t"
+ "str r9, [sp, #24]\n\t"
+ "str r8, [sp, #28]\n\t"
+ "ldrd r12, lr, [%[data], #32]\n\t"
+ "ldrd r4, r5, [%[data], #40]\n\t"
+ "ldrd r6, r7, [%[data], #48]\n\t"
+ "ldrd r8, r9, [%[data], #56]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #32]\n\t"
+ "str r12, [sp, #36]\n\t"
+ "str r5, [sp, #40]\n\t"
+ "str r4, [sp, #44]\n\t"
+ "str r7, [sp, #48]\n\t"
+ "str r6, [sp, #52]\n\t"
+ "str r9, [sp, #56]\n\t"
+ "str r8, [sp, #60]\n\t"
+ "ldrd r12, lr, [%[data], #64]\n\t"
+ "ldrd r4, r5, [%[data], #72]\n\t"
+ "ldrd r6, r7, [%[data], #80]\n\t"
+ "ldrd r8, r9, [%[data], #88]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #64]\n\t"
+ "str r12, [sp, #68]\n\t"
+ "str r5, [sp, #72]\n\t"
+ "str r4, [sp, #76]\n\t"
+ "str r7, [sp, #80]\n\t"
+ "str r6, [sp, #84]\n\t"
+ "str r9, [sp, #88]\n\t"
+ "str r8, [sp, #92]\n\t"
+ "ldrd r12, lr, [%[data], #96]\n\t"
+ "ldrd r4, r5, [%[data], #104]\n\t"
+ "ldrd r6, r7, [%[data], #112]\n\t"
+ "ldrd r8, r9, [%[data], #120]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #96]\n\t"
+ "str r12, [sp, #100]\n\t"
+ "str r5, [sp, #104]\n\t"
+ "str r4, [sp, #108]\n\t"
+ "str r7, [sp, #112]\n\t"
+ "str r6, [sp, #116]\n\t"
+ "str r9, [sp, #120]\n\t"
+ "str r8, [sp, #124]\n\t"
+ /* Pre-calc: b ^ c */
+ "ldrd r8, r9, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r8, r8, r12\n\t"
+ "eor r9, r9, lr\n\t"
+ "mov r10, #4\n\t"
+ /* Start of 16 rounds */
+ "\n"
+ "L_sha512_len_neon_start_%=: \n\t"
+ /* Round 0 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[0] */
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp]\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp]\n\t"
+ /* Round 1 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #8]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[1] */
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #8]\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #8]\n\t"
+ /* Round 2 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #16]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[2] */
+ "ldrd r12, lr, [sp]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #16]\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #16]\n\t"
+ /* Round 3 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #24]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[3] */
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #24]\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #24]\n\t"
+ /* Round 4 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #32]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[4] */
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #32]\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #32]\n\t"
+ /* Round 5 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #40]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[5] */
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #40]\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #40]\n\t"
+ /* Round 6 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #48]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[6] */
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #48]\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #48]\n\t"
+ /* Round 7 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #56]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[7] */
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #56]\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #56]\n\t"
+ /* Round 8 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #64]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[8] */
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #64]\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #64]\n\t"
+ /* Round 9 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #72]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[9] */
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #72]\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #72]\n\t"
+ /* Round 10 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #80]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[10] */
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #80]\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #80]\n\t"
+ /* Round 11 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #88]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[11] */
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #88]\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #88]\n\t"
+ /* Round 12 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #96]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[12] */
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #96]\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #96]\n\t"
+ /* Round 13 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #104]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[13] */
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #104]\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #104]\n\t"
+ /* Round 14 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #112]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[14] */
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #112]\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #112]\n\t"
+ /* Round 15 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #120]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[15] */
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #120]\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #120]\n\t"
+ "add r3, r3, #0x80\n\t"
+ "subs r10, r10, #1\n\t"
+ "bne L_sha512_len_neon_start_%=\n\t"
+ /* Round 0 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 1 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #8]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 2 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #16]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 3 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #24]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 4 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #32]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 5 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #40]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 6 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #48]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 7 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #56]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 8 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #64]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 9 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #72]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 10 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #80]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 11 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #88]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 12 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #96]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 13 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #104]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 14 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #112]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 15 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #120]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Add in digest from start */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #128]\n\t"
+ "ldrd r8, r9, [sp, #136]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [sp, #128]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #144]\n\t"
+ "ldrd r8, r9, [sp, #152]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [sp, #144]\n\t"
+ "strd r4, r5, [sp, #152]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #160]\n\t"
+ "ldrd r8, r9, [sp, #168]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [sp, #160]\n\t"
+ "strd r4, r5, [sp, #168]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #176]\n\t"
+ "ldrd r8, r9, [sp, #184]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [sp, #176]\n\t"
+ "strd r4, r5, [sp, #184]\n\t"
+ "subs %[len], %[len], #0x80\n\t"
+ "sub r3, r3, #0x200\n\t"
+ "add %[data], %[data], #0x80\n\t"
+ "bne L_sha512_len_neon_begin_%=\n\t"
+ "eor r0, r0, r0\n\t"
+ "add sp, sp, #0xc0\n\t"
+ : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
+ : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k)
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+ );
+}
+
+#endif /* WOLFSSL_ARMASM_NO_NEON */
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#ifndef WOLFSSL_ARMASM_NO_NEON
+/* SHA-512 round constants K[0]..K[79] (FIPS 180-4, section 4.2.3), stored in
+ * round order. The NEON Transform_Sha512_Len in this file points r3 at the
+ * start of this table and fetches one 64-bit entry per round with a
+ * post-incrementing "vld1.64 {d12}, [r3]!", so the order of the entries is
+ * significant.
+ *
+ * Each literal is spelled with all 16 hex digits; K[18], K[30], K[68] and
+ * K[69] have a zero top nibble, which is written out explicitly.
+ */
+static const uint64_t L_SHA512_transform_neon_len_k[] = {
+    /* K[0] - K[15] */
+    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+    /* K[16] - K[31] */
+    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+    /* K[32] - K[47] */
+    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+    /* K[48] - K[63] */
+    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+    /* K[64] - K[79] */
+    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ __asm__ __volatile__ (
+ /* Load digest into working vars */
+ "vldm.64 %[sha512], {d0-d7}\n\t"
+ /* Start of loop processing a block */
+ "\n"
+ "L_sha512_len_neon_begin_%=: \n\t"
+ /* Load W */
+ "vldm.64 %[data]!, {d16-d31}\n\t"
+ "vrev64.8 q8, q8\n\t"
+ "vrev64.8 q9, q9\n\t"
+ "vrev64.8 q10, q10\n\t"
+ "vrev64.8 q11, q11\n\t"
+ "vrev64.8 q12, q12\n\t"
+ "vrev64.8 q13, q13\n\t"
+ "vrev64.8 q14, q14\n\t"
+ "vrev64.8 q15, q15\n\t"
+ "mov r3, %[L_SHA512_transform_neon_len_k]\n\t"
+ "mov r12, #4\n\t"
+ /* Start of 16 rounds */
+ "\n"
+ "L_sha512_len_neon_start_%=: \n\t"
+ /* Round 0 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d16\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 1 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d17\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Calc new W[0]-W[1] */
+ "vext.8 q6, q8, q9, #8\n\t"
+ "vshl.u64 q4, q15, #45\n\t"
+ "vsri.u64 q4, q15, #19\n\t"
+ "vshl.u64 q5, q15, #3\n\t"
+ "vsri.u64 q5, q15, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q15, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q8, q5\n\t"
+ "vext.8 q7, q12, q13, #8\n\t"
+ "vadd.i64 q8, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q8, q5\n\t"
+ /* Round 2 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d18\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 3 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d19\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Calc new W[2]-W[3] */
+ "vext.8 q6, q9, q10, #8\n\t"
+ "vshl.u64 q4, q8, #45\n\t"
+ "vsri.u64 q4, q8, #19\n\t"
+ "vshl.u64 q5, q8, #3\n\t"
+ "vsri.u64 q5, q8, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q8, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q9, q5\n\t"
+ "vext.8 q7, q13, q14, #8\n\t"
+ "vadd.i64 q9, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q9, q5\n\t"
+ /* Round 4 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d20\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 5 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d21\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Calc new W[4]-W[5] */
+ "vext.8 q6, q10, q11, #8\n\t"
+ "vshl.u64 q4, q9, #45\n\t"
+ "vsri.u64 q4, q9, #19\n\t"
+ "vshl.u64 q5, q9, #3\n\t"
+ "vsri.u64 q5, q9, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q9, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q10, q5\n\t"
+ "vext.8 q7, q14, q15, #8\n\t"
+ "vadd.i64 q10, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q10, q5\n\t"
+ /* Round 6 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d22\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 7 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d23\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Calc new W[6]-W[7] */
+ "vext.8 q6, q11, q12, #8\n\t"
+ "vshl.u64 q4, q10, #45\n\t"
+ "vsri.u64 q4, q10, #19\n\t"
+ "vshl.u64 q5, q10, #3\n\t"
+ "vsri.u64 q5, q10, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q10, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q11, q5\n\t"
+ "vext.8 q7, q15, q8, #8\n\t"
+ "vadd.i64 q11, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q11, q5\n\t"
+ /* Round 8 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d24\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 9 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d25\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Calc new W[8]-W[9] */
+ "vext.8 q6, q12, q13, #8\n\t"
+ "vshl.u64 q4, q11, #45\n\t"
+ "vsri.u64 q4, q11, #19\n\t"
+ "vshl.u64 q5, q11, #3\n\t"
+ "vsri.u64 q5, q11, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q11, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q12, q5\n\t"
+ "vext.8 q7, q8, q9, #8\n\t"
+ "vadd.i64 q12, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q12, q5\n\t"
+ /* Round 10 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d26\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 11 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d27\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Calc new W[10]-W[11] */
+ "vext.8 q6, q13, q14, #8\n\t"
+ "vshl.u64 q4, q12, #45\n\t"
+ "vsri.u64 q4, q12, #19\n\t"
+ "vshl.u64 q5, q12, #3\n\t"
+ "vsri.u64 q5, q12, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q12, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q13, q5\n\t"
+ "vext.8 q7, q9, q10, #8\n\t"
+ "vadd.i64 q13, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q13, q5\n\t"
+ /* Round 12 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d28\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 13 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d29\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Calc new W[12]-W[13] */
+ "vext.8 q6, q14, q15, #8\n\t"
+ "vshl.u64 q4, q13, #45\n\t"
+ "vsri.u64 q4, q13, #19\n\t"
+ "vshl.u64 q5, q13, #3\n\t"
+ "vsri.u64 q5, q13, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q13, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q14, q5\n\t"
+ "vext.8 q7, q10, q11, #8\n\t"
+ "vadd.i64 q14, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q14, q5\n\t"
+ /* Round 14 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d30\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 15 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d31\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Calc new W[14]-W[15] */
+ "vext.8 q6, q15, q8, #8\n\t"
+ "vshl.u64 q4, q14, #45\n\t"
+ "vsri.u64 q4, q14, #19\n\t"
+ "vshl.u64 q5, q14, #3\n\t"
+ "vsri.u64 q5, q14, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q14, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q15, q5\n\t"
+ "vext.8 q7, q11, q12, #8\n\t"
+ "vadd.i64 q15, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q15, q5\n\t"
+ "subs r12, r12, #1\n\t"
+ "bne L_sha512_len_neon_start_%=\n\t"
+ /* Round 0 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d16\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 1 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d17\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Round 2 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d18\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 3 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d19\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Round 4 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d20\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 5 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d21\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Round 6 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d22\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 7 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d23\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Round 8 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d24\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 9 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d25\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Round 10 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d26\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 11 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d27\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Round 12 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d28\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 13 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d29\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Round 14 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d30\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 15 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d31\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Add in digest from start */
+ "vldm.64 %[sha512], {d8-d15}\n\t"
+ "vadd.i64 q0, q0, q4\n\t"
+ "vadd.i64 q1, q1, q5\n\t"
+ "vadd.i64 q2, q2, q6\n\t"
+ "vadd.i64 q3, q3, q7\n\t"
+ "vstm.64 %[sha512], {d0-d7}\n\t"
+ "subs %[len], %[len], #0x80\n\t"
+ "bne L_sha512_len_neon_begin_%=\n\t"
+ : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
+ : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k), [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
+ : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#endif /* !WOLFSSL_ARMASM_NO_NEON */
+#endif /* WOLFSSL_ARMASM */
+#endif /* !__aarch64__ */