path: root/wolfcrypt/src/port
author: auth12 <[email protected]> 2020-07-22 08:34:12 -0700
committer: auth12 <[email protected]> 2020-07-22 08:34:12 -0700
commit: 5015ddb9b1eee748efc24056e46f81888c975f7a (patch)
tree: a810f6ee90f8bfe0e934fdd9142198e6b3862957 /wolfcrypt/src/port
download: wolfssl_windows-5015ddb9b1eee748efc24056e46f81888c975f7a.tar.xz
          wolfssl_windows-5015ddb9b1eee748efc24056e46f81888c975f7a.zip
Initial commit
Diffstat (limited to 'wolfcrypt/src/port')
-rw-r--r-- wolfcrypt/src/port/Espressif/README.md 109
-rw-r--r-- wolfcrypt/src/port/Espressif/esp32_aes.c 299
-rw-r--r-- wolfcrypt/src/port/Espressif/esp32_mp.c 514
-rw-r--r-- wolfcrypt/src/port/Espressif/esp32_sha.c 434
-rw-r--r-- wolfcrypt/src/port/Espressif/esp32_util.c 67
-rw-r--r-- wolfcrypt/src/port/Renesas/README.md 176
-rw-r--r-- wolfcrypt/src/port/Renesas/renesas_tsip_aes.c 156
-rw-r--r-- wolfcrypt/src/port/Renesas/renesas_tsip_sha.c 274
-rw-r--r-- wolfcrypt/src/port/Renesas/renesas_tsip_util.c 719
-rw-r--r-- wolfcrypt/src/port/af_alg/afalg_aes.c 900
-rw-r--r-- wolfcrypt/src/port/af_alg/afalg_hash.c 339
-rw-r--r-- wolfcrypt/src/port/af_alg/wc_afalg.c 141
-rw-r--r-- wolfcrypt/src/port/arm/armv8-32-curve25519.S 6012
-rw-r--r-- wolfcrypt/src/port/arm/armv8-32-curve25519.c 5581
-rw-r--r-- wolfcrypt/src/port/arm/armv8-32-sha512-asm.S 5335
-rw-r--r-- wolfcrypt/src/port/arm/armv8-32-sha512-asm.c 4783
-rw-r--r-- wolfcrypt/src/port/arm/armv8-aes.c 4653
-rw-r--r-- wolfcrypt/src/port/arm/armv8-chacha.c 2857
-rw-r--r-- wolfcrypt/src/port/arm/armv8-curve25519.S 6715
-rw-r--r-- wolfcrypt/src/port/arm/armv8-curve25519.c 6725
-rw-r--r-- wolfcrypt/src/port/arm/armv8-poly1305.c 1166
-rw-r--r-- wolfcrypt/src/port/arm/armv8-sha256.c 1508
-rw-r--r-- wolfcrypt/src/port/arm/armv8-sha512-asm.S 1046
-rw-r--r-- wolfcrypt/src/port/arm/armv8-sha512-asm.c 1041
-rw-r--r-- wolfcrypt/src/port/arm/armv8-sha512.c 715
-rw-r--r-- wolfcrypt/src/port/arm/cryptoCell.c 309
-rw-r--r-- wolfcrypt/src/port/arm/cryptoCellHash.c 134
-rw-r--r-- wolfcrypt/src/port/atmel/README.md 94
-rw-r--r-- wolfcrypt/src/port/atmel/atmel.c 843
-rw-r--r-- wolfcrypt/src/port/caam/caam_aes.c 649
-rw-r--r-- wolfcrypt/src/port/caam/caam_doc.pdf bin 0 -> 1107370 bytes
-rw-r--r-- wolfcrypt/src/port/caam/caam_driver.c 1713
-rw-r--r-- wolfcrypt/src/port/caam/caam_init.c 289
-rw-r--r-- wolfcrypt/src/port/caam/caam_sha.c 397
-rw-r--r-- wolfcrypt/src/port/cavium/README.md 3
-rw-r--r-- wolfcrypt/src/port/cavium/README_Octeon.md 3
-rw-r--r-- wolfcrypt/src/port/cavium/cavium_nitrox.c 0
-rw-r--r-- wolfcrypt/src/port/cavium/cavium_octeon_sync.c 879
-rw-r--r-- wolfcrypt/src/port/devcrypto/README.md 43
-rw-r--r-- wolfcrypt/src/port/devcrypto/devcrypto_aes.c 384
-rw-r--r-- wolfcrypt/src/port/devcrypto/devcrypto_hash.c 248
-rw-r--r-- wolfcrypt/src/port/devcrypto/wc_devcrypto.c 167
-rw-r--r-- wolfcrypt/src/port/intel/README.md 3
-rw-r--r-- wolfcrypt/src/port/intel/quickassist.c 0
-rw-r--r-- wolfcrypt/src/port/intel/quickassist_mem.c 0
-rw-r--r-- wolfcrypt/src/port/intel/quickassist_sync.c 2004
-rw-r--r-- wolfcrypt/src/port/mynewt/mynewt_port.c 146
-rw-r--r-- wolfcrypt/src/port/nrf51.c 220
-rw-r--r-- wolfcrypt/src/port/nxp/ksdk_port.c 1731
-rw-r--r-- wolfcrypt/src/port/pic32/pic32mz-crypt.c 804
-rw-r--r-- wolfcrypt/src/port/st/README.md 132
-rw-r--r-- wolfcrypt/src/port/st/stm32.c 879
-rw-r--r-- wolfcrypt/src/port/st/stsafe.c 566
-rw-r--r-- wolfcrypt/src/port/ti/ti-aes.c 569
-rw-r--r-- wolfcrypt/src/port/ti/ti-ccm.c 94
-rw-r--r-- wolfcrypt/src/port/ti/ti-des3.c 204
-rw-r--r-- wolfcrypt/src/port/ti/ti-hash.c 338
-rw-r--r-- wolfcrypt/src/port/xilinx/xil-aesgcm.c 202
-rw-r--r-- wolfcrypt/src/port/xilinx/xil-sha3.c 158
59 files changed, 66470 insertions, 0 deletions
diff --git a/wolfcrypt/src/port/Espressif/README.md b/wolfcrypt/src/port/Espressif/README.md
new file mode 100644
index 0000000..4f0d0b5
--- /dev/null
+++ b/wolfcrypt/src/port/Espressif/README.md
@@ -0,0 +1,109 @@
+# ESP32 Port
+
+Support for the ESP32-WROOM-32 on-board crypto hardware acceleration for symmetric AES, SHA1/SHA256/SHA384/SHA512, and the RSA primitives mul, mulmod, and exptmod.
+
+## ESP32 Acceleration
+
+Details about the ESP32 hardware acceleration can be found in the [Technical Reference Manual](https://espressif.com/sites/default/files/documentation/esp32_technical_reference_manual_en.pdf).
+
+### Building
+
+To enable hardware acceleration:
+
+Uncomment `#define WOLFSSL_ESPIDF` in /path/to/wolfssl/wolfssl/wolfcrypt/settings.h
+Uncomment `#define WOLFSSL_ESPWROOM32` in /path/to/wolfssl/wolfssl/wolfcrypt/settings.h
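+
+With both lines uncommented, settings.h will contain:
+
+```
+#define WOLFSSL_ESPIDF
+#define WOLFSSL_ESPWROOM32
+```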
+
+To disable portions of the hardware acceleration you can optionally define:
+
+```
+/* Disable SHA, AES and RSA acceleration */
+#define NO_ESP32WROOM32_CRYPT
+/* Disable AES acceleration */
+#define NO_WOLFSSL_ESP32WROOM32_CRYPT_AES
+/* Disable SHA acceleration */
+#define NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH
+/* Disable RSA primitive acceleration */
+#define NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI
+```
+
+### Coding
+
+In your application you must include <wolfssl/wolfcrypt/settings.h> before any other wolfSSL headers. If building the sources directly we recommend defining `WOLFSSL_USER_SETTINGS` and adding your own `user_settings.h` file. You can find a good reference for this in `IDE/GCC-ARM/Header/user_settings.h`.
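+
+A minimal sketch of the expected include order (the wolfCrypt headers shown here are just examples):
+
+```
+#include <wolfssl/wolfcrypt/settings.h> /* must come first */
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+```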
+
+
+### Benchmarks
+
+Measured with the USE_FAST_MATH and WOLFSSL_SMALL_STACK options enabled.
+
+Software-only implementation:
+
+```
+AES-128-CBC-enc 1 MB took 1.001 seconds, 1.146 MB/s
+AES-128-CBC-dec 1 MB took 1.017 seconds, 1.104 MB/s
+AES-192-CBC-enc 1 MB took 1.018 seconds, 1.055 MB/s
+AES-192-CBC-dec 1 MB took 1.006 seconds, 1.019 MB/s
+AES-256-CBC-enc 1000 KB took 1.000 seconds, 1000.000 KB/s
+AES-256-CBC-dec 975 KB took 1.007 seconds, 968.222 KB/s
+AES-128-GCM-enc 350 KB took 1.055 seconds, 331.754 KB/s
+AES-128-GCM-dec 350 KB took 1.054 seconds, 332.068 KB/s
+AES-192-GCM-enc 325 KB took 1.013 seconds, 320.829 KB/s
+AES-192-GCM-dec 325 KB took 1.013 seconds, 320.829 KB/s
+AES-256-GCM-enc 325 KB took 1.041 seconds, 312.200 KB/s
+AES-256-GCM-dec 325 KB took 1.041 seconds, 312.200 KB/s
+SHA 6 MB took 1.004 seconds, 5.714 MB/s
+SHA-256 2 MB took 1.006 seconds, 1.747 MB/s
+SHA-384 1 MB took 1.011 seconds, 1.159 MB/s
+SHA-512 1 MB took 1.009 seconds, 1.161 MB/s
+HMAC-SHA 6 MB took 1.001 seconds, 5.634 MB/s
+HMAC-SHA256 2 MB took 1.000 seconds, 1.733 MB/s
+HMAC-SHA384 1 MB took 1.004 seconds, 1.046 MB/s
+HMAC-SHA512 1 MB took 1.002 seconds, 1.048 MB/s
+RSA 2048 public 16 ops took 1.056 sec, avg 66.000 ms, 15.152 ops/sec
+RSA 2048 private 2 ops took 2.488 sec, avg 1244.000 ms, 0.804 ops/sec
+ECC 256 key gen 4 ops took 1.101 sec, avg 275.250 ms, 3.633 ops/sec
+ECDHE 256 agree 4 ops took 1.098 sec, avg 274.500 ms, 3.643 ops/sec
+ECDSA 256 sign 4 ops took 1.111 sec, avg 277.750 ms, 3.600 ops/sec
+ECDSA 256 verify 2 ops took 1.099 sec, avg 549.500 ms, 1.820 ops/sec
+```
+
+Hardware acceleration:
+
+
+```
+AES-128-CBC-enc 6 MB took 1.004 seconds, 5.958 MB/s
+AES-128-CBC-dec 5 MB took 1.002 seconds, 5.287 MB/s
+AES-192-CBC-enc 6 MB took 1.004 seconds, 5.958 MB/s
+AES-192-CBC-dec 5 MB took 1.002 seconds, 5.287 MB/s
+AES-256-CBC-enc 6 MB took 1.001 seconds, 5.951 MB/s
+AES-256-CBC-dec 5 MB took 1.004 seconds, 5.277 MB/s
+AES-128-GCM-enc 375 KB took 1.067 seconds, 351.453 KB/s
+AES-128-GCM-dec 375 KB took 1.067 seconds, 351.453 KB/s
+AES-192-GCM-enc 350 KB took 1.010 seconds, 346.535 KB/s
+AES-192-GCM-dec 350 KB took 1.009 seconds, 346.878 KB/s
+AES-256-GCM-enc 350 KB took 1.016 seconds, 344.488 KB/s
+AES-256-GCM-dec 350 KB took 1.016 seconds, 344.488 KB/s
+SHA 14 MB took 1.000 seconds, 14.062 MB/s
+SHA-256 15 MB took 1.000 seconds, 15.234 MB/s
+SHA-384 17 MB took 1.000 seconds, 17.383 MB/s
+SHA-512 18 MB took 1.001 seconds, 17.512 MB/s
+HMAC-SHA 14 MB took 1.000 seconds, 13.818 MB/s
+HMAC-SHA256 15 MB took 1.001 seconds, 14.951 MB/s
+HMAC-SHA384 17 MB took 1.001 seconds, 16.683 MB/s
+HMAC-SHA512 17 MB took 1.000 seconds, 16.943 MB/s
+RSA 2048 public 20 ops took 1.017 sec, avg 50.850 ms, 19.666 ops/sec
+RSA 2048 private 4 ops took 1.059 sec, avg 264.750 ms, 3.777 ops/sec
+ECC 256 key gen 4 ops took 1.092 sec, avg 273.000 ms, 3.663 ops/sec
+ECDHE 256 agree 4 ops took 1.089 sec, avg 272.250 ms, 3.673 ops/sec
+ECDSA 256 sign 4 ops took 1.101 sec, avg 275.250 ms, 3.633 ops/sec
+ECDSA 256 verify 2 ops took 1.092 sec, avg 546.000 ms, 1.832 ops/sec
+```
+
+Condition :
+- Model : ESP32-WROOM-32
+- CPU Speed: 240 MHz
+- ESP-IDF : v3.3-beta1-39-g6cb37ecc5 (commit hash: 6cb37ecc5)
+- OS : Ubuntu 18.04.1 LTS (Bionic Beaver)
+
+## Support
+
+Email us at [[email protected]](mailto:[email protected]).
diff --git a/wolfcrypt/src/port/Espressif/esp32_aes.c b/wolfcrypt/src/port/Espressif/esp32_aes.c
new file mode 100644
index 0000000..f2fb8a5
--- /dev/null
+++ b/wolfcrypt/src/port/Espressif/esp32_aes.c
@@ -0,0 +1,299 @@
+/* esp32_aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifndef NO_AES
+
+#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+ !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include "wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h"
+
+static const char* TAG = "wolf_hw_aes";
+/* mutex */
+static wolfSSL_Mutex aes_mutex;
+static int espaes_CryptHwMutexInit = 0;
+
+/*
+* lock hw engine.
+* this should be called before using engine.
+*/
+static int esp_aes_hw_InUse()
+{
+ int ret = 0;
+
+ ESP_LOGV(TAG, "enter esp_aes_hw_InUse");
+
+ if(espaes_CryptHwMutexInit == 0) {
+ ret = esp_CryptHwMutexInit(&aes_mutex);
+ if(ret == 0){
+ espaes_CryptHwMutexInit = 1;
+ } else {
+ ESP_LOGE(TAG, "aes mutx initialization failed.");
+ return -1;
+ }
+ }
+ /* lock hardware */
+ ret = esp_CryptHwMutexLock(&aes_mutex, portMAX_DELAY);
+ if(ret != 0) {
+ ESP_LOGE(TAG, "aes engine lock failed.");
+ return -1;
+ }
+ /* Enable AES hardware */
+ periph_module_enable(PERIPH_AES_MODULE);
+
+ ESP_LOGV(TAG, "leave esp_aes_hw_InUse");
+ return ret;
+}
+
+/*
+* release hw engine
+*/
+static void esp_aes_hw_Leave( void )
+{
+ ESP_LOGV(TAG, "enter esp_aes_hw_Leave");
+ /* Disable AES hardware */
+ periph_module_disable(PERIPH_AES_MODULE);
+
+ /* unlock */
+ esp_CryptHwMutexUnLock(&aes_mutex);
+
+ ESP_LOGV(TAG, "leave esp_aes_hw_Leave");
+}
+
+/*
+ * set key to hardware key registers.
+ */
+static void esp_aes_hw_Set_KeyMode(Aes *ctx, ESP32_AESPROCESS mode)
+{
+ int i;
+ word32 mode_ = 0;
+
+ ESP_LOGV(TAG, "enter esp_aes_hw_Set_KeyMode");
+
+ /* check mode */
+ if(mode == ESP32_AES_UPDATEKEY_ENCRYPT) {
+ mode_ = 0;
+ } else if(mode == ESP32_AES_UPDATEKEY_DECRYPT){
+ mode_ = 4;
+ } else {
+ ESP_LOGE(TAG, "unexpected error.");
+ return;
+ }
+
+ /* update key */
+ for(i=0;i<(ctx->keylen)/sizeof(word32);i++){
+ DPORT_REG_WRITE(AES_KEY_BASE + (i*4), *(((word32*)ctx->key) + i));
+ }
+
+ /* mode
+ * 0 AES-128 Encryption
+ * 1 AES-192 Encryption
+ * 2 AES-256 Encryption
+ * 4 AES-128 Decryption
+ * 5 AES-192 Decryption
+ * 6 AES-256 Decryption
+ */
+ switch(ctx->keylen){
+ case 24: mode_ += 1; break;
+ case 32: mode_ += 2; break;
+ default: break;
+ }
+
+ DPORT_REG_WRITE(AES_MODE_REG, mode_);
+ ESP_LOGV(TAG, "leave esp_aes_hw_Setkey");
+}
+
+/*
+ * Process one block of AES
+ */
+static void esp_aes_bk(const byte* in, byte* out)
+{
+ const word32 *inwords = (const word32 *)in;
+ word32 *outwords = (word32 *)out;
+
+ ESP_LOGV(TAG, "enter esp_aes_bk");
+
+ /* copy text for encrypting/decrypting blocks */
+ DPORT_REG_WRITE(AES_TEXT_BASE, inwords[0]);
+ DPORT_REG_WRITE(AES_TEXT_BASE + 4, inwords[1]);
+ DPORT_REG_WRITE(AES_TEXT_BASE + 8, inwords[2]);
+ DPORT_REG_WRITE(AES_TEXT_BASE + 12, inwords[3]);
+
+ /* start engine */
+ DPORT_REG_WRITE(AES_START_REG, 1);
+
+    /* wait for the process to finish */
+ while(1) {
+ if(DPORT_REG_READ(AES_IDLE_REG) == 1)
+ break;
+ }
+
+ /* read-out blocks */
+ esp_dport_access_read_buffer(outwords, AES_TEXT_BASE, 4);
+ ESP_LOGV(TAG, "leave esp_aes_bk");
+}
+
+/*
+* wc_esp32AesEncrypt
+* @brief: encrypt one input block into the output block
+* @param aes: a pointer to the AES object used to encrypt data
+* @param in : a pointer to the input buffer containing the plain text to be encrypted
+* @param out: a pointer to the output buffer in which to store the cipher text of
+*             the encrypted message
+*/
+int wc_esp32AesEncrypt(Aes *aes, const byte* in, byte* out)
+{
+ ESP_LOGV(TAG, "enter wc_esp32AesEncrypt");
+ /* lock the hw engine */
+ esp_aes_hw_InUse();
+ /* load the key into the register */
+ esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_ENCRYPT);
+    /* process one block of AES */
+ esp_aes_bk(in, out);
+ /* release hw */
+ esp_aes_hw_Leave();
+ return 0;
+}
+/*
+* wc_esp32AesDecrypt
+* @brief: decrypt one input block into the output block
+* @param aes: a pointer to the AES object used to decrypt data
+* @param in : a pointer to the input buffer containing the cipher text to be decrypted
+* @param out: a pointer to the output buffer in which to store the plain text of
+*             the decrypted message
+*/
+int wc_esp32AesDecrypt(Aes *aes, const byte* in, byte* out)
+{
+ ESP_LOGV(TAG, "enter wc_esp32AesDecrypt");
+ /* lock the hw engine */
+ esp_aes_hw_InUse();
+ /* load the key into the register */
+ esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_DECRYPT);
+    /* process one block of AES */
+ esp_aes_bk(in, out);
+ /* release hw engine */
+ esp_aes_hw_Leave();
+ return 0;
+}
+/*
+* wc_esp32AesCbcEncrypt
+* @brief: Encrypts a plain text message from the input buffer, and places the
+* resulting cipher text into the output buffer using cipher block chaining
+* with AES.
+* @param aes: a pointer to the AES object used to encrypt data
+* @param out: a pointer to the output buffer in which to store the cipher text of
+*             the encrypted message
+* @param in : a pointer to the input buffer containing the plain text to be encrypted
+* @param sz : size of the input message
+*/
+int wc_esp32AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ int i;
+ int offset = 0;
+ word32 blocks = (sz / AES_BLOCK_SIZE);
+ byte *iv;
+ byte temp_block[AES_BLOCK_SIZE];
+
+ ESP_LOGV(TAG, "enter wc_esp32AesCbcEncrypt");
+
+ iv = (byte*)aes->reg;
+
+ esp_aes_hw_InUse();
+
+ esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_ENCRYPT);
+
+ while (blocks--) {
+ XMEMCPY(temp_block, in + offset, AES_BLOCK_SIZE);
+
+ /* XOR block with IV for CBC */
+ for (i = 0; i < AES_BLOCK_SIZE; i++)
+ temp_block[i] ^= iv[i];
+
+ esp_aes_bk(temp_block, (out + offset));
+
+ offset += AES_BLOCK_SIZE;
+
+ /* store IV for next block */
+ XMEMCPY(iv, out + offset - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ }
+
+ esp_aes_hw_Leave();
+ ESP_LOGV(TAG, "leave wc_esp32AesCbcEncrypt");
+ return 0;
+}
+/*
+* wc_esp32AesCbcDecrypt
+* @brief: Decrypts a cipher text message from the input buffer, and places the
+* resulting plain text into the output buffer, using cipher block chaining
+* with AES.
+* @param aes: a pointer to the AES object used to decrypt data
+* @param out: a pointer to the output buffer in which to store the plain text of
+*             the decrypted message
+* @param in : a pointer to the input buffer containing the cipher text to be decrypted
+* @param sz : size of the input message
+*/
+int wc_esp32AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ int i;
+ int offset = 0;
+ word32 blocks = (sz / AES_BLOCK_SIZE);
+ byte* iv;
+ byte temp_block[AES_BLOCK_SIZE];
+
+ ESP_LOGV(TAG, "enter wc_esp32AesCbcDecrypt");
+
+ iv = (byte*)aes->reg;
+
+ esp_aes_hw_InUse();
+
+ esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_DECRYPT);
+
+ while (blocks--) {
+ XMEMCPY(temp_block, in + offset, AES_BLOCK_SIZE);
+
+ esp_aes_bk((in + offset), (out + offset));
+
+ /* XOR block with IV for CBC */
+ for (i = 0; i < AES_BLOCK_SIZE; i++)
+ (out + offset)[i] ^= iv[i];
+
+ /* store IV for next block */
+ XMEMCPY(iv, temp_block, AES_BLOCK_SIZE);
+
+ offset += AES_BLOCK_SIZE;
+ }
+
+ esp_aes_hw_Leave();
+ ESP_LOGV(TAG, "leave wc_esp32AesCbcDecrypt");
+ return 0;
+}
+
+#endif /* WOLFSSL_ESP32WROOM32_CRYPT */
+#endif /* NO_AES */
diff --git a/wolfcrypt/src/port/Espressif/esp32_mp.c b/wolfcrypt/src/port/Espressif/esp32_mp.c
new file mode 100644
index 0000000..2174089
--- /dev/null
+++ b/wolfcrypt/src/port/Espressif/esp32_mp.c
@@ -0,0 +1,514 @@
+/* esp32_mp.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h>
+
+#include "wolfssl/wolfcrypt/logging.h"
+
+#if !defined(NO_RSA) || defined(HAVE_ECC)
+
+#if defined(WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI) && \
+ !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI)
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+#include <wolfssl/wolfcrypt/tfm.h>
+
+static const char* const TAG = "wolfssl_mp";
+
+#define ESP_HW_RSAMAX_BIT 4096
+#define ESP_HW_MULTI_RSAMAX_BITS 2048
+#define ESP_HW_RSAMIN_BIT 512
+#define BYTE_TO_WORDS(s) (((s+3)>>2)) /* (s+(4-1))/ 4 */
+#define BITS_TO_WORDS(s) (((s+31)>>3)>>2) /* (s+(32-1))/ 8/ 4*/
+
+#define MP_NG -1
+
+/* mutex */
+static wolfSSL_Mutex mp_mutex;
+static int espmp_CryptHwMutexInit = 0;
+/*
+* check if the hw is ready before accessing it
+*/
+static int esp_mp_hw_wait_clean()
+{
+ int timeout = 0;
+ while(++timeout < ESP_RSA_TIMEOUT && DPORT_REG_READ(RSA_CLEAN_REG) != 1){}
+
+ if(timeout >= ESP_RSA_TIMEOUT) {
+ ESP_LOGE(TAG, "waiting hw ready is time-outed.");
+ return MP_NG;
+ }
+ return MP_OKAY;
+}
+/*
+* lock hw engine.
+* this should be called before using engine.
+*/
+static int esp_mp_hw_lock()
+{
+ int ret = 0;
+
+ if(espmp_CryptHwMutexInit == 0) {
+ ret = esp_CryptHwMutexInit(&mp_mutex);
+ if(ret == 0){
+ espmp_CryptHwMutexInit = 1;
+ } else {
+ ESP_LOGE(TAG, "mp mutx initialization failed.");
+ return MP_NG;
+ }
+ }
+ /* lock hardware */
+ ret = esp_CryptHwMutexLock(&mp_mutex, portMAX_DELAY);
+ if(ret != 0) {
+ ESP_LOGE(TAG, "mp engine lock failed.");
+ return MP_NG;
+ }
+ /* Enable RSA hardware */
+ periph_module_enable(PERIPH_RSA_MODULE);
+
+ return ret;
+}
+/*
+* Release hw engine
+*/
+static void esp_mp_hw_unlock( void )
+{
+ /* Disable RSA hardware */
+ periph_module_disable(PERIPH_RSA_MODULE);
+
+ /* unlock */
+ esp_CryptHwMutexUnLock(&mp_mutex);
+}
+/* this is based on an article by Cetin Kaya Koc, A New Algorithm for Inversion*/
+/* mod p^k, June 28 2017. */
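+/* Computes md = -(M^-1) mod 2^k, i.e. the Montgomery constant M' that the */
+/* hw uses to reduce products modulo M without division. */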
+static int esp_calc_Mdash(mp_int *M, word32 k, mp_digit* md)
+{
+ int i;
+ int xi;
+ int b0 = 1;
+ int bi;
+ word32 N = 0;
+ word32 x;
+
+ N = M->dp[0];
+ bi = b0;
+ x = 0;
+
+ for(i = 0; i < k; i++) {
+ xi = bi % 2;
+ if(xi < 0){
+ xi *= -1;
+ }
+ bi = (bi - N * xi) / 2;
+ x |= (xi << i);
+ }
+ /* 2's complement */
+ *md = ~x + 1;
+ return MP_OKAY;
+}
+/* start hw process */
+static void process_start(word32 reg)
+{
+ /* clear interrupt */
+ DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
+ /* start process */
+ DPORT_REG_WRITE(reg, 1);
+}
+/* wait until done */
+static int wait_until_done(word32 reg)
+{
+ int timeout = 0;
+    /* wait until done, or timeout */
+    while(++timeout < ESP_RSA_TIMEOUT && DPORT_REG_READ(reg) != 1){ }
+
+ /* clear interrupt */
+ DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
+
+ if(timeout >= ESP_RSA_TIMEOUT) {
+ ESP_LOGE(TAG, "rsa operation is time-outed.");
+ return MP_NG;
+ }
+
+ return MP_OKAY;
+}
+/* read data from memory into an mp_int */
+static void esp_memblock_to_mpint(word32 mem_address, mp_int* mp, word32 numwords)
+{
+ esp_dport_access_read_buffer((uint32_t*)mp->dp, mem_address, numwords);
+ mp->used = numwords;
+}
+
+/* write an mp_int into a memory block */
+static void esp_mpint_to_memblock(word32 mem_address, const mp_int* mp,
+ const word32 bits,
+ const word32 hwords)
+{
+ word32 i;
+ word32 len = (bits / 8 + ((bits & 7) != 0 ? 1 : 0));
+
+ len = (len+sizeof(word32)-1)/sizeof(word32);
+
+ for(i=0;i < hwords; i++) {
+ if(i < len) {
+ DPORT_REG_WRITE(mem_address + (i * sizeof(word32)), mp->dp[i]);
+ } else {
+ DPORT_REG_WRITE(mem_address + (i * sizeof(word32)), 0);
+ }
+ }
+}
+/* return needed hw words. */
+/* supported words length */
+/* words : {16 , 32, 48, 64, 80, 96, 112, 128} */
+/* bits : {512,1024, 1536, 2048, 2560, 3072, 3584, 4096} */
+static word32 words2hwords(word32 wd)
+{
+    const word32 shift_ = 4;
+
+    return (((wd + 0xf)>>shift_)<<shift_);
+}
+/* count the number of words needed for the given bits */
+static word32 bits2words(word32 bits)
+{
+ /* 32 bits */
+ const word32 d = sizeof(word32) * WOLFSSL_BIT_SIZE;
+
+ return((bits + (d - 1))/d);
+}
+/* compute r_inv = R^2 mod M, where R = 2^exp */
+static int esp_get_rinv(mp_int *rinv, mp_int *M, word32 exp)
+{
+ int ret = 0;
+
+ /* 2^(exp)*/
+ if((ret = mp_2expt(rinv, exp)) != MP_OKAY) {
+ ESP_LOGE(TAG, "failed to calculate mp_2expt()");
+ return ret;
+ }
+
+ /* r_inv = R^2 mod M(=P) */
+ if(ret == 0 && (ret = mp_mod(rinv, M, rinv)) != MP_OKAY){
+ ESP_LOGE(TAG, "failed to calculate mp_mod()");
+ return ret;
+ }
+
+ return ret;
+}
+/* Z = X * Y; */
+int esp_mp_mul(fp_int* X, fp_int* Y, fp_int* Z)
+{
+ int ret = 0;
+ int neg = (X->sign == Y->sign)? MP_ZPOS : MP_NEG;
+
+ word32 Xs;
+ word32 Ys;
+ word32 Zs;
+ word32 maxWords_sz;
+ word32 hwWords_sz;
+
+    /* get the number of bits */
+ Xs = mp_count_bits(X);
+ Ys = mp_count_bits(Y);
+ Zs = Xs + Ys;
+
+ /* maximum bits and words for writing to hw */
+ maxWords_sz = bits2words(max(Xs, Ys));
+ hwWords_sz = words2hwords(maxWords_sz);
+
+ /* sanity check */
+ if((hwWords_sz<<5) > ESP_HW_MULTI_RSAMAX_BITS) {
+ ESP_LOGW(TAG, "exceeds max bit length(2048)");
+ return -2;
+ }
+
+ /*Steps to use hw in the following order:
+ * 1. wait until clean hw engine
+ * 2. Write(2*N/512bits - 1 + 8) to MULT_MODE_REG
+ * 3. Write X and Y to memory blocks
+ * need to write data to each memory block only according to the length
+ * of the number.
+ * 4. Write 1 to MUL_START_REG
+ * 5. Wait for the first operation to be done. Poll INTERRUPT_REG until it reads 1.
+ * (Or until the INTER interrupt is generated.)
+ * 6. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
+ * 7. Read the Z from RSA_Z_MEM
+     * 8. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
+ * 9. Release the hw engine
+ */
+ /* lock hw for use */
+ if((ret = esp_mp_hw_lock()) != MP_OKAY)
+ return ret;
+
+    if((ret = esp_mp_hw_wait_clean()) != MP_OKAY){
+        esp_mp_hw_unlock();
+        return ret;
+    }
+
+ /* step.1 (2*N/512) => N/256. 512 bits => 16 words */
+ DPORT_REG_WRITE(RSA_MULT_MODE_REG, (hwWords_sz >> 3) - 1 + 8);
+ /* step.2 write X, M and r_inv into memory */
+ esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
+    /* Y (left-extend) */
+ esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE + (hwWords_sz<<2), Y, Ys, hwWords_sz);
+ /* step.3 start process */
+ process_start(RSA_MULT_START_REG);
+
+ /* step.4,5 wait until done */
+    wait_until_done(RSA_INTERRUPT_REG);
+    /* step.6 read the result from MEM_Z */
+ esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, BITS_TO_WORDS(Zs));
+
+ /* step.7 clear and release hw */
+ esp_mp_hw_unlock();
+
+ Z->sign = (Z->used > 0)? neg : MP_ZPOS;
+
+ return ret;
+}
+/* Z = X * Y (mod M) */
+int esp_mp_mulmod(fp_int* X, fp_int* Y, fp_int* M, fp_int* Z)
+{
+ int ret = 0;
+ int negcheck = 0;
+ word32 Xs;
+ word32 Ys;
+ word32 Ms;
+ word32 maxWords_sz;
+ word32 hwWords_sz;
+ word32 zwords;
+
+ mp_int r_inv;
+ mp_int tmpZ;
+ mp_digit mp;
+
+ /* neg check */
+ if(X->sign != Y->sign) {
+ /* X*Y becomes negative */
+ negcheck = 1;
+ }
+    /* get the number of bits */
+ Xs = mp_count_bits(X);
+ Ys = mp_count_bits(Y);
+ Ms = mp_count_bits(M);
+
+ /* maximum bits and words for writing to hw */
+ maxWords_sz = bits2words(max(Xs, max(Ys, Ms)));
+ zwords = bits2words(min(Ms, Xs + Ys));
+ hwWords_sz = words2hwords(maxWords_sz);
+
+ if((hwWords_sz<<5) > ESP_HW_RSAMAX_BIT) {
+ ESP_LOGE(TAG, "exceeds hw maximum bits");
+ return -2;
+ }
+    /* calculate r_inv = R^2 mod M
+ * where: R = b^n, and b = 2^32
+ * accordingly R^2 = 2^(n*32*2)
+ */
+ ret = mp_init_multi(&tmpZ, &r_inv, NULL, NULL, NULL, NULL);
+ if(ret == 0 && (ret = esp_get_rinv(&r_inv, M, (hwWords_sz<<6))) != MP_OKAY) {
+ ESP_LOGE(TAG, "calculate r_inv failed.");
+ mp_clear(&tmpZ);
+ mp_clear(&r_inv);
+ return ret;
+ }
+ /* lock hw for use */
+ if((ret = esp_mp_hw_lock()) != MP_OKAY){
+ mp_clear(&tmpZ);
+ mp_clear(&r_inv);
+ return ret;
+ }
+ /* Calculate M' */
+ if((ret = esp_calc_Mdash(M, 32/* bits */, &mp)) != MP_OKAY) {
+ ESP_LOGE(TAG, "failed to calculate M dash");
+ mp_clear(&tmpZ);
+ mp_clear(&r_inv);
+ return -1;
+ }
+ /*Steps to use hw in the following order:
+ * 1. wait until clean hw engine
+ * 2. Write(N/512bits - 1) to MULT_MODE_REG
+ * 3. Write X,M(=G, X, P) to memory blocks
+ * need to write data to each memory block only according to the length
+ * of the number.
+ * 4. Write M' to M_PRIME_REG
+ * 5. Write 1 to MODEXP_START_REG
+ * 6. Wait for the first operation to be done. Poll INTERRUPT_REG until it reads 1.
+ * (Or until the INTER interrupt is generated.)
+ * 7. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
+ * 8. Write Y to RSA_X_MEM
+ * 9. Write 1 to RSA_MULT_START_REG
+ * 10. Wait for the second operation to be completed. Poll INTERRUPT_REG until it reads 1.
+ * 11. Read the Z from RSA_Z_MEM
+     * 12. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
+ * 13. Release the hw engine
+ */
+
+    if((ret = esp_mp_hw_wait_clean()) != MP_OKAY){
+        mp_clear(&tmpZ);
+        mp_clear(&r_inv);
+        esp_mp_hw_unlock();
+        return ret;
+    }
+ /* step.1 512 bits => 16 words */
+ DPORT_REG_WRITE(RSA_MULT_MODE_REG, (hwWords_sz >> 4) - 1);
+
+ /* step.2 write X, M and r_inv into memory */
+ esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
+ esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
+ esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE, &r_inv, mp_count_bits(&r_inv),
+ hwWords_sz);
+ /* step.3 write M' into memory */
+ DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
+ /* step.4 start process */
+ process_start(RSA_MULT_START_REG);
+
+ /* step.5,6 wait until done */
+    wait_until_done(RSA_INTERRUPT_REG);
+ /* step.7 Y to MEM_X */
+ esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, Y, Ys, hwWords_sz);
+
+ /* step.8 start process */
+ process_start(RSA_MULT_START_REG);
+
+ /* step.9,11 wait until done */
+    wait_until_done(RSA_INTERRUPT_REG);
+
+ /* step.12 read the result from MEM_Z */
+ esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, &tmpZ, zwords);
+
+ /* step.13 clear and release hw */
+ esp_mp_hw_unlock();
+
+ /* additional steps */
+    /* needed for a known issue where Z can come back greater than M */
+ if(mp_cmp(&tmpZ, M)==FP_GT) {
+ /* Z -= M */
+ mp_sub(&tmpZ, M, &tmpZ);
+ }
+ if(negcheck) {
+ mp_sub(M, &tmpZ, &tmpZ);
+ }
+
+ mp_copy(&tmpZ, Z);
+
+ mp_clear(&tmpZ);
+ mp_clear(&r_inv);
+
+ return ret;
+}
+/* Z = X^Y mod M */
+int esp_mp_exptmod(fp_int* X, fp_int* Y, word32 Ys, fp_int* M, fp_int* Z)
+{
+ int ret = 0;
+
+ word32 Xs;
+ word32 Ms;
+ word32 maxWords_sz;
+ word32 hwWords_sz;
+
+ mp_int r_inv;
+ mp_digit mp;
+
+    /* get the number of bits */
+ Xs = mp_count_bits(X);
+ Ms = mp_count_bits(M);
+ /* maximum bits and words for writing to hw */
+ maxWords_sz = bits2words(max(Xs, max(Ys, Ms)));
+ hwWords_sz = words2hwords(maxWords_sz);
+
+ if((hwWords_sz<<5) > ESP_HW_RSAMAX_BIT) {
+ ESP_LOGE(TAG, "exceeds hw maximum bits");
+ return -2;
+ }
+    /* calculate r_inv = R^2 mod M
+ * where: R = b^n, and b = 2^32
+ * accordingly R^2 = 2^(n*32*2)
+ */
+ ret = mp_init(&r_inv);
+ if(ret == 0 && (ret = esp_get_rinv(&r_inv, M, (hwWords_sz<<6))) != MP_OKAY) {
+ ESP_LOGE(TAG, "calculate r_inv failed.");
+ mp_clear(&r_inv);
+ return ret;
+ }
+ /* lock and init the hw */
+ if((ret = esp_mp_hw_lock()) != MP_OKAY) {
+ mp_clear(&r_inv);
+ return ret;
+ }
+ /* calc M' */
+ /* if Pm is odd, uses mp_montgomery_setup() */
+ if((ret = esp_calc_Mdash(M, 32/* bits */, &mp)) != MP_OKAY) {
+ ESP_LOGE(TAG, "failed to calculate M dash");
+ mp_clear(&r_inv);
+ return -1;
+ }
+
+ /*Steps to use hw in the following order:
+ * 1. Write(N/512bits - 1) to MODEXP_MODE_REG
+ * 2. Write X, Y, M and r_inv to memory blocks
+ * need to write data to each memory block only according to the length
+ * of the number.
+ * 3. Write M' to M_PRIME_REG
+ * 4. Write 1 to MODEXP_START_REG
+ * 5. Wait for the operation to be done. Poll INTERRUPT_REG until it reads 1.
+ * (Or until the INTER interrupt is generated.)
+ * 6. Read the result Z(=Y) from Z_MEM
+ * 7. Write 1 to INTERRUPT_REG to clear the interrupt.
+ */
+    if((ret = esp_mp_hw_wait_clean()) != MP_OKAY){
+        mp_clear(&r_inv);
+        esp_mp_hw_unlock();
+        return ret;
+    }
+
+ /* step.1 */
+ DPORT_REG_WRITE(RSA_MODEXP_MODE_REG, (hwWords_sz >> 4) - 1);
+ /* step.2 write G, X, P, r_inv and M' into memory */
+ esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
+ esp_mpint_to_memblock(RSA_MEM_Y_BLOCK_BASE, Y, Ys, hwWords_sz);
+ esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
+ esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE, &r_inv, mp_count_bits(&r_inv),
+ hwWords_sz);
+ /* step.3 write M' into memory */
+ DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
+ /* step.4 start process */
+ process_start(RSA_START_MODEXP_REG);
+
+ /* step.5 wait until done */
+    wait_until_done(RSA_INTERRUPT_REG);
+    /* step.6 read the result from memory */
+ esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, BITS_TO_WORDS(Ms));
+ /* step.7 clear and release hw */
+ esp_mp_hw_unlock();
+
+ mp_clear(&r_inv);
+
+ return ret;
+}
+#endif /* !NO_RSA || HAVE_ECC */
+#endif /* WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI && !NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI */
diff --git a/wolfcrypt/src/port/Espressif/esp32_sha.c b/wolfcrypt/src/port/Espressif/esp32_sha.c
new file mode 100644
index 0000000..94789cd
--- /dev/null
+++ b/wolfcrypt/src/port/Espressif/esp32_sha.c
@@ -0,0 +1,434 @@
+/* esp32_sha.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if !defined(NO_SHA) || !defined(NO_SHA256) || defined(WC_SHA384) || \
+ defined(WC_SHA512)
+
+#include "wolfssl/wolfcrypt/logging.h"
+
+
+#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+ !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH)
+
+#include <wolfssl/wolfcrypt/sha.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#include "wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h"
+#include "wolfssl/wolfcrypt/error-crypt.h"
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+static const char* TAG = "wolf_hw_sha";
+/* continue register offset */
+#define CONTINUE_REG_OFFSET (0x04) /* start_reg + 0x04 */
+
+#ifdef NO_SHA
+ #define WC_SHA_DIGEST_SIZE 20
+#endif
+/* mutex */
+#if defined(SINGLE_THREADED)
+static int InUse = 0;
+#else
+static wolfSSL_Mutex sha_mutex;
+static int espsha_CryptHwMutexInit = 0;
+#endif
+/*
+ enum SHA_TYPE {
+ SHA1 = 0,
+ SHA2_256,
+ SHA2_384,
+ SHA2_512,
+ SHA_INVALID = -1,
+ };
+*/
+static word32 esp_sha_digest_size(enum SHA_TYPE type)
+{
+ ESP_LOGV(TAG, "enter esp_sha_digest_size");
+
+ switch(type){
+#ifndef NO_SHA
+ case SHA1:
+ return WC_SHA_DIGEST_SIZE;
+#endif
+#ifndef NO_SHA256
+ case SHA2_256:
+ return WC_SHA256_DIGEST_SIZE;
+#endif
+#ifdef WOLFSSL_SHA384
+ case SHA2_384:
+ return WC_SHA384_DIGEST_SIZE;
+#endif
+#ifdef WOLFSSL_SHA512
+ case SHA2_512:
+ return WC_SHA512_DIGEST_SIZE;
+#endif
+ default:
+ ESP_LOGE(TAG, "Bad sha type");
+ return WC_SHA_DIGEST_SIZE;
+ }
+ ESP_LOGV(TAG, "leave esp_sha_digest_size");
+}
+/*
+* wait until engines becomes idle
+*/
+static void esp_wait_until_idle()
+{
+ while((DPORT_REG_READ(SHA_1_BUSY_REG) !=0) ||
+ (DPORT_REG_READ(SHA_256_BUSY_REG)!=0) ||
+ (DPORT_REG_READ(SHA_384_BUSY_REG)!=0) ||
+ (DPORT_REG_READ(SHA_512_BUSY_REG)!=0)){ }
+}
+/*
+* lock hw engine.
+* this should be called before using engine.
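+* if the hw engine is already in use, ctx->mode falls back to ESP32_SHA_SW
+* and the rest of this hash is computed in software.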
+*/
+int esp_sha_try_hw_lock(WC_ESP32SHA* ctx)
+{
+ int ret = 0;
+
+ ESP_LOGV(TAG, "enter esp_sha_hw_lock");
+
+ /* Init mutex */
+#if defined(SINGLE_THREADED)
+ if(ctx->mode == ESP32_SHA_INIT) {
+ if(!InUse) {
+ ctx->mode = ESP32_SHA_HW;
+ InUse = 1;
+ } else {
+ ctx->mode = ESP32_SHA_SW;
+ }
+ } else {
+        /* this should not happen */
+ ESP_LOGE(TAG, "unexpected error in esp_sha_try_hw_lock.");
+ return -1;
+ }
+#else
+ if(espsha_CryptHwMutexInit == 0){
+ ret = esp_CryptHwMutexInit(&sha_mutex);
+ if(ret == 0) {
+ espsha_CryptHwMutexInit = 1;
+ } else {
+ ESP_LOGE(TAG, " mutex initialization failed.");
+ ctx->mode = ESP32_SHA_SW;
+ return 0;
+ }
+ }
+ /* check if this sha has been operated as sw or hw, or not yet init */
+ if(ctx->mode == ESP32_SHA_INIT){
+ /* try to lock the hw engine */
+ if(esp_CryptHwMutexLock(&sha_mutex, (TickType_t)0) == 0) {
+ ctx->mode = ESP32_SHA_HW;
+ } else {
+ ESP_LOGI(TAG, "someone used. hw is locked.....");
+ ESP_LOGI(TAG, "the rest of operation will use sw implementation for this sha");
+ ctx->mode = ESP32_SHA_SW;
+ return 0;
+ }
+ } else {
+        /* this should not happen */
+ ESP_LOGE(TAG, "unexpected error in esp_sha_try_hw_lock.");
+ return -1;
+ }
+#endif
+ /* Enable SHA hardware */
+ periph_module_enable(PERIPH_SHA_MODULE);
+
+ ESP_LOGV(TAG, "leave esp_sha_hw_lock");
+ return ret;
+}
+/*
+* release hw engine
+*/
+void esp_sha_hw_unlock( void )
+{
+ ESP_LOGV(TAG, "enter esp_sha_hw_unlock");
+
+    /* Disable SHA hardware */
+ periph_module_disable(PERIPH_SHA_MODULE);
+#if defined(SINGLE_THREADED)
+ InUse = 0;
+#else
+ /* unlock hw engine for next use */
+ esp_CryptHwMutexUnLock(&sha_mutex);
+#endif
+ ESP_LOGV(TAG, "leave esp_sha_hw_unlock");
+}
+/*
+* start sha process by using hw engine
+*/
+static void esp_sha_start_process(WC_ESP32SHA* sha, word32 address)
+{
+ ESP_LOGV(TAG, "enter esp_sha_start_process");
+
+ if(sha->isfirstblock){
+ /* start first message block */
+ DPORT_REG_WRITE(address, 1);
+ sha->isfirstblock = 0;
+ } else {
+        /* CONTINUE_REG */
+ DPORT_REG_WRITE(address + CONTINUE_REG_OFFSET , 1);
+ }
+
+ ESP_LOGV(TAG, "leave esp_sha_start_process");
+}
+/*
+* process message block
+*/
+static void esp_process_block(WC_ESP32SHA* ctx, word32 address,
+ const word32* data, word32 len)
+{
+ int i;
+
+ ESP_LOGV(TAG, "enter esp_process_block");
+
+    /* check if any engine is busy */
+ esp_wait_until_idle();
+ /* load message data into hw */
+ for(i=0;i<((len)/(sizeof(word32)));++i){
+ DPORT_REG_WRITE(SHA_TEXT_BASE+(i*sizeof(word32)),*(data+i));
+ }
+ /* notify hw to start process */
+ esp_sha_start_process(ctx, address);
+
+ ESP_LOGV(TAG, "leave esp_process_block");
+}
+/*
+* retrieve sha digest from memory
+*/
+static void esp_digest_state(WC_ESP32SHA* ctx, byte* hash, enum SHA_TYPE sha_type)
+{
+ /* registers */
+ word32 SHA_LOAD_REG = SHA_1_LOAD_REG;
+ word32 SHA_BUSY_REG = SHA_1_BUSY_REG;
+
+ ESP_LOGV(TAG, "enter esp_digest_state");
+
+ /* sanity check */
+ if(sha_type == SHA_INVALID) {
+ ESP_LOGE(TAG, "unexpected error. sha_type is invalid.");
+ return;
+ }
+
+ SHA_LOAD_REG += (sha_type << 4);
+ SHA_BUSY_REG += (sha_type << 4);
+
+ if(ctx->isfirstblock == 1){
+ /* no hardware use yet. Nothing to do yet */
+ return ;
+ }
+
+ /* wait until idle */
+ esp_wait_until_idle();
+
+ /* LOAD final digest */
+ DPORT_REG_WRITE(SHA_LOAD_REG, 1);
+ /* wait until done */
+ while(DPORT_REG_READ(SHA_BUSY_REG) == 1){ }
+
+ esp_dport_access_read_buffer((word32*)(hash), SHA_TEXT_BASE,
+ esp_sha_digest_size(sha_type)/sizeof(word32));
+
+#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
+ if(sha_type==SHA2_384||sha_type==SHA2_512) {
+ word32 i;
+ word32* pwrd1 = (word32*)(hash);
+        /* swap adjacent 32-bit words (the hw returns each 64-bit */
+        /* word with its halves reversed) */
+ for(i = 0; i <WC_SHA512_DIGEST_SIZE/4; i+=2 ) {
+ pwrd1[i] ^= pwrd1[i+1];
+ pwrd1[i+1]^= pwrd1[i];
+ pwrd1[i] ^= pwrd1[i+1];
+ }
+ }
+#endif
+
+ ESP_LOGV(TAG, "leave esp_digest_state");
+}
+
+#ifndef NO_SHA
+/*
+* sha1 process
+*/
+int esp_sha_process(struct wc_Sha* sha, const byte* data)
+{
+ int ret = 0;
+
+ ESP_LOGV(TAG, "enter esp_sha_process");
+
+ word32 SHA_START_REG = SHA_1_START_REG;
+
+ esp_process_block(&sha->ctx, SHA_START_REG, (const word32*)data,
+ WC_SHA_BLOCK_SIZE);
+
+ ESP_LOGV(TAG, "leave esp_sha_process");
+ return ret;
+}
+/*
+* retrieve sha1 digest
+*/
+int esp_sha_digest_process(struct wc_Sha* sha, byte blockproc)
+{
+ int ret = 0;
+
+ ESP_LOGV(TAG, "enter esp_sha_digest_process");
+
+ if(blockproc) {
+ word32 SHA_START_REG = SHA_1_START_REG;
+
+ esp_process_block(&sha->ctx, SHA_START_REG, sha->buffer,
+ WC_SHA_BLOCK_SIZE);
+ }
+
+ esp_digest_state(&sha->ctx, (byte*)sha->digest, SHA1);
+
+ ESP_LOGV(TAG, "leave esp_sha_digest_process");
+
+ return ret;
+}
+#endif /* NO_SHA */
+
+
+#ifndef NO_SHA256
+/*
+* sha256 process
+*/
+int esp_sha256_process(struct wc_Sha256* sha, const byte* data)
+{
+ int ret = 0;
+ word32 SHA_START_REG = SHA_1_START_REG;
+
+ ESP_LOGV(TAG, "enter esp_sha256_process");
+
+ /* start register offset */
+ SHA_START_REG += (SHA2_256 << 4);
+
+ esp_process_block(&sha->ctx, SHA_START_REG, (const word32*)data,
+ WC_SHA256_BLOCK_SIZE);
+
+ ESP_LOGV(TAG, "leave esp_sha256_process");
+
+ return ret;
+}
+/*
+* retrieve sha256 digest
+*/
+int esp_sha256_digest_process(struct wc_Sha256* sha, byte blockproc)
+{
+ int ret = 0;
+
+ ESP_LOGV(TAG, "enter esp_sha256_digest_process");
+
+ if(blockproc) {
+ word32 SHA_START_REG = SHA_1_START_REG + (SHA2_256 << 4);
+
+ esp_process_block(&sha->ctx, SHA_START_REG, sha->buffer,
+ WC_SHA256_BLOCK_SIZE);
+ }
+
+ esp_digest_state(&sha->ctx, (byte*)sha->digest, SHA2_256);
+
+ ESP_LOGV(TAG, "leave esp_sha256_digest_process");
+ return ret;
+}
+#endif /* NO_SHA256 */
+
+#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
+/*
+* sha512 process. this is used for sha384 too.
+*/
+void esp_sha512_block(struct wc_Sha512* sha, const word32* data, byte isfinal)
+{
+ enum SHA_TYPE sha_type = sha->ctx.sha_type;
+ word32 SHA_START_REG = SHA_1_START_REG;
+
+ ESP_LOGV(TAG, "enter esp_sha512_block");
+ /* start register offset */
+ SHA_START_REG += (sha_type << 4);
+
+ if(sha->ctx.mode == ESP32_SHA_SW){
+ ByteReverseWords64(sha->buffer, sha->buffer,
+ WC_SHA512_BLOCK_SIZE);
+ if(isfinal){
+ sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha->hiLen;
+ sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha->loLen;
+ }
+
+ } else {
+ ByteReverseWords((word32*)sha->buffer, (word32*)sha->buffer,
+ WC_SHA512_BLOCK_SIZE);
+ if(isfinal){
+ sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] =
+ rotlFixed64(sha->hiLen, 32U);
+ sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] =
+ rotlFixed64(sha->loLen, 32U);
+ }
+
+ esp_process_block(&sha->ctx, SHA_START_REG, data, WC_SHA512_BLOCK_SIZE);
+ }
+ ESP_LOGV(TAG, "leave esp_sha512_block");
+}
+/*
+* sha512 process. this is used for sha384 too.
+*/
+int esp_sha512_process(struct wc_Sha512* sha)
+{
+ word32 *data = (word32*)sha->buffer;
+
+ ESP_LOGV(TAG, "enter esp_sha512_process");
+
+ esp_sha512_block(sha, data, 0);
+
+ ESP_LOGV(TAG, "leave esp_sha512_process");
+ return 0;
+}
+/*
+* retrieve sha512 digest. this is used for sha384 too.
+*/
+int esp_sha512_digest_process(struct wc_Sha512* sha, byte blockproc)
+{
+ ESP_LOGV(TAG, "enter esp_sha512_digest_process");
+
+ if(blockproc) {
+ word32* data = (word32*)sha->buffer;
+
+ esp_sha512_block(sha, data, 1);
+ }
+ if(sha->ctx.mode != ESP32_SHA_SW)
+ esp_digest_state(&sha->ctx, (byte*)sha->digest, sha->ctx.sha_type);
+
+ ESP_LOGV(TAG, "leave esp_sha512_digest_process");
+ return 0;
+}
+#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
+#endif /* WOLFSSL_ESP32WROOM32_CRYPT */
+#endif /* !defined(NO_SHA) ||... */
diff --git a/wolfcrypt/src/port/Espressif/esp32_util.c b/wolfcrypt/src/port/Espressif/esp32_util.c
new file mode 100644
index 0000000..b501b5e
--- /dev/null
+++ b/wolfcrypt/src/port/Espressif/esp32_util.c
@@ -0,0 +1,67 @@
+/* esp32_util.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+ (!defined(NO_AES) || !defined(NO_SHA) || !defined(NO_SHA256) ||\
+ defined(WOLFSSL_SHA384) || defined(WOLFSSL_SHA512))
+
+#include <wolfssl/wolfcrypt/wc_port.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+int esp_CryptHwMutexInit(wolfSSL_Mutex* mutex) {
+ return wc_InitMutex(mutex);
+}
+
+int esp_CryptHwMutexLock(wolfSSL_Mutex* mutex, TickType_t xBlockTime) {
+#ifdef SINGLE_THREADED
+ return wc_LockMutex(mutex);
+#else
+ return ((xSemaphoreTake( *mutex, xBlockTime ) == pdTRUE) ? 0 : BAD_MUTEX_E);
+#endif
+}
+
+int esp_CryptHwMutexUnLock(wolfSSL_Mutex* mutex) {
+ return wc_UnLockMutex(mutex);
+}
+
+#endif
+
+#ifdef WOLFSSL_ESP32WROOM32_CRYPT_DEBUG
+
+#include "esp_timer.h"
+#include "esp_log.h"
+
+static uint64_t startTime = 0;
+
+
+void wc_esp32TimerStart()
+{
+ startTime = esp_timer_get_time();
+}
+
+uint64_t wc_esp32elapsedTime()
+{
+    /* return elapsed time since wc_esp32TimerStart() was called, in us */
+ return esp_timer_get_time() - startTime;
+}
+
+#endif /*WOLFSSL_ESP32WROOM32_CRYPT_DEBUG */
diff --git a/wolfcrypt/src/port/Renesas/README.md b/wolfcrypt/src/port/Renesas/README.md
new file mode 100644
index 0000000..ca60bc5
--- /dev/null
+++ b/wolfcrypt/src/port/Renesas/README.md
@@ -0,0 +1,176 @@
+# TSIP FIT Module port
+Support for the TSIP FIT driver: symmetric AES and SHA1/SHA256 hardware acceleration, plus TLS-linked capability including verification of the Root CA, server, and intermediate certificates.
+
+## Overview
+The Renesas TSIP FIT module is enabled in wolfSSL by setting the *WOLFSSL_RENESAS_TSIP* definition.
+
+The port includes the following examples:
+
+* simple tls_client/tls_server
+* crypt test
+* crypt benchmark
+
+ The *user_settings.h* file enables some of the hardened settings.
+
+## Requirements
+### 1. [Renesas TSIP FIT module](https://www.renesas.com/us/en/products/software-tools/software-os-middleware-driver/security-crypto/trusted-secure-ip-driver.html)
+[FIT module](https://www.renesas.com/us/en/products/software-tools/software-os-middleware-driver/software-package/fit.html)
+Note : The included example program has been tested with <u>TSIP FIT version **1.06**</u>.
+
+### 2. [e2studio](https://www.renesas.com/us/en/products/software-tools/tools/ide/e2studio.html)
+
+### 3. Evaluation Board that supports TSIP
+Note : The included example program has been tested with [GR-ROSE](http://gadget.renesas.com/en/product/rose.html), which is classified as RX65N.
+
+## Setup and Build wolfSSL library
+ 1. Uncomment `#define WOLFSSL_RENESAS_TSIP` in /path/to/wolfssl/wolfssl/wolfcrypt/settings.h
+ Uncomment `#define WOLFSSL_RENESAS_RX65N` in /path/to/wolfssl/wolfssl/wolfcrypt/settings.h (see the snippet below)
+ 2. Open the project file at /path/to/wolfssl/IDE/Renesas/e2studio/Projects/wolfssl/ with e2studio and build it to create the wolfssl library
+Note : Generating the FIT module source files in advance is required to compile wolfSSL when WOLFSSL_RENESAS_TSIP and WOLFSSL_RENESAS_RX65N are enabled. See "Setup and Build an example program" below for creating the FIT module files.
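+
+With both lines uncommented, settings.h will contain:
+
+```
+#define WOLFSSL_RENESAS_TSIP
+#define WOLFSSL_RENESAS_RX65N
+```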
+
+To disable portions of the hardware acceleration you can optionally define:
+
+```
+/* Disable SHA acceleration */
+#define NO_WOLFSSL_RENESAS_TSIP_CRYPT_HASH
+/* Disable TLS-linked acceleration */
+#define NO_WOLFSSL_RENESAS_TSIP_TLS_SESSION
+```
+### Benchmarks
+**Software-only implementation:**
+*crypto primitives*
+```
+RNG 200 KB took 1.099 seconds, 182.000 KB/s
+SHA 1 MB took 1.005 seconds, 1.166 MB/s
+SHA-256 425 KB took 1.038 seconds, 409.520 KB/s
+```
+
+*TLS establishment time*
+```
+TLS_RSA_WITH_AES_128_CBC_SHA : 0.651 (s)
+TLS_RSA_WITH_AES_128_CBC_SHA256 : 0.651 (s)
+TLS_RSA_WITH_AES_256_CBC_SHA : 0.642 (s)
+TLS_RSA_WITH_AES_256_CBAC_SHA256 : 0.662 (s)
+TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 : 2.050 (s)
+```
+**Hardware acceleration:**
+*crypto primitives*
+```
+RNG 1 MB took 1.011 seconds, 1.038 MB/s
+SHA 12 MB took 1.001 seconds, 11.515 MB/s
+SHA-256 13 MB took 1.001 seconds, 12.900 MB/s
+```
+*TLS establishment time with TLS-linked capability*
+*Performing the full TLS-linked capability*
+```
+TLS_RSA_WITH_AES_128_CBC_SHA : 0.141 (s)
+TLS_RSA_WITH_AES_128_CBC_SHA256 : 0.141 (s)
+TLS_RSA_WITH_AES_256_CBC_SHA : 0.141 (s)
+TLS_RSA_WITH_AES_256_CBAC_SHA256 : 0.144 (s)
+```
+*Performing certificate verification via the TSIP TLS-linked API*
+```
+TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 : 1.721 (s)
+```
+Condition:
+Renesas : e2studio v7.4.0
+Toolchain : Renesas CCRX version 3.00.00
+TSIP FIT : version 1.06
+Board : [GR-ROSE](http://gadget.renesas.com/en/product/rose.html)
+wolfSSL : 4.1.0
+
+
+## Setup and Build an example program
+An example program expects the following FIT modules:
+
+* r_bsp
+* r_cmt_rx
+* r_config
+* r_ether_rx
+* r_sys_time_rx
+* r_t4_driver_rx
+* r_t4_rx
+* r_tsip_rx
+
+The needed source files can be generated by creating a dummy project that includes the Renesas Smart Configurator, as in the steps below:
+
+ 1. Create a dummy project including Renesas Smart Configurator for your evaluation board type
+ 2. Open Smart Configurator and add FIT modules above
+ You may need to increase the *User Stack Size* and *Heap Size* properties of r_bsp.
+ Change the IP ADDRESS and PORT NUMBER in r_t4_rx_config.h:
+ `#define T4_CFG_FIXED_IP_ADDRESS_CH0 192,168,1,33`
+ `#define T4_CFG_TCP_REPID1_PORT_NUMBER 11111`
+ Note: Other configuration may need to be modified based on the evaluation board.
+
+ When using GR-ROSE, you can choose "GR-ROSE" from the "board" tab and "board" drop-down list, and then follow the settings below:
+
+ Go to component tab and open r_ether_rx properties:
+ Ethernet interface : RMII
+ The register bus of PHY0 for ETHER0/1: Use ETHER0
+ Resource, ETHERC: Check ETHERC0_RMII
+
+ Go to component tab and open r_t4_rx properties:
+ Enable/Disable DHCP function : 0
+ IP address for ch0, when DHCP disable : 192,168,1,33
+ TCP REPID1 port number : 11111
+
+ Go to pins tab and select ethernet controller
+ Check to use pins
+
+ 3. Generate source code
+Now the FIT modules can be copied into an example project.
+ 4. Make "smc_gen" folder under /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/src/
+ 5. Copy the FIT modules into the folder that is created at step 4.
+ 6. Open an example project file at /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/ by e2studio
+ 7. Enable a macro definition in /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/src/wolfssl_demo.h for application type
+ `#define CRYPT_TEST // enable crypt test`
+ `#define BENCHMARK // enable benchmark application`
+ `#define TLS_CLIENT // enable simple tls client application`
+ `#define TLS_SERVER // enable simple tls server application`
+ `#define USE_TSIP_TLS // to inform user key and flash keying, when using TSIP`
+ Note: CRYPT_TEST and BENCHMARK can be enabled at the same time. TLS_CLIENT and TLS_SERVER cannot be enabled together other definitions.
+ 7. Setup debug configuration based on your debug hardware
+
+## Run client/server program on the device
+When testing the embedded client or server on the device, it is recommended to test against one of the standard wolfSSL example applications running on a desktop machine.
+
+
+For the embedded client, an example server command to run on a desktop machine (IP address 192.168.1.45) is as follows:
+`$./examples/server/server -b -d -i` (bind to any interface, disable client cert check, loop for repeated connections)
+
+
+For the embedded server, an example client command to run on a desktop machine is as follows:
+`$./examples/client/client -h 192.168.1.33 -p 11111`
+
+## Modify an example program
+To use your own TSIP keys with the TSIP TLS-linked APIs, you need your own flash keyring, a PSS-signed signature, and an RSA key.
+
+### Create flash keyring and use it in an example program
+ 1. Follow the instructions in the TSIP manual, chapter 7, Key Data Operations.
+ 2. Copy and paste the s_flash[] data over the s_flash[] data in example-program/key_data.c
+`const uint32_t s_flash[] =`
+
+### Create an RSA key pair for Root CA verification and use it in an example program
+ To use the TSIP TLS-linked APIs, you need an RSA key pair and a Root CA certificate bundle signature created with RSA 2048 PSS and SHA256.
+ The shell and Perl scripts in /path/to/wolfssl/IDE/Renesas/e2studio/Projects/tools/ can be used for this purpose.
+
+ * generate_rsa_keypair.sh : generates an RSA 2048-bit key pair. Shows the modulus and public exponent when the "-s" option is specified
+ * rsa_pss_sign.sh : signs the file with the specified private key
+ * genhexbuf.pl : generates a C header file containing a byte array built from the file specified in the script
+
+ The modulus and public exponent shown by `generate_rsa_keypair.sh` can be used as input data to the Renesas Secure Flash Programmer to generate encrypted RSA keys for TSIP TLS-linked API use. Please follow the instructions on how to generate RSA keys in the TSIP manual.
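+
+ For example, to generate the key pair and print its modulus and public exponent (run from the tools directory above; "-s" is the option described in the list of scripts):
+
+ `$ ./generate_rsa_keypair.sh -s`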
+
+
+ The byte array generated from the signed signature by genhexbuf.pl can replace the signature data in key_data.c of an example program.
+
+
+ The encrypted RSA key and the generated byte array of the signed signature need to be provided to the wolfSSL library before loading the CA certificate. See the SetTsipTlskey() function in an example program for how to provide them.
+
+### Coding
+
+In your application you must include <wolfssl/wolfcrypt/settings.h> before any other wolfSSL headers. If building the sources directly we recommend defining `WOLFSSL_USER_SETTINGS` and adding your own `user_settings.h` file. You can find a good reference for this in `/path/to/Renesas/e2studio/Projects/common/user_settings.h`.
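+
+A minimal sketch, assuming `WOLFSSL_USER_SETTINGS` is defined on the compiler command line and user_settings.h is on the include path:
+
+```
+#include <wolfssl/wolfcrypt/settings.h> /* must come first; pulls in user_settings.h */
+#include <wolfssl/ssl.h>
+```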
+
+## Support
+ For questions, please email [[email protected]](mailto:[email protected]).
+
+
diff --git a/wolfcrypt/src/port/Renesas/renesas_tsip_aes.c b/wolfcrypt/src/port/Renesas/renesas_tsip_aes.c
new file mode 100644
index 0000000..ce04ff5
--- /dev/null
+++ b/wolfcrypt/src/port/Renesas/renesas_tsip_aes.c
@@ -0,0 +1,156 @@
+/* renesas_tsip_aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h>
+#include <stdio.h>
+
+#ifndef NO_AES
+
+#if defined(WOLFSSL_RENESAS_TSIP_CRYPT) && \
+ !defined(NO_WOLFSSL_RENESAS_TSIP_CRYPT_AES)
+
+#include <wolfssl/wolfcrypt/wc_port.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include "wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h"
+
+struct Aes;
+
+int wc_tsip_AesCbcEncrypt(struct Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ tsip_aes_handle_t _handle;
+ word32 ret;
+ word32 blocks = (sz / AES_BLOCK_SIZE);
+ uint32_t dataLength;
+ byte *iv;
+
+ if ((in == NULL) || (out == NULL) || (aes == NULL))
+ return BAD_FUNC_ARG;
+
+    /* while doing the TLS handshake, the TSIP driver keeps the true key *
+     * and iv on the device; the iv passed here is a dummy */
+ iv = (uint8_t*)aes->reg;
+
+ if((ret = tsip_hw_lock()) != 0){
+ WOLFSSL_MSG("Failed to lock");
+ return ret;
+ }
+
+ if (aes->ctx.keySize == 16) {
+ ret = R_TSIP_Aes128CbcEncryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
+ } else if (aes->ctx.keySize == 32) {
+ ret = R_TSIP_Aes256CbcEncryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
+ } else {
+ tsip_hw_unlock();
+ return -1;
+ }
+
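+    /* feed the message one AES block at a time through the TSIP update call */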
+ while (ret == TSIP_SUCCESS && blocks--) {
+
+ if (aes->ctx.keySize == 16)
+ ret = R_TSIP_Aes128CbcEncryptUpdate(&_handle, (uint8_t*)in,
+ (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
+ else
+ ret = R_TSIP_Aes256CbcEncryptUpdate(&_handle, (uint8_t*)in,
+ (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
+
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+
+ if (ret == TSIP_SUCCESS) {
+ if (aes->ctx.keySize == 16) {
+ ret = R_TSIP_Aes128CbcEncryptFinal(&_handle, out, &dataLength);
+ } else {
+ ret = R_TSIP_Aes256CbcEncryptFinal(&_handle, out, &dataLength);
+ }
+ } else {
+ WOLFSSL_MSG("TSIP AES CBC encryption failed");
+ ret = -1;
+ }
+
+ tsip_hw_unlock();
+ return ret;
+}
+
+int wc_tsip_AesCbcDecrypt(struct Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ tsip_aes_handle_t _handle;
+ word32 ret;
+ word32 blocks = (sz / AES_BLOCK_SIZE);
+ uint32_t dataLength;
+ byte *iv;
+
+ if ((in == NULL) || (out == NULL) || (aes == NULL))
+ return BAD_FUNC_ARG;
+
+ iv = (uint8_t*)aes->reg;
+
+ if((ret = tsip_hw_lock()) != 0){
+ WOLFSSL_MSG("Failed to lock");
+ return ret;
+ }
+
+ if (aes->ctx.keySize == 16) {
+ ret = R_TSIP_Aes128CbcDecryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
+ } else if (aes->ctx.keySize == 32) {
+ ret = R_TSIP_Aes256CbcDecryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
+ } else {
+ tsip_hw_unlock();
+ return -1;
+ }
+
+ while (ret == TSIP_SUCCESS && blocks--) {
+
+ if (aes->ctx.keySize == 16)
+ ret = R_TSIP_Aes128CbcDecryptUpdate(&_handle, (uint8_t*)in,
+ (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
+ else
+ ret = R_TSIP_Aes256CbcDecryptUpdate(&_handle, (uint8_t*)in,
+ (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
+
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+
+ if (ret == TSIP_SUCCESS) {
+ if (aes->ctx.keySize == 16)
+ ret = R_TSIP_Aes128CbcDecryptFinal(&_handle, out, &dataLength);
+ else
+ ret = R_TSIP_Aes256CbcDecryptFinal(&_handle, out, &dataLength);
+ } else {
+ WOLFSSL_MSG("TSIP AES CBC decryption failed");
+ ret = -1;
+ }
+
+ tsip_hw_unlock();
+ return ret;
+}
+
+#endif /* WOLFSSL_RENESAS_TSIP_CRYPT */
+#endif /* NO_AES */
diff --git a/wolfcrypt/src/port/Renesas/renesas_tsip_sha.c b/wolfcrypt/src/port/Renesas/renesas_tsip_sha.c
new file mode 100644
index 0000000..b12d8ee
--- /dev/null
+++ b/wolfcrypt/src/port/Renesas/renesas_tsip_sha.c
@@ -0,0 +1,274 @@
+/* renesas_tsip_sha.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if !defined(NO_SHA) || !defined(NO_SHA256)
+
+#include <wolfssl/wolfcrypt/logging.h>
+
+#if defined(WOLFSSL_RENESAS_TSIP_CRYPT)
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h>
+
+#if !defined(NO_SHA)
+#include <wolfssl/wolfcrypt/sha.h>
+
+static void TSIPHashFree(wolfssl_TSIP_Hash* hash)
+{
+ if (hash == NULL)
+ return;
+
+ if (hash->msg != NULL) {
+ XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ hash->msg = NULL;
+ }
+}
+
+static int TSIPHashInit(wolfssl_TSIP_Hash* hash, void* heap, int devId,
+ word32 sha_type)
+{
+ if (hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ (void)devId;
+ XMEMSET(hash, 0, sizeof(wolfssl_TSIP_Hash));
+
+ hash->heap = heap;
+ hash->len = 0;
+ hash->used = 0;
+ hash->msg = NULL;
+ hash->sha_type = sha_type;
+
+ return 0;
+}
+
+static int TSIPHashUpdate(wolfssl_TSIP_Hash* hash, const byte* data, word32 sz)
+{
+ if (hash == NULL || (sz > 0 && data == NULL)) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (hash->len < hash->used + sz) {
+ if (hash->msg == NULL) {
+ hash->msg = (byte*)XMALLOC(hash->used + sz, hash->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ } else {
+#ifdef FREERTOS
+ byte* pt = (byte*)XMALLOC(hash->used + sz, hash->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ if (pt == NULL) {
+ return MEMORY_E;
+ }
+ XMEMCPY(pt, hash->msg, hash->used);
+ XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ hash->msg = NULL;
+ hash->msg = pt;
+#else
+ byte* pt = (byte*)XREALLOC(hash->msg, hash->used + sz, hash->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ if (pt == NULL) {
+ return MEMORY_E;
+ }
+ hash->msg = pt;
+#endif
+ }
+ if (hash->msg == NULL) {
+ return MEMORY_E;
+ }
+ hash->len = hash->used + sz;
+ }
+ XMEMCPY(hash->msg + hash->used, data , sz);
+ hash->used += sz;
+
+ return 0;
+}
+
+static int TSIPHashFinal(wolfssl_TSIP_Hash* hash, byte* out, word32 outSz)
+{
+ int ret;
+ void* heap;
+ tsip_sha_md5_handle_t handle;
+ uint32_t sz;
+
+ e_tsip_err_t (*Init)(tsip_sha_md5_handle_t*);
+ e_tsip_err_t (*Update)(tsip_sha_md5_handle_t*, uint8_t*, uint32_t);
+ e_tsip_err_t (*Final )(tsip_sha_md5_handle_t*, uint8_t*, uint32_t*);
+
+ if (hash == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (hash->sha_type == TSIP_SHA1) {
+ Init = R_TSIP_Sha1Init;
+ Update = R_TSIP_Sha1Update;
+ Final = R_TSIP_Sha1Final;
+ } else if (hash->sha_type == TSIP_SHA256) {
+ Init = R_TSIP_Sha256Init;
+ Update = R_TSIP_Sha256Update;
+ Final = R_TSIP_Sha256Final;
+ } else
+ return BAD_FUNC_ARG;
+
+ heap = hash->heap;
+
+ tsip_hw_lock();
+
+ if (Init(&handle) == TSIP_SUCCESS) {
+ ret = Update(&handle, (uint8_t*)hash->msg, hash->used);
+ if (ret == TSIP_SUCCESS) {
+ ret = Final(&handle, out, (uint32_t*)&sz);
+            if (ret != TSIP_SUCCESS || sz != outSz) {
+                tsip_hw_unlock(); /* release the engine before failing */
+                return ret;
+            }
+ }
+ }
+ tsip_hw_unlock();
+
+ TSIPHashFree(hash);
+ return TSIPHashInit(hash, heap, 0, hash->sha_type);
+}
+
+static int TSIPHashGet(wolfssl_TSIP_Hash* hash, byte* out, word32 outSz)
+{
+ int ret;
+ tsip_sha_md5_handle_t handle;
+ uint32_t sz;
+
+ e_tsip_err_t (*Init)(tsip_sha_md5_handle_t*);
+ e_tsip_err_t (*Update)(tsip_sha_md5_handle_t*, uint8_t*, uint32_t);
+ e_tsip_err_t (*Final )(tsip_sha_md5_handle_t*, uint8_t*, uint32_t*);
+
+ if (hash == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (hash->sha_type == TSIP_SHA1) {
+ Init = R_TSIP_Sha1Init;
+ Update = R_TSIP_Sha1Update;
+ Final = R_TSIP_Sha1Final;
+ } else if (hash->sha_type == TSIP_SHA256) {
+ Init = R_TSIP_Sha256Init;
+ Update = R_TSIP_Sha256Update;
+ Final = R_TSIP_Sha256Final;
+ } else
+ return BAD_FUNC_ARG;
+
+ tsip_hw_lock();
+
+ if (Init(&handle) == TSIP_SUCCESS) {
+ ret = Update(&handle, (uint8_t*)hash->msg, hash->used);
+ if (ret == TSIP_SUCCESS) {
+ ret = Final(&handle, out, &sz);
+            if (ret != TSIP_SUCCESS || sz != outSz) {
+                tsip_hw_unlock(); /* release the engine before failing */
+                return ret;
+            }
+ }
+ }
+
+ tsip_hw_unlock();
+
+ return 0;
+}
+
+static int TSIPHashCopy(wolfssl_TSIP_Hash* src, wolfssl_TSIP_Hash* dst)
+{
+ if (src == NULL || dst == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY(dst, src, sizeof(wolfssl_TSIP_Hash));
+
+ if (src->len > 0 && src->msg != NULL) {
+ dst->msg = (byte*)XMALLOC(src->len, dst->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ if (dst->msg == NULL) {
+ return MEMORY_E;
+ }
+ XMEMCPY(dst->msg, src->msg, src->len);
+ }
+
+ return 0;
+}
+/* wolfCrypt SHA-1 API wrappers, backed by the TSIP helpers above */
+int wc_InitSha_ex(wc_Sha* sha, void* heap, int devId)
+{
+ return TSIPHashInit(sha, heap, devId, TSIP_SHA1);
+}
+
+int wc_ShaUpdate(wc_Sha* sha, const byte* in, word32 sz)
+{
+ return TSIPHashUpdate(sha, in, sz);
+}
+
+int wc_ShaFinal(wc_Sha* sha, byte* hash)
+{
+ return TSIPHashFinal(sha, hash, WC_SHA_DIGEST_SIZE);
+}
+
+int wc_ShaGetHash(wc_Sha* sha, byte* hash)
+{
+ return TSIPHashGet(sha, hash, WC_SHA_DIGEST_SIZE);
+}
+
+int wc_ShaCopy(wc_Sha* src, wc_Sha* dst)
+{
+ return TSIPHashCopy(src, dst);
+}
+#endif /* !NO_SHA */
+
+#if !defined(NO_SHA256)
+#include <wolfssl/wolfcrypt/sha256.h>
+
+/* wolfCrypt SHA-256 API wrappers, backed by the TSIP helpers above */
+int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
+{
+ return TSIPHashInit(sha, heap, devId, TSIP_SHA256);
+}
+
+int wc_Sha256Update(wc_Sha256* sha, const byte* in, word32 sz)
+{
+ return TSIPHashUpdate(sha, in, sz);
+}
+
+int wc_Sha256Final(wc_Sha256* sha, byte* hash)
+{
+ return TSIPHashFinal(sha, hash, WC_SHA256_DIGEST_SIZE);
+}
+
+int wc_Sha256GetHash(wc_Sha256* sha, byte* hash)
+{
+ return TSIPHashGet(sha, hash, WC_SHA256_DIGEST_SIZE);
+}
+
+int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
+{
+ return TSIPHashCopy(src, dst);
+}
+#endif /* !NO_SHA256 */
+#endif /* WOLFSSL_RENESAS_TSIP_CRYPT */
+#endif /* #if !defined(NO_SHA) || !defined(NO_SHA256) */
diff --git a/wolfcrypt/src/port/Renesas/renesas_tsip_util.c b/wolfcrypt/src/port/Renesas/renesas_tsip_util.c
new file mode 100644
index 0000000..e3cd7ad
--- /dev/null
+++ b/wolfcrypt/src/port/Renesas/renesas_tsip_util.c
@@ -0,0 +1,719 @@
+/* renesas_tsip_util.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_RENESAS_TSIP)
+
+#include <wolfssl/wolfcrypt/wc_port.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#include <wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h>
+#include <wolfssl/wolfcrypt/memory.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/ssl.h>
+#include <wolfssl/internal.h>
+
+#include <stdio.h>
+/* mutex */
+wolfSSL_Mutex tsip_mutex;
+static int tsip_CryptHwMutexInit_ = 0;
+
+/* e.g. ./ca-cert.der.sign */
+/* these variables are expected to be defined by the user application */
+extern uint32_t s_flash[];
+extern uint32_t s_inst1[R_TSIP_SINST_WORD_SIZE];
+extern uint32_t s_inst2[R_TSIP_SINST2_WORD_SIZE];
+static const byte *ca_cert_sig;
+
+/* user key */
+static tsip_key_data g_user_key_info;
+/* TSIP keeps only one encrypted CA public key */
+#if defined(WOLFSSL_RENESAS_TSIP_TLS)
+static uint32_t g_encrypted_publicCA_key[R_TSIP_SINST_WORD_SIZE];
+static uint32_t g_CAscm_Idx; /* index of CM table */
+#endif
+
+static int tsip_CryptHwMutexInit(wolfSSL_Mutex* mutex) {
+ return wc_InitMutex(mutex);
+}
+
+static int tsip_CryptHwMutexLock(wolfSSL_Mutex* mutex) {
+ return wc_LockMutex(mutex);
+}
+
+static int tsip_CryptHwMutexUnLock(wolfSSL_Mutex* mutex) {
+ return wc_UnLockMutex(mutex);
+}
+
+/*
+* Lock the TSIP hardware engine.
+* Must be called before using the engine.
+*/
+int tsip_hw_lock(void)
+{
+ int ret = 0;
+
+ WOLFSSL_MSG("enter esp_sha_hw_lock");
+
+    if (tsip_CryptHwMutexInit_ == 0) {
+        ret = tsip_CryptHwMutexInit(&tsip_mutex);
+        if (ret == 0) {
+            tsip_CryptHwMutexInit_ = 1;
+        } else {
+            WOLFSSL_MSG(" mutex initialization failed.");
+            return -1;
+        }
+    }
+    if (tsip_CryptHwMutexLock(&tsip_mutex) != 0) {
+        /* this should not happen */
+        return -1;
+    }
+
+ WOLFSSL_MSG("leave tsip_sha_try_hw_lock");
+ return ret;
+}
+
+/*
+* release hw engine
+*/
+void tsip_hw_unlock( void )
+{
+ WOLFSSL_MSG("enter tsip_hw_unlock");
+ /* unlock hw engine for next use */
+ tsip_CryptHwMutexUnLock(&tsip_mutex);
+ WOLFSSL_MSG("leave tsip_hw_unlock");
+}
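+
+/* Locking pattern sketch (illustrative): every R_TSIP_* driver call in this
+ * port is expected to be bracketed by the pair above. R_TSIP_SomeOperation
+ * below is a placeholder, not a real driver function:
+ *
+ *     if (tsip_hw_lock() != 0)
+ *         return -1;                        // mutex init or lock failed
+ *     ret = R_TSIP_SomeOperation(...);      // any TSIP driver call
+ *     tsip_hw_unlock();                     // always release the engine
+ */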
+/* check if tsip tls functions can be used for the cipher suite   */
+/* cipher0 : first byte of the cipher suite; greater than 0x00    */
+/*           for some suites, otherwise 0x00                      */
+/* side    : WOLFSSL_CLIENT_END or WOLFSSL_SERVER_END             */
+int tsip_useable(const struct WOLFSSL *ssl)
+{
+ byte cipher0;
+ byte cipher;
+ byte side;
+
+ /* sanity check */
+ if (ssl == NULL)
+ return BAD_FUNC_ARG;
+
+ /* when rsa key index == NULL, tsip isn't used for cert verification. */
+ /* in the case, we cannot use TSIP. */
+ if (!ssl->peerTsipEncRsaKeyIndex)
+ return 0;
+
+    /* when Extended Master Secret is enabled, TSIP cannot be used. */
+ if (ssl->options.haveEMS)
+ return 0;
+
+ cipher0 = ssl->options.cipherSuite0;
+ cipher = ssl->options.cipherSuite;
+ side = ssl->options.side;
+
+ if (cipher0 > 0x00)
+ return 0;
+
+ if ((cipher == l_TLS_RSA_WITH_AES_128_CBC_SHA ||
+ cipher == l_TLS_RSA_WITH_AES_128_CBC_SHA256 ||
+ cipher == l_TLS_RSA_WITH_AES_256_CBC_SHA ||
+ cipher == l_TLS_RSA_WITH_AES_256_CBC_SHA256) &&
+ side == WOLFSSL_CLIENT_END)
+ return 1;
+ else
+ return 0;
+}
+
+/* check if the g_alreadyVerified CA's key can be used for *
+ * peer's certification */
+byte tsip_checkCA(word32 cmIdx)
+{
+ return (cmIdx == g_CAscm_Idx? 1:0);
+}
+
+/* check if the root CA has been verified by TSIP *
+ * and exists in the CM table. */
+byte tsip_rootCAverified(void)
+{
+ return (g_CAscm_Idx != (uint32_t)-1 ? 1:0);
+}
+
+/* open TSIP driver for use */
+int tsip_Open(void) {
+
+ int ret;
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ /* open the TSIP */
+ ret = R_TSIP_Open((uint32_t*)s_flash, s_inst1, s_inst2);
+ if( ret != TSIP_SUCCESS ) {
+ WOLFSSL_MSG("RENESAS TSIP Open failed");
+ }
+
+#if defined(WOLFSSL_RENESAS_TSIP_TLS)
+ /* generate TLS Rsa public key for Certificate verification */
+ if (ret == TSIP_SUCCESS && g_user_key_info.encrypted_user_tls_key) {
+ ret = R_TSIP_GenerateTlsRsaPublicKeyIndex(
+ g_user_key_info.encrypted_session_key,
+ g_user_key_info.iv,
+ g_user_key_info.encrypted_user_tls_key,
+ &g_user_key_info.user_rsa2048_tls_pubindex);
+
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG("R_TSIP_GenerateTlsRsaPublicKeyIndex failed");
+ } else {
+ /* close once */
+ tsip_Close( );
+ /* open again with s_inst[] */
+ XMEMCPY(s_inst1,
+ g_user_key_info.user_rsa2048_tls_pubindex.value,
+ sizeof(s_inst1));
+ ret = R_TSIP_Open((uint32_t*)s_flash, s_inst1, s_inst2);
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG("R_TSIP_(Re)Open failed");
+ }
+ /* init vars */
+ g_CAscm_Idx = (uint32_t)-1;
+ }
+ }
+#endif
+ /* unlock hw */
+ tsip_hw_unlock();
+ } else
+ WOLFSSL_MSG("Failed to lock tsip hw \n");
+
+ return ret;
+}
+
+/* close TSIP driver */
+void tsip_Close(void) {
+ int ret;
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ /* close TSIP */
+ ret = R_TSIP_Close();
+#if defined(WOLFSSL_RENESAS_TSIP_TLS)
+ g_CAscm_Idx = (uint32_t)-1;
+#endif
+ /* unlock hw */
+ tsip_hw_unlock();
+ if( ret != TSIP_SUCCESS ) {
+ WOLFSSL_MSG("RENESAS TSIP Close failed");
+ }
+ } else
+ WOLFSSL_MSG("Failed to unlock tsip hw \n");
+}
+
+/* Support functions for TSIP TLS Capability */
+#if defined(WOLFSSL_RENESAS_TSIP_TLS)
+
+/* register the CA certificate signature */
+/* the signature format is expected to be RSA-2048 PSS with SHA-256 */
+void tsip_inform_cert_sign(const byte *sign)
+{
+    if (sign)
+        ca_cert_sig = sign;
+}
+
+/* register user key material */
+/* this function is expected to be called from the user application. */
+/* the user must create this key information with the Renesas tool in advance. */
+void tsip_inform_user_keys(
+ byte *encrypted_session_key,
+ byte *iv,
+ byte *encrypted_user_tls_key
+)
+{
+ g_user_key_info.encrypted_session_key = NULL;
+ g_user_key_info.iv = NULL;
+ g_user_key_info.encrypted_user_tls_key = NULL;
+
+    if (encrypted_session_key) {
+        g_user_key_info.encrypted_session_key = encrypted_session_key;
+    }
+    if (iv) {
+        g_user_key_info.iv = iv;
+    }
+    if (encrypted_user_tls_key) {
+        g_user_key_info.encrypted_user_tls_key = encrypted_user_tls_key;
+    }
+}
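+
+/* Provisioning sketch (application side; the buffer names are hypothetical,
+ * the key material comes from the Renesas key-wrapping tool): register the
+ * wrapped keys and the CA signature before opening the driver:
+ *
+ *     extern byte enc_session_key[];   // wrapped provisioning key
+ *     extern byte session_iv[];        // IV used when wrapping
+ *     extern byte enc_tls_rsa_key[];   // wrapped TLS RSA public key
+ *     extern byte ca_signature[];      // RSA-2048 PSS w/ SHA-256 over CA cert
+ *
+ *     tsip_inform_user_keys(enc_session_key, session_iv, enc_tls_rsa_key);
+ *     tsip_inform_cert_sign(ca_signature);
+ *     tsip_Open();
+ */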
+
+#ifndef NO_WOLFSSL_RENESAS_TSIP_TLS_SESSION
+/* convert def to tsip define */
+static byte _tls2tsipdef(byte cipher)
+{
+ byte def = R_TSIP_TLS_RSA_WITH_AES_128_CBC_SHA;
+ switch(cipher){
+ case l_TLS_RSA_WITH_AES_128_CBC_SHA:
+ break;
+ case l_TLS_RSA_WITH_AES_128_CBC_SHA256:
+ def = R_TSIP_TLS_RSA_WITH_AES_128_CBC_SHA256;
+ break;
+ case l_TLS_RSA_WITH_AES_256_CBC_SHA:
+ def = R_TSIP_TLS_RSA_WITH_AES_256_CBC_SHA;
+ break;
+ case l_TLS_RSA_WITH_AES_256_CBC_SHA256:
+ def = R_TSIP_TLS_RSA_WITH_AES_256_CBC_SHA256;
+ break;
+    default:
+        break;
+ }
+ return def;
+}
+
+/* Sha1Hmac */
+int tsip_Sha1Hmac(const struct WOLFSSL *ssl, const byte *myInner,
+ word32 innerSz, const byte *in, word32 sz, byte *digest,
+ word32 verify)
+{
+ tsip_hmac_sha_handle_t _handle;
+ tsip_hmac_sha_key_index_t key_index;
+ int ret;
+
+ if ((ssl == NULL) || (myInner == NULL) || (in == NULL) ||
+ (digest == NULL))
+ return BAD_FUNC_ARG;
+
+ if ((ret = tsip_hw_lock()) != 0) {
+ WOLFSSL_MSG("hw lock failed\n");
+ return ret;
+ }
+
+ if ( (ssl->options.side == WOLFSSL_CLIENT_END && !verify) ||
+ (ssl->options.side == WOLFSSL_SERVER_END && verify) )
+ XMEMCPY(key_index.value, ssl->keys.tsip_client_write_MAC_secret,
+ sizeof(key_index.value));
+ else
+ XMEMCPY(key_index.value, ssl->keys.tsip_server_write_MAC_secret,
+ sizeof(key_index.value));
+
+ ret = R_TSIP_Sha1HmacGenerateInit(&_handle, &key_index);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha1HmacGenerateUpdate(&_handle, (uint8_t*)myInner,
+ (uint32_t)innerSz);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha1HmacGenerateUpdate(&_handle, (uint8_t*)in, sz);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha1HmacGenerateFinal(&_handle, digest);
+
+ /* unlock hw */
+ tsip_hw_unlock();
+
+ return ret;
+}
+
+/* Sha256Hmac */
+int tsip_Sha256Hmac(const struct WOLFSSL *ssl, const byte *myInner,
+ word32 innerSz, const byte *in, word32 sz, byte *digest,
+ word32 verify)
+{
+ tsip_hmac_sha_handle_t _handle;
+ tsip_hmac_sha_key_index_t key_index;
+ int ret;
+
+ if ((ssl == NULL) || (myInner == NULL) || (in == NULL) ||
+ (digest == NULL))
+ return BAD_FUNC_ARG;
+
+ if ( (ssl->options.side == WOLFSSL_CLIENT_END && !verify) ||
+ (ssl->options.side == WOLFSSL_SERVER_END && verify) )
+ XMEMCPY(key_index.value, ssl->keys.tsip_client_write_MAC_secret,
+ sizeof(key_index.value));
+ else
+ XMEMCPY(key_index.value, ssl->keys.tsip_server_write_MAC_secret,
+ sizeof(key_index.value));
+
+ if ((ret = tsip_hw_lock()) != 0) {
+ WOLFSSL_MSG("hw lock failed\n");
+ return ret;
+ }
+
+ ret = R_TSIP_Sha256HmacGenerateInit(&_handle, &key_index);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha256HmacGenerateUpdate(&_handle, (uint8_t*)myInner,
+ innerSz);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha256HmacGenerateUpdate(&_handle, (uint8_t*)in, sz);
+
+ if (ret == TSIP_SUCCESS)
+ ret = R_TSIP_Sha256HmacGenerateFinal(&_handle, digest);
+
+ /* unlock hw */
+ tsip_hw_unlock();
+
+ return ret;
+}
+
+/* generate Verify Data based on master secret */
+int tsip_generateVerifyData(const byte *ms, /* master secret */
+ const byte *side, const byte *handshake_hash,
+ byte *hashes /* out */)
+{
+    int ret;
+ uint32_t l_side = R_TSIP_TLS_GENERATE_CLIENT_VERIFY;
+
+ if ((ms == NULL) || (side == NULL) || (handshake_hash == NULL) ||
+ (hashes == NULL))
+ return BAD_FUNC_ARG;
+
+    if (XSTRNCMP((const char*)side, (const char*)tls_server,
+                 FINISHED_LABEL_SZ) == 0)
+    {
+ l_side = R_TSIP_TLS_GENERATE_SERVER_VERIFY;
+ }
+
+    if ((ret = tsip_hw_lock()) == 0) {
+        ret = R_TSIP_TlsGenerateVerifyData(l_side, (uint32_t*)ms,
+                       (uint8_t*)handshake_hash, hashes/* out */);
+        if (ret != TSIP_SUCCESS) {
+            WOLFSSL_MSG("R_TSIP_TlsGenerateVerifyData failed\n");
+        }
+        /* unlock hw */
+        tsip_hw_unlock();
+    }
+
+ return ret;
+}
+
+/* generate keys for TLS communication */
+int tsip_generateSeesionKey(struct WOLFSSL *ssl)
+{
+ int ret;
+ Ciphers *enc;
+ Ciphers *dec;
+ tsip_hmac_sha_key_index_t key_client_mac;
+ tsip_hmac_sha_key_index_t key_server_mac;
+ tsip_aes_key_index_t key_client_aes;
+ tsip_aes_key_index_t key_server_aes;
+
+    if (ssl == NULL)
+ return BAD_FUNC_ARG;
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ ret = R_TSIP_TlsGenerateSessionKey(
+ _tls2tsipdef(ssl->options.cipherSuite),
+ (uint32_t*)ssl->arrays->tsip_masterSecret,
+ (uint8_t*)ssl->arrays->clientRandom,
+ (uint8_t*)ssl->arrays->serverRandom, &key_client_mac,
+ &key_server_mac, &key_client_aes, &key_server_aes,
+ NULL, NULL);
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG("R_TSIP_TlsGenerateSessionKey failed\n");
+ } else {
+ /* succeeded creating session keys */
+ /* alloc aes instance for both enc and dec */
+ enc = &ssl->encrypt;
+ dec = &ssl->decrypt;
+
+ if (enc) {
+ if (enc->aes == NULL) {
+ enc->aes = (Aes*)XMALLOC(sizeof(Aes), ssl->heap,
+ DYNAMIC_TYPE_CIPHER);
+                    if (enc->aes == NULL) {
+                        tsip_hw_unlock(); /* do not hold the engine */
+                        return MEMORY_E;
+                    }
+ }
+
+ XMEMSET(enc->aes, 0, sizeof(Aes));
+ }
+ if (dec) {
+ if (dec->aes == NULL) {
+ dec->aes = (Aes*)XMALLOC(sizeof(Aes), ssl->heap,
+ DYNAMIC_TYPE_CIPHER);
+                    if (dec->aes == NULL) {
+                        if (enc) {
+                            XFREE(enc->aes, NULL, DYNAMIC_TYPE_CIPHER);
+                        }
+                        tsip_hw_unlock(); /* do not hold the engine */
+                        return MEMORY_E;
+                    }
+ }
+
+ XMEMSET(dec->aes, 0, sizeof(Aes));
+ }
+ /* copy key index into aes */
+            if (ssl->options.side == WOLFSSL_CLIENT_END) {
+ XMEMCPY(&enc->aes->ctx.tsip_keyIdx, &key_client_aes,
+ sizeof(key_client_aes));
+ XMEMCPY(&dec->aes->ctx.tsip_keyIdx, &key_server_aes,
+ sizeof(key_server_aes));
+ } else {
+ XMEMCPY(&enc->aes->ctx.tsip_keyIdx, &key_server_aes,
+ sizeof(key_server_aes));
+ XMEMCPY(&dec->aes->ctx.tsip_keyIdx, &key_client_aes,
+ sizeof(key_client_aes));
+ }
+            /* copy HMAC key indexes into keys */
+            XMEMCPY(ssl->keys.tsip_client_write_MAC_secret, key_client_mac.value,
+                    sizeof(key_client_mac.value));
+            XMEMCPY(ssl->keys.tsip_server_write_MAC_secret, key_server_mac.value,
+                    sizeof(key_server_mac.value));
+            /* set key size and mark as ready */
+ if (enc){
+ enc->aes->ctx.keySize = ssl->specs.key_size;
+ /* ready for use */
+ enc->setup = 1;
+ }
+            /* set key size and mark as ready */
+ if (dec) {
+ dec->aes->ctx.keySize = ssl->specs.key_size;
+ /* ready for use */
+ dec->setup = 1;
+ }
+ }
+ /* unlock hw */
+ tsip_hw_unlock();
+ } else
+ WOLFSSL_MSG("hw lock failed\n");
+
+ return ret;
+}
+/* generate master secret by TSIP */
+int tsip_generateMasterSecret(const byte *pr, /* pre-master */
+ const byte *cr, /* client random */
+ const byte *sr, /* server random */
+ byte *ms)
+{
+ int ret;
+
+ if ((pr == NULL) || (cr == NULL) || (sr == NULL) ||
+ (ms == NULL))
+ return BAD_FUNC_ARG;
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ ret = R_TSIP_TlsGenerateMasterSecret( (uint32_t*)pr,
+ (uint8_t*)cr, (uint8_t*)sr, (uint32_t*)ms);
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG("R_TSIP_TlsGenerateMasterSecret failed\n");
+ }
+ /* unlock hw */
+ tsip_hw_unlock();
+ } else {
+ WOLFSSL_MSG(" hw lock failed ");
+ }
+
+ return ret;
+}
+/* generate pre-master secret by TSIP */
+int tsip_generatePremasterSecret(byte *premaster, word32 preSz )
+{
+ int ret;
+
+ if (premaster == NULL)
+ return BAD_FUNC_ARG;
+
+    if (preSz < (R_TSIP_TLS_MASTER_SECRET_WORD_SIZE*4)) {
+        WOLFSSL_MSG(" preSz is smaller than 80");
+        return BAD_FUNC_ARG;
+    }
+
+    if ((ret = tsip_hw_lock()) == 0) {
+        /* generate pre-master, 80 bytes */
+        ret = R_TSIP_TlsGeneratePreMasterSecret((uint32_t*)premaster);
+        if (ret != TSIP_SUCCESS) {
+            WOLFSSL_MSG(" R_TSIP_TlsGeneratePreMasterSecret failed\n");
+        }
+        /* unlock hw */
+        tsip_hw_unlock();
+    } else {
+        WOLFSSL_MSG(" hw lock failed");
+    }
+
+ return ret;
+}
+/* generate encrypted pre-master secret by TSIP */
+int tsip_generateEncryptPreMasterSecret(WOLFSSL *ssl, byte *out, word32 *outSz)
+{
+ int ret;
+
+ if ((ssl == NULL) || (out == NULL) || (outSz == NULL))
+ return BAD_FUNC_ARG;
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ if (*outSz >= 256)
+ ret = R_TSIP_TlsEncryptPreMasterSecret(
+ (uint32_t*)ssl->peerTsipEncRsaKeyIndex,
+ (uint32_t*)&ssl->arrays->preMasterSecret[VERSION_SZ],
+ (uint8_t*)out);
+ else
+ ret = -1;
+
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG(" R_TSIP_TlsEncryptPreMasterSecret failed\n");
+ } else {
+            *outSz = 256; /* TSIP only handles RSA-2048 */
+ }
+ /* unlock hw */
+ tsip_hw_unlock();
+ } else {
+ WOLFSSL_MSG(" hw lock failed ");
+ }
+
+ return ret;
+}
+#endif /* NO_WOLFSSL_RENESAS_TSIP_TLS_SESSION */
+
+/* Certificate verification by TSIP */
+int tsip_tls_CertVerify(const byte *cert, word32 certSz,
+ const byte *signature, word32 sigSz,
+ word32 key_n_start, word32 key_n_len,
+ word32 key_e_start, word32 key_e_len,
+ byte *tsip_encRsaKeyIndex)
+{
+ int ret;
+
+ if (cert == NULL)
+ return BAD_FUNC_ARG;
+
+ if (!signature) {
+ WOLFSSL_MSG(" signature for ca verification is not set\n");
+ return -1;
+ }
+ if (!tsip_encRsaKeyIndex) {
+ WOLFSSL_MSG(" tsip_encRsaKeyIndex is NULL.\n");
+ return -1;
+ }
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ ret = R_TSIP_TlsCertificateVerification(
+ (uint32_t*)g_encrypted_publicCA_key,/* encrypted public key */
+ (uint8_t*)cert, /* certificate der */
+ certSz, /* length of der */
+ (uint8_t*)signature, /* sign data by RSA PSS */
+ key_n_start, /* start position of public key n in bytes */
+ (key_n_start + key_n_len), /* length of the public key n */
+ key_e_start, /* start pos, key e in bytes */
+ (key_e_start + key_e_len), /* length of the public key e */
+ (uint32_t*)tsip_encRsaKeyIndex /* returned encrypted key */
+ );
+
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG(" R_TSIP_TlsCertificateVerification() failed");
+ }
+ tsip_hw_unlock();
+ } else {
+ WOLFSSL_MSG(" hw lock failed ");
+ }
+
+ return ret;
+}
+/* Root Certificate verification */
+int tsip_tls_RootCertVerify(const byte *cert, word32 cert_len,
+ word32 key_n_start, word32 key_n_len,
+ word32 key_e_start, word32 key_e_len,
+ word32 cm_row)
+{
+ int ret;
+    /* signature over the CA cert, registered via tsip_inform_cert_sign() */
+ uint8_t *signature = (uint8_t*)ca_cert_sig;
+
+ if (cert == NULL)
+ return BAD_FUNC_ARG;
+
+ if (!signature) {
+ WOLFSSL_MSG(" signature for ca verification is not set\n");
+ return -1;
+ }
+
+ if ((ret = tsip_hw_lock()) == 0) {
+ ret = R_TSIP_TlsRootCertificateVerification(
+ /* CA cert */
+ (uint8_t*)cert,
+ /* length of CA cert */
+ (uint32_t)cert_len,
+ /* Byte position of public key */
+ key_n_start,
+ (key_n_start + key_n_len),
+ key_e_start,
+ (key_e_start + key_e_len),
+ /* signature by "RSA 2048 PSS with SHA256" */
+ (uint8_t*)ca_cert_sig,
+ /* RSA-2048 public key used by
+ RSA-2048 PSS with SHA256. 560 Bytes*/
+ g_encrypted_publicCA_key
+ );
+
+ if (ret != TSIP_SUCCESS) {
+ WOLFSSL_MSG(" R_TSIP_TlsRootCertVerify() failed");
+ } else {
+ g_CAscm_Idx = cm_row;
+ }
+
+ tsip_hw_unlock();
+ } else {
+ WOLFSSL_MSG(" hw lock failed ");
+ }
+
+ return ret;
+}
+#endif /* WOLFSSL_RENESAS_TSIP_TLS */
+
+#ifdef WOLFSSL_RENESAS_TSIP_CRYPT_DEBUG
+
+/* e_tsip_err error codes:
+ *   TSIP_SUCCESS = 0,
+ *   TSIP_ERR_SELF_CHECK1,        // Self-check 1 failed or TSIP internal error.
+ *   TSIP_ERR_RESOURCE_CONFLICT,  // A resource conflict occurred.
+ *   TSIP_ERR_SELF_CHECK2,        // Self-check 2 failed.
+ *   TSIP_ERR_KEY_SET,            // An invalid key was set.
+ *   TSIP_ERR_AUTHENTICATION,     // Authentication failed.
+ *   TSIP_ERR_CALLBACK_UNREGIST,  // Callback function is not registered.
+ *   TSIP_ERR_PARAMETER,          // Illegal input data.
+ *   TSIP_ERR_PROHIBIT_FUNCTION,  // An invalid function call occurred.
+ *   TSIP_RESUME_FIRMWARE_GENERATE_MAC
+ *                                // Continuation of R_TSIP_GenerateFirmwareMAC.
+ */
+
+static void hexdump(const uint8_t* in, uint32_t len)
+{
+ uint32_t i;
+
+ if (in == NULL)
+ return;
+
+ for (i = 0; i <= len;i++, in++){
+ printf("%02x:", *in);
+ if (((i+1)%16)==0){
+ printf("\n");
+ }
+ }
+ printf("\n");
+}
+
+byte *ret2err(word32 ret)
+{
+    switch (ret) {
+ case TSIP_SUCCESS: return "success";
+ case TSIP_ERR_SELF_CHECK1: return "selfcheck1";
+ case TSIP_ERR_RESOURCE_CONFLICT: return "rsconflict";
+ case TSIP_ERR_SELF_CHECK2: return "selfcheck2";
+ case TSIP_ERR_KEY_SET: return "keyset";
+ case TSIP_ERR_AUTHENTICATION: return "authentication";
+ case TSIP_ERR_CALLBACK_UNREGIST: return "callback unreg";
+ case TSIP_ERR_PARAMETER: return "badarg";
+ case TSIP_ERR_PROHIBIT_FUNCTION: return "prohibitfunc";
+ case TSIP_RESUME_FIRMWARE_GENERATE_MAC: return "conti-generate-mac";
+ default:return "unknown";
+ }
+}
+
+#endif /* WOLFSSL_RENESAS_TSIP_CRYPT_DEBUG */
+#endif /* WOLFSSL_RENESAS_TSIP */
diff --git a/wolfcrypt/src/port/af_alg/afalg_aes.c b/wolfcrypt/src/port/af_alg/afalg_aes.c
new file mode 100644
index 0000000..2d1d41a
--- /dev/null
+++ b/wolfcrypt/src/port/af_alg/afalg_aes.c
@@ -0,0 +1,900 @@
+/* afalg_aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#if !defined(NO_AES) && (defined(WOLFSSL_AFALG) || \
+ defined(WOLFSSL_AFALG_XILINX_AES))
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/port/af_alg/wc_afalg.h>
+
+#include <sys/uio.h> /* for readv */
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ #define WOLFSSL_XILINX_ALIGN sizeof(wolfssl_word)
+#endif
+
+static const char WC_TYPE_SYMKEY[] = "skcipher";
+
+static int wc_AesSetup(Aes* aes, const char* type, const char* name, int ivSz, int aadSz)
+{
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ byte* key = (byte*)aes->msgBuf;
+#else
+ byte* key = (byte*)aes->key;
+#endif
+
+ aes->rdFd = wc_Afalg_CreateRead(aes->alFd, type, name);
+ if (aes->rdFd < 0) {
+ WOLFSSL_MSG("Unable to accept and get AF_ALG read socket");
+ aes->rdFd = WC_SOCK_NOTSET;
+ return aes->rdFd;
+ }
+
+ if (setsockopt(aes->alFd, SOL_ALG, ALG_SET_KEY, key, aes->keylen) != 0) {
+ WOLFSSL_MSG("Unable to set AF_ALG key");
+ aes->rdFd = WC_SOCK_NOTSET;
+ return WC_AFALG_SOCK_E;
+ }
+ ForceZero(key, sizeof(aes->key));
+
+ /* set up CMSG headers */
+ XMEMSET((byte*)&(aes->msg), 0, sizeof(struct msghdr));
+
+ aes->msg.msg_control = key; /* use existing key buffer for
+ * control buffer */
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ aes->msg.msg_controllen = CMSG_SPACE(4) +
+ CMSG_SPACE(sizeof(struct af_alg_iv) + ivSz);
+ (void)aadSz;
+#else
+ aes->msg.msg_controllen = CMSG_SPACE(4);
+ if (aadSz > 0) {
+ aes->msg.msg_controllen += CMSG_SPACE(4);
+ }
+ if (ivSz > 0) {
+ aes->msg.msg_controllen += CMSG_SPACE((sizeof(struct af_alg_iv) + ivSz));
+ }
+#endif
+
+ if (wc_Afalg_SetOp(CMSG_FIRSTHDR(&(aes->msg)), aes->dir) < 0) {
+ WOLFSSL_MSG("Error with setting AF_ALG operation");
+ aes->rdFd = WC_SOCK_NOTSET;
+ return -1;
+ }
+
+ return 0;
+}
+
+
+#ifdef WOLFSSL_AFALG
+int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+{
+#if defined(AES_MAX_KEY_SIZE)
+ const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
+#endif
+
+ if (aes == NULL ||
+ !((keylen == 16) || (keylen == 24) || (keylen == 32))) {
+ return BAD_FUNC_ARG;
+ }
+
+#if defined(AES_MAX_KEY_SIZE)
+ /* Check key length */
+ if (keylen > max_key_len) {
+ return BAD_FUNC_ARG;
+ }
+#endif
+ aes->keylen = keylen;
+ aes->rounds = keylen/4 + 6;
+
+#ifdef WOLFSSL_AES_COUNTER
+ aes->left = 0;
+#endif
+
+ aes->rdFd = WC_SOCK_NOTSET;
+ aes->alFd = wc_Afalg_Socket();
+ if (aes->alFd < 0) {
+ WOLFSSL_MSG("Unable to open an AF_ALG socket");
+ return WC_AFALG_SOCK_E;
+ }
+
+ /* save key until type is known i.e. CBC, ECB, ... */
+ XMEMCPY((byte*)(aes->key), userKey, keylen);
+ aes->dir = dir;
+
+ return wc_AesSetIV(aes, iv);
+}
+#endif
+
+/* AES-CBC */
+#if defined(HAVE_AES_CBC) && defined(WOLFSSL_AFALG)
+ static const char WC_NAME_AESCBC[] = "cbc(aes)";
+
+ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ struct cmsghdr* cmsg;
+ struct iovec iov;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ if ((ret = wc_AesSetup(aes, WC_TYPE_SYMKEY, WC_NAME_AESCBC,
+ AES_IV_SIZE, 0)) != 0) {
+ WOLFSSL_MSG("Error with first time setup of AF_ALG socket");
+ return ret;
+ }
+ }
+
+ sz = sz - (sz % AES_BLOCK_SIZE);
+ if ((sz / AES_BLOCK_SIZE) > 0) {
+ /* update IV */
+ cmsg = CMSG_FIRSTHDR(&(aes->msg));
+ ret = wc_Afalg_SetIv(CMSG_NXTHDR(&(aes->msg), cmsg),
+ (byte*)(aes->reg), AES_IV_SIZE);
+ if (ret < 0) {
+ WOLFSSL_MSG("Error setting IV");
+ return ret;
+ }
+
+ /* set data to be encrypted */
+ iov.iov_base = (byte*)in;
+ iov.iov_len = sz;
+
+ aes->msg.msg_iov = &iov;
+ aes->msg.msg_iovlen = 1; /* # of iov structures */
+
+ ret = (int)sendmsg(aes->rdFd, &(aes->msg), 0);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = (int)read(aes->rdFd, out, sz);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* set IV for next CBC call */
+ XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ }
+
+ return 0;
+ }
+
+ #ifdef HAVE_AES_DECRYPT
+ int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ struct cmsghdr* cmsg;
+ struct iovec iov;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL
+ || sz % AES_BLOCK_SIZE != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ if ((ret = wc_AesSetup(aes, WC_TYPE_SYMKEY, WC_NAME_AESCBC,
+ AES_IV_SIZE, 0)) != 0) {
+ return ret;
+ }
+ }
+
+ if ((sz / AES_BLOCK_SIZE) > 0) {
+ /* update IV */
+ cmsg = CMSG_FIRSTHDR(&(aes->msg));
+ ret = wc_Afalg_SetIv(CMSG_NXTHDR(&(aes->msg), cmsg),
+ (byte*)(aes->reg), AES_IV_SIZE);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* set data to be decrypted */
+ iov.iov_base = (byte*)in;
+ iov.iov_len = sz;
+
+ aes->msg.msg_iov = &iov;
+ aes->msg.msg_iovlen = 1; /* # of iov structures */
+
+ /* set IV for next CBC call */
+ XMEMCPY(aes->reg, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+
+ ret = (int)sendmsg(aes->rdFd, &(aes->msg), 0);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = (int)read(aes->rdFd, out, sz);
+ if (ret < 0) {
+ return ret;
+ }
+
+ }
+
+ return 0;
+ }
+ #endif
+
+#endif /* HAVE_AES_CBC */
+
+
+/* AES-DIRECT */
+#if (defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AES_ECB)) && \
+ defined(WOLFSSL_AFALG)
+
+static const char WC_NAME_AESECB[] = "ecb(aes)";
+
+/* common code between ECB encrypt and decrypt
+ * returns 0 on success */
+static int wc_Afalg_AesDirect(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ struct iovec iov;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ if ((ret = wc_AesSetup(aes, WC_TYPE_SYMKEY, WC_NAME_AESECB,
+ 0, 0)) != 0) {
+ WOLFSSL_MSG("Error with first time setup of AF_ALG socket");
+ return ret;
+ }
+ }
+
+ /* set data to be encrypted */
+ iov.iov_base = (byte*)in;
+ iov.iov_len = sz;
+
+ aes->msg.msg_iov = &iov;
+ aes->msg.msg_iovlen = 1; /* # of iov structures */
+
+ ret = (int)sendmsg(aes->rdFd, &(aes->msg), 0);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = (int)read(aes->rdFd, out, sz);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+#endif
+
+
+#if defined(WOLFSSL_AES_DIRECT) && defined(WOLFSSL_AFALG)
+void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ if (wc_Afalg_AesDirect(aes, out, in, AES_BLOCK_SIZE) != 0) {
+ WOLFSSL_MSG("Error with AES encrypt direct call");
+ }
+}
+
+
+void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ if (wc_Afalg_AesDirect(aes, out, in, AES_BLOCK_SIZE) != 0) {
+ WOLFSSL_MSG("Error with AES decrypt direct call");
+ }
+}
+
+
+int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+{
+ return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+}
+#endif
+
+
+/* AES-CTR */
+#if defined(WOLFSSL_AES_COUNTER) && defined(WOLFSSL_AFALG)
+ static const char WC_NAME_AESCTR[] = "ctr(aes)";
+
+ /* Increment AES counter */
+ static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+ {
+ /* in network byte order so start at end and work back */
+ int i;
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+ }
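+
+    /* Worked example: a counter ending in ... 0x12 0xFF 0xFF becomes
+     * ... 0x13 0x00 0x00 after one call -- the last byte wraps and the
+     * carry ripples left, as expected for a big-endian counter. */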
+
+ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ struct cmsghdr* cmsg;
+ struct iovec iov[2];
+ int ret;
+ byte* tmp;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* consume any unused bytes left in aes->tmp */
+ tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+ while (aes->left && sz) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ sz--;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ if ((ret = wc_AesSetup(aes, WC_TYPE_SYMKEY, WC_NAME_AESCTR,
+ AES_IV_SIZE, 0)) != 0) {
+ WOLFSSL_MSG("Error with first time setup of AF_ALG socket");
+ return ret;
+ }
+ }
+
+ if (sz > 0) {
+ aes->left = sz % AES_BLOCK_SIZE;
+
+ /* clear previously leftover data */
+ tmp = (byte*)aes->tmp;
+ XMEMSET(tmp, 0, AES_BLOCK_SIZE);
+
+ /* update IV */
+ cmsg = CMSG_FIRSTHDR(&(aes->msg));
+ ret = wc_Afalg_SetIv(CMSG_NXTHDR(&(aes->msg), cmsg),
+ (byte*)(aes->reg), AES_IV_SIZE);
+ if (ret < 0) {
+ WOLFSSL_MSG("Error setting IV");
+ return ret;
+ }
+
+ /* set data to be encrypted */
+ iov[0].iov_base = (byte*)in;
+ iov[0].iov_len = sz - aes->left;
+
+ iov[1].iov_base = tmp;
+ if (aes->left > 0) {
+ XMEMCPY(tmp, in + sz - aes->left, aes->left);
+ iov[1].iov_len = AES_BLOCK_SIZE;
+ }
+ else {
+ iov[1].iov_len = 0;
+ }
+
+ aes->msg.msg_iov = iov;
+ aes->msg.msg_iovlen = 2; /* # of iov structures */
+
+ ret = (int)sendmsg(aes->rdFd, &(aes->msg), 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+
+ /* set buffers to hold result and left over stream */
+ iov[0].iov_base = (byte*)out;
+ iov[0].iov_len = sz - aes->left;
+
+ iov[1].iov_base = tmp;
+ if (aes->left > 0) {
+ iov[1].iov_len = AES_BLOCK_SIZE;
+ }
+ else {
+ iov[1].iov_len = 0;
+ }
+
+ ret = (int)readv(aes->rdFd, iov, 2);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (aes->left > 0) {
+ XMEMCPY(out + sz - aes->left, tmp, aes->left);
+ aes->left = AES_BLOCK_SIZE - aes->left;
+ }
+ }
+
+ /* adjust counter after call to hardware */
+ while (sz >= AES_BLOCK_SIZE) {
+ IncrementAesCounter((byte*)aes->reg);
+ sz -= AES_BLOCK_SIZE;
+ }
+
+ if (aes->left > 0) {
+ IncrementAesCounter((byte*)aes->reg);
+ }
+
+ return 0;
+ }
+#endif /* WOLFSSL_AES_COUNTER */
+
+
+#ifdef HAVE_AESGCM
+
+
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ static const char WC_NAME_AESGCM[] = "xilinx-zynqmp-aes";
+ static const char* WC_TYPE_AEAD = WC_TYPE_SYMKEY;
+#else
+ static const char WC_NAME_AESGCM[] = "gcm(aes)";
+ static const char WC_TYPE_AEAD[] = "aead";
+#endif
+
+#ifndef WC_SYSTEM_AESGCM_IV
+/* size of IV allowed on system for AES-GCM */
+#define WC_SYSTEM_AESGCM_IV 12
+#endif
+
+#ifndef WOLFSSL_MAX_AUTH_TAG_SZ
+/* size of tag is restricted by system for AES-GCM
+ * check 'cat /proc/crypto' to see restricted size */
+#define WOLFSSL_MAX_AUTH_TAG_SZ 16
+#endif
+
+#ifdef WOLFSSL_AFALG_XILINX_AES
+/* Xilinx uses a slightly different function because the default AES key is also
+ * needed if handling additional data with creating/validating the TAG.
+ *
+ * returns 0 on success
+ */
+int wc_AesGcmSetKey_ex(Aes* aes, const byte* key, word32 len, word32 kup)
+#else
+int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
+#endif
+{
+#if defined(AES_MAX_KEY_SIZE)
+ const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
+#endif
+
+ if (aes == NULL ||
+ !((len == 16) || (len == 24) || (len == 32))) {
+ return BAD_FUNC_ARG;
+ }
+
+#if defined(AES_MAX_KEY_SIZE)
+ /* Check key length */
+ if (len > max_key_len) {
+ return BAD_FUNC_ARG;
+ }
+#endif
+ aes->keylen = len;
+ aes->rounds = len/4 + 6;
+
+ aes->rdFd = WC_SOCK_NOTSET;
+ aes->alFd = wc_Afalg_Socket();
+ if (aes->alFd < 0) {
+ WOLFSSL_MSG("Unable to open an AF_ALG socket");
+ return WC_AFALG_SOCK_E;
+ }
+
+ /* save key until direction is known i.e. encrypt or decrypt */
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ (void)kup; /* using alternate buffer because software key is needed */
+ XMEMCPY((byte*)(aes->msgBuf), key, len);
+#else
+ XMEMCPY((byte*)(aes->key), key, len);
+#endif
+
+ return 0;
+}
+
+
+
+/* Performs AES-GCM encryption and returns 0 on success
+ *
+ * Warning: If using Xilinx hardware acceleration it is assumed that the out
+ *          buffer is large enough to hold both the cipher text and the tag,
+ *          that is, sz + 16 bytes. The input and output buffers are expected
+ *          to be 64-bit aligned.
+ *
+ */
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ struct cmsghdr* cmsg;
+ struct iovec iov[3];
+ int ret;
+ struct msghdr* msg;
+ byte scratch[AES_BLOCK_SIZE];
+
+ /* argument checks */
+ if (aes == NULL || authTagSz > AES_BLOCK_SIZE) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (ivSz != WC_SYSTEM_AESGCM_IV || authTagSz > WOLFSSL_MAX_AUTH_TAG_SZ) {
+ WOLFSSL_MSG("IV/AAD size not supported on system");
+ return BAD_FUNC_ARG;
+ }
+
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
+ WOLFSSL_MSG("GcmEncrypt authTagSz too small error");
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ aes->dir = AES_ENCRYPTION;
+ if ((ret = wc_AesSetup(aes, WC_TYPE_AEAD, WC_NAME_AESGCM, ivSz,
+ authInSz)) != 0) {
+ WOLFSSL_MSG("Error with first time setup of AF_ALG socket");
+ return ret;
+ }
+
+ /* note that if the ivSz was to change, the msg_controllen would need
+ reset */
+
+#ifndef WOLFSSL_AFALG_XILINX_AES
+ /* set auth tag
+ * @TODO case where tag size changes between calls? */
+ ret = setsockopt(aes->alFd, SOL_ALG, ALG_SET_AEAD_AUTHSIZE, NULL,
+ authTagSz);
+ if (ret != 0) {
+ perror("set tag");
+ WOLFSSL_MSG("Unable to set AF_ALG tag size ");
+ return WC_AFALG_SOCK_E;
+ }
+#endif
+ }
+
+
+ msg = &(aes->msg);
+ cmsg = CMSG_FIRSTHDR(msg);
+ cmsg = CMSG_NXTHDR(msg, cmsg);
+
+ /* set IV and AAD size */
+ ret = wc_Afalg_SetIv(cmsg, (byte*)iv, ivSz);
+ if (ret < 0) {
+ WOLFSSL_MSG("Error setting IV");
+ return ret;
+
+ }
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ if (sz > 0) {
+ #ifndef NO_WOLFSSL_ALLOC_ALIGN
+ byte* tmp = NULL;
+ #endif
+ if ((wolfssl_word)in % WOLFSSL_XILINX_ALIGN) {
+ #ifndef NO_WOLFSSL_ALLOC_ALIGN
+ byte* tmp_align;
+ tmp = (byte*)XMALLOC(sz + WOLFSSL_XILINX_ALIGN +
+ AES_BLOCK_SIZE, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ if (tmp == NULL) {
+ return MEMORY_E;
+ }
+ tmp_align = tmp + (WOLFSSL_XILINX_ALIGN -
+ ((size_t)tmp % WOLFSSL_XILINX_ALIGN));
+ XMEMCPY(tmp_align, in, sz);
+ iov[0].iov_base = tmp_align;
+ #else
+ WOLFSSL_MSG("Buffer expected to be word aligned");
+ return BAD_ALIGN_E;
+ #endif
+ }
+ else {
+ iov[0].iov_base = (byte*)in;
+ }
+ iov[0].iov_len = sz + AES_BLOCK_SIZE;
+
+ msg->msg_iov = iov;
+ msg->msg_iovlen = 1; /* # of iov structures */
+
+ ret = (int)sendmsg(aes->rdFd, msg, 0);
+ #ifndef NO_WOLFSSL_ALLOC_ALIGN
+ XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ #endif
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = read(aes->rdFd, out, sz + AES_BLOCK_SIZE);
+ if (ret < 0) {
+ return ret;
+ }
+ XMEMCPY(authTag, out + sz, authTagSz);
+ }
+
+ /* handle completing tag with using software if additional data added */
+ if (authIn != NULL && authInSz > 0) {
+ byte initalCounter[AES_BLOCK_SIZE];
+ XMEMSET(initalCounter, 0, AES_BLOCK_SIZE);
+ XMEMCPY(initalCounter, iv, ivSz);
+ initalCounter[AES_BLOCK_SIZE - 1] = 1;
+ GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+ wc_AesEncryptDirect(aes, scratch, initalCounter);
+ xorbuf(authTag, scratch, authTagSz);
+ }
+#else
+ if (authInSz > 0) {
+ cmsg = CMSG_NXTHDR(msg, cmsg);
+ ret = wc_Afalg_SetAad(cmsg, authInSz);
+ if (ret < 0) {
+ WOLFSSL_MSG("Unable to set AAD size");
+ return ret;
+ }
+ }
+
+ /* set data to be encrypted*/
+ iov[0].iov_base = (byte*)authIn;
+ iov[0].iov_len = authInSz;
+
+ iov[1].iov_base = (byte*)in;
+ iov[1].iov_len = sz;
+
+ msg->msg_iov = iov;
+ msg->msg_iovlen = 2; /* # of iov structures */
+
+ ret = (int)sendmsg(aes->rdFd, msg, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ {
+ byte* tmp = (byte*)XMALLOC(authInSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ if (tmp == NULL) {
+ return MEMORY_E;
+ }
+ /* first 16 bytes was all 0's */
+ iov[0].iov_base = tmp;
+ (void)scratch;
+ iov[0].iov_len = authInSz;
+
+ iov[1].iov_base = out;
+ iov[1].iov_len = sz;
+
+ iov[2].iov_base = authTag;
+ iov[2].iov_len = authTagSz;
+
+ ret = (int)readv(aes->rdFd, iov, 3);
+ XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ }
+ if (ret < 0) {
+ return ret;
+ }
+#endif
+
+
+ return 0;
+}
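+
+/* Usage sketch (illustrative): the kernel transform only accepts a 12-byte
+ * IV (WC_SYSTEM_AESGCM_IV); other IV sizes return BAD_FUNC_ARG instead of
+ * falling back to software. The buffers here are assumptions for the example:
+ *
+ *     byte iv[WC_SYSTEM_AESGCM_IV];    // exactly 12 bytes
+ *     byte tag[16];
+ *     wc_AesGcmSetKey(&aes, key, 16);
+ *     ret = wc_AesGcmEncrypt(&aes, cipher, plain, plainSz,
+ *                            iv, sizeof(iv), tag, sizeof(tag),
+ *                            aad, aadSz);
+ */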
+
+#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AESGCM_DECRYPT)
+/* Performs AES-GCM decryption and returns 0 on success
+ *
+ * Warning: If using Xilinx hardware acceleration it is assumed that the in
+ *          buffer is large enough to hold both the cipher text and the tag,
+ *          that is, sz + 16 bytes
+ */
+int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ struct cmsghdr* cmsg;
+ struct msghdr* msg;
+ struct iovec iov[3];
+ byte scratch[AES_BLOCK_SIZE];
+ int ret;
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ byte* tag = (byte*)authTag;
+ byte buf[AES_BLOCK_SIZE];
+ byte initalCounter[AES_BLOCK_SIZE];
+#ifndef NO_WOLFSSL_ALLOC_ALIGN
+ byte* tmp = NULL;
+#endif
+#endif
+
+ /* argument checks */
+ if (aes == NULL || authTagSz > AES_BLOCK_SIZE) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (ivSz != WC_SYSTEM_AESGCM_IV || authTagSz > WOLFSSL_MAX_AUTH_TAG_SZ) {
+ WOLFSSL_MSG("IV/AAD size not supported on system");
+ return BAD_FUNC_ARG;
+ }
+
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
+ WOLFSSL_MSG("GcmEncrypt authTagSz too small error");
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->rdFd == WC_SOCK_NOTSET) {
+ aes->dir = AES_DECRYPTION;
+ if ((ret = wc_AesSetup(aes, WC_TYPE_AEAD, WC_NAME_AESGCM, ivSz,
+ authInSz)) != 0) {
+ WOLFSSL_MSG("Error with first time setup of AF_ALG socket");
+ return ret;
+ }
+
+#ifndef WOLFSSL_AFALG_XILINX_AES
+ /* set auth tag
+ * @TODO case where tag size changes between calls? */
+ ret = setsockopt(aes->alFd, SOL_ALG, ALG_SET_AEAD_AUTHSIZE, NULL,
+ authTagSz);
+ if (ret != 0) {
+ WOLFSSL_MSG("Unable to set AF_ALG tag size ");
+ return WC_AFALG_SOCK_E;
+ }
+#endif
+ }
+
+ /* set IV and AAD size */
+ msg = &aes->msg;
+ if ((cmsg = CMSG_FIRSTHDR(msg)) == NULL) {
+ return WC_AFALG_SOCK_E;
+ }
+ if (wc_Afalg_SetOp(cmsg, aes->dir) < 0) {
+ WOLFSSL_MSG("Error with setting AF_ALG operation");
+ return WC_AFALG_SOCK_E;
+ }
+ if ((cmsg = CMSG_NXTHDR(msg, cmsg)) == NULL) {
+ return WC_AFALG_SOCK_E;
+ }
+ ret = wc_Afalg_SetIv(cmsg, (byte*)iv, ivSz);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef WOLFSSL_AFALG_XILINX_AES
+ /* check for and handle additional data */
+ if (authIn != NULL && authInSz > 0) {
+
+ XMEMSET(initalCounter, 0, AES_BLOCK_SIZE);
+ XMEMCPY(initalCounter, iv, ivSz);
+ initalCounter[AES_BLOCK_SIZE - 1] = 1;
+ tag = buf;
+ GHASH(aes, NULL, 0, in, sz, tag, AES_BLOCK_SIZE);
+ wc_AesEncryptDirect(aes, scratch, initalCounter);
+ xorbuf(tag, scratch, AES_BLOCK_SIZE);
+ }
+
+ /* it is assumed that in buffer size is large enough to hold TAG */
+ XMEMCPY((byte*)in + sz, tag, AES_BLOCK_SIZE);
+ if ((wolfssl_word)in % WOLFSSL_XILINX_ALIGN) {
+ #ifndef NO_WOLFSSL_ALLOC_ALIGN
+ byte* tmp_align;
+ tmp = (byte*)XMALLOC(sz + WOLFSSL_XILINX_ALIGN +
+ AES_BLOCK_SIZE, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ if (tmp == NULL) {
+ return MEMORY_E;
+ }
+ tmp_align = tmp + (WOLFSSL_XILINX_ALIGN -
+ ((size_t)tmp % WOLFSSL_XILINX_ALIGN));
+ XMEMCPY(tmp_align, in, sz + AES_BLOCK_SIZE);
+ iov[0].iov_base = tmp_align;
+ #else
+ WOLFSSL_MSG("Buffer expected to be word aligned");
+ return BAD_ALIGN_E;
+ #endif
+ }
+ else {
+ iov[0].iov_base = (byte*)in;
+ }
+ iov[0].iov_len = sz + AES_BLOCK_SIZE;
+
+ msg->msg_iov = iov;
+ msg->msg_iovlen = 1;
+
+ ret = sendmsg(aes->rdFd, msg, 0);
+#ifndef NO_WOLFSSL_ALLOC_ALIGN
+ XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = read(aes->rdFd, out, sz + AES_BLOCK_SIZE);
+ if (ret < 0) {
+ return AES_GCM_AUTH_E;
+ }
+
+ /* check on tag */
+ if (authIn != NULL && authInSz > 0) {
+ GHASH(aes, authIn, authInSz, in, sz, tag, AES_BLOCK_SIZE);
+ wc_AesEncryptDirect(aes, scratch, initalCounter);
+ xorbuf(tag, scratch, AES_BLOCK_SIZE);
+ if (ConstantCompare(tag, authTag, authTagSz) != 0) {
+ return AES_GCM_AUTH_E;
+ }
+ }
+
+#else
+ if (authInSz > 0) {
+ cmsg = CMSG_NXTHDR(msg, cmsg);
+ ret = wc_Afalg_SetAad(cmsg, authInSz);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* set data to be decrypted*/
+ iov[0].iov_base = (byte*)authIn;
+ iov[0].iov_len = authInSz;
+ iov[1].iov_base = (byte*)in;
+ iov[1].iov_len = sz;
+ iov[2].iov_base = (byte*)authTag;
+ iov[2].iov_len = authTagSz;
+
+ msg->msg_iov = iov;
+ msg->msg_iovlen = 3; /* # of iov structures */
+ ret = (int)sendmsg(aes->rdFd, &(aes->msg), 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ {
+ byte* tmp = (byte*)XMALLOC(authInSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ if (tmp == NULL) {
+ return MEMORY_E;
+ }
+ iov[0].iov_base = tmp;
+ iov[0].iov_len = authInSz;
+ iov[1].iov_base = out;
+ iov[1].iov_len = sz;
+ ret = (int)readv(aes->rdFd, iov, 2);
+ XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ }
+ if (ret < 0) {
+ return AES_GCM_AUTH_E;
+ }
+ (void)scratch;
+#endif
+
+ return 0;
+}
+#endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+
+#ifdef HAVE_AES_ECB
+int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return wc_Afalg_AesDirect(aes, out, in, sz);
+}
+
+
+int wc_AesEcbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return wc_Afalg_AesDirect(aes, out, in, sz);
+}
+#endif /* HAVE_AES_ECB */
+#endif /* !NO_AES && WOLFSSL_AFALG */
+
diff --git a/wolfcrypt/src/port/af_alg/afalg_hash.c b/wolfcrypt/src/port/af_alg/afalg_hash.c
new file mode 100644
index 0000000..41e57bc
--- /dev/null
+++ b/wolfcrypt/src/port/af_alg/afalg_hash.c
@@ -0,0 +1,339 @@
+/* afalg_hash.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_AFALG_HASH) || (defined(WOLFSSL_AFALG_XILINX_SHA3) \
+ && defined(WOLFSSL_SHA3))
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/port/af_alg/wc_afalg.h>
+#include <wolfssl/wolfcrypt/port/af_alg/afalg_hash.h>
+
+static const char WC_TYPE_HASH[] = "hash";
+
+
+/* generic AF_ALG hash free */
+static void AfalgHashFree(wolfssl_AFALG_Hash* hash)
+{
+ if (hash == NULL)
+ return;
+
+ if (hash->alFd > 0) {
+ close(hash->alFd);
+ hash->alFd = -1; /* avoid possible double close on socket */
+ }
+ if (hash->rdFd > 0) {
+ close(hash->rdFd);
+ hash->rdFd = -1; /* avoid possible double close on socket */
+ }
+
+ #if defined(WOLFSSL_AFALG_HASH_KEEP)
+ if (hash->msg != NULL) {
+ XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ hash->msg = NULL;
+ }
+ #endif
+}
+
+
+/* generic hash init for AF_ALG, returns 0 on success */
+static int AfalgHashInit(wolfssl_AFALG_Hash* hash, void* heap, int devId,
+ const char* type)
+{
+ if (hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ (void)devId; /* no async for now */
+ XMEMSET(hash, 0, sizeof(wolfssl_AFALG_Hash));
+ hash->heap = heap;
+
+ hash->len = 0;
+ hash->used = 0;
+ hash->msg = NULL;
+ hash->alFd = -1;
+ hash->rdFd = -1;
+
+ hash->alFd = wc_Afalg_Socket();
+ if (hash->alFd < 0) {
+ return WC_AFALG_SOCK_E;
+ }
+
+ hash->rdFd = wc_Afalg_CreateRead(hash->alFd, WC_TYPE_HASH, type);
+ if (hash->rdFd < 0) {
+ close(hash->alFd);
+ return WC_AFALG_SOCK_E;
+ }
+
+ return 0;
+
+}
+
+
+/* generic hash update for AF_ALG, returns 0 on success */
+static int AfalgHashUpdate(wolfssl_AFALG_Hash* hash, const byte* in, word32 sz)
+{
+ if (hash == NULL || (sz > 0 && in == NULL)) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+ /* keep full message to hash at end instead of incremental updates */
+ if (hash->len < hash->used + sz) {
+ if (hash->msg == NULL) {
+ hash->msg = (byte*)XMALLOC(hash->used + sz, hash->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ } else {
+ byte* pt = (byte*)XREALLOC(hash->msg, hash->used + sz, hash->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ if (pt == NULL) {
+ return MEMORY_E;
+ }
+ hash->msg = pt;
+ }
+ if (hash->msg == NULL) {
+ return MEMORY_E;
+ }
+ hash->len = hash->used + sz;
+ }
+ XMEMCPY(hash->msg + hash->used, in, sz);
+ hash->used += sz;
+#else
+ int ret;
+
+ if ((ret = (int)send(hash->rdFd, in, sz, MSG_MORE)) < 0) {
+ return ret;
+ }
+#endif
+ return 0;
+}
+
+
+/* generic hash final for AF_ALG, return 0 on success */
+static int AfalgHashFinal(wolfssl_AFALG_Hash* hash, byte* out, word32 outSz,
+ const char* type)
+{
+ int ret;
+ void* heap;
+
+ if (hash == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ heap = hash->heap; /* keep because AfalgHashInit clears the pointer */
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+ /* keep full message to out at end instead of incremental updates */
+ if ((ret = (int)send(hash->rdFd, hash->msg, hash->used, 0)) < 0) {
+ return ret;
+ }
+ XFREE(hash->msg, heap, DYNAMIC_TYPE_TMP_BUFFER);
+ hash->msg = NULL;
+#else
+ if ((ret = (int)send(hash->rdFd, NULL, 0, 0)) < 0) {
+ return ret;
+ }
+#endif
+
+ if ((ret = (int)read(hash->rdFd, out, outSz)) != (int)outSz) {
+ return ret;
+ }
+
+ AfalgHashFree(hash);
+ return AfalgHashInit(hash, heap, 0, type);
+}
+
+
+/* generic function to get intermediate hash */
+static int AfalgHashGet(wolfssl_AFALG_Hash* hash, byte* out, word32 outSz)
+{
+ int ret;
+
+ if (hash == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ (void)ret;
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+ if ((ret = (int)send(hash->rdFd, hash->msg, hash->used, 0)) < 0) {
+ return ret;
+ }
+
+ if ((ret = (int)read(hash->rdFd, out, outSz)) != (int)outSz) {
+ return ret;
+ }
+ return 0;
+#else
+ (void)hash;
+ (void)out;
+ (void)outSz;
+
+ WOLFSSL_MSG("Compile with WOLFSSL_AFALG_HASH_KEEP for this feature");
+ return NOT_COMPILED_IN;
+#endif
+}
+
+
+/* generic struct copy for AF_ALG, returns 0 on success */
+static int AfalgHashCopy(wolfssl_AFALG_Hash* src, wolfssl_AFALG_Hash* dst)
+{
+ if (src == NULL || dst == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY(dst, src, sizeof(wolfssl_AFALG_Hash));
+
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+    /* guard against copying an empty message buffer */
+    if (src->len > 0 && src->msg != NULL) {
+        dst->msg = (byte*)XMALLOC(src->len, dst->heap, DYNAMIC_TYPE_TMP_BUFFER);
+        if (dst->msg == NULL) {
+            return MEMORY_E;
+        }
+        XMEMCPY(dst->msg, src->msg, src->len);
+    }
+#endif
+
+ dst->rdFd = accept(src->rdFd, NULL, 0);
+ dst->alFd = accept(src->alFd, NULL, 0);
+
+ if (dst->rdFd == -1 || dst->alFd == -1) {
+ AfalgHashFree(dst);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+#if !defined(NO_SHA256) && defined(WOLFSSL_AFALG_HASH)
+#include <wolfssl/wolfcrypt/sha256.h>
+
+static const char WC_NAME_SHA256[] = "sha256";
+
+
+/* create AF_ALG sockets for SHA256 operation */
+int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
+{
+ return AfalgHashInit(sha, heap, devId, WC_NAME_SHA256);
+}
+
+
+int wc_Sha256Update(wc_Sha256* sha, const byte* in, word32 sz)
+{
+ return AfalgHashUpdate(sha, in, sz);
+}
+
+
+int wc_Sha256Final(wc_Sha256* sha, byte* hash)
+{
+ return AfalgHashFinal(sha, hash, WC_SHA256_DIGEST_SIZE, WC_NAME_SHA256);
+}
+
+
+int wc_Sha256GetHash(wc_Sha256* sha, byte* hash)
+{
+ return AfalgHashGet(sha, hash, WC_SHA256_DIGEST_SIZE);
+}
+
+
+int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
+{
+ return AfalgHashCopy(src, dst);
+}
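+
+/* Usage sketch (illustrative): the wrappers keep the standard wolfCrypt
+ * SHA-256 call sequence; only the backend (a kernel AF_ALG "hash" socket)
+ * differs:
+ *
+ *     wc_Sha256 sha;
+ *     byte digest[WC_SHA256_DIGEST_SIZE];
+ *     if (wc_InitSha256_ex(&sha, NULL, INVALID_DEVID) == 0) {
+ *         wc_Sha256Update(&sha, data, dataSz);
+ *         wc_Sha256Final(&sha, digest);  // also re-inits for the next message
+ *     }
+ */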
+#endif /* !NO_SHA256 */
+
+
+
+#if defined(WOLFSSL_SHA3) && defined(WOLFSSL_AFALG_XILINX_SHA3)
+#include <wolfssl/wolfcrypt/sha3.h>
+
+static const char WC_NAME_SHA3[] = "xilinx-keccak-384";
+
+void wc_Sha3_384_Free(wc_Sha3* sha)
+{
+ AfalgHashFree(sha);
+}
+
+
+/* create AF_ALG sockets for SHA3-384 operation */
+int wc_InitSha3_384(wc_Sha3* sha, void* heap, int devId)
+{
+ return AfalgHashInit(sha, heap, devId, WC_NAME_SHA3);
+}
+
+
+int wc_Sha3_384_Update(wc_Sha3* sha, const byte* in, word32 sz)
+{
+#ifndef WOLFSSL_AFALG_HASH_KEEP
+ if (sz % 4) {
+ WOLFSSL_MSG("Alignment issue. Message size needs to be divisible by 4")
+ return BAD_FUNC_ARG;
+ }
+#endif
+
+ return AfalgHashUpdate(sha, in, sz);
+}
+
+
+int wc_Sha3_384_Final(wc_Sha3* sha, byte* hash)
+{
+ if (sha == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+ if (sha->used % 4) {
+ WOLFSSL_MSG("Alignment issue. Message size needs to be divisible by 4");
+ return BAD_FUNC_ARG;
+ }
+#endif
+
+ return AfalgHashFinal(sha, hash, WC_SHA3_384_DIGEST_SIZE, WC_NAME_SHA3);
+}
+
+
+int wc_Sha3_384_GetHash(wc_Sha3* sha, byte* hash)
+{
+ if (sha == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef WOLFSSL_AFALG_HASH_KEEP
+ if (sha->used % 4) {
+ WOLFSSL_MSG("Alignment issue. Message size needs to be divisible by 4");
+ return BAD_FUNC_ARG;
+ }
+#endif
+
+ return AfalgHashGet(sha, hash, WC_SHA3_384_DIGEST_SIZE);
+}
+
+int wc_Sha3_384_Copy(wc_Sha3* src, wc_Sha3* dst)
+{
+ return AfalgHashCopy(src, dst);
+}
+#endif /* WOLFSSL_SHA3 && WOLFSSL_AFALG_XILINX_SHA3 */
+
+#endif /* WOLFSSL_AFALG_HASH || (WOLFSSL_AFALG_XILINX_SHA3 && WOLFSSL_SHA3) */
diff --git a/wolfcrypt/src/port/af_alg/wc_afalg.c b/wolfcrypt/src/port/af_alg/wc_afalg.c
new file mode 100644
index 0000000..0a91b51
--- /dev/null
+++ b/wolfcrypt/src/port/af_alg/wc_afalg.c
@@ -0,0 +1,141 @@
+/* wc_afalg.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+
+#if defined(WOLFSSL_AFALG) || defined(WOLFSSL_AFALG_XILINX)
+
+#include <wolfssl/wolfcrypt/port/af_alg/wc_afalg.h>
+#include <linux/if_alg.h>
+
+
+/* Fills out the AF_ALG socket address with the transform type and name */
+void wc_Afalg_SockAddr(struct sockaddr_alg* in, const char* type, const char* name)
+{
+ in->salg_family = AF_ALG;
+ XSTRNCPY((char*)in->salg_type, type, XSTRLEN(type));
+ in->salg_type[XSTRLEN(type)] = '\0';
+ XSTRNCPY((char*)in->salg_name, name, XSTRLEN(name));
+ in->salg_name[XSTRLEN(name)] = '\0';
+}
+
+
+/* returns the accepted socket on success
+ * a negative value is returned on failure */
+int wc_Afalg_Accept(struct sockaddr_alg* in, int inSz, int sock)
+{
+ if (bind(sock, (const struct sockaddr*)in, inSz) < 0) {
+ WOLFSSL_MSG("Failed to bind with AF_ALG");
+ return WC_AFALG_SOCK_E;
+ }
+
+ return accept(sock, NULL, 0);
+}
+
+
+/* creates a new AF_ALG socket and returns it
+ * a negative value is returned on failure */
+int wc_Afalg_Socket(void)
+{
+ int sock;
+
+ if ((sock = socket(AF_ALG, SOCK_SEQPACKET, 0)) < 0) {
+ WOLFSSL_MSG("Failed to get AF_ALG socket");
+ return WC_AFALG_SOCK_E;
+ }
+
+ return sock;
+}
+
+
+/* binds the socket and returns the accepted read fd */
+int wc_Afalg_CreateRead(int sock, const char* type, const char* name)
+{
+ struct sockaddr_alg sa = {0};
+ wc_Afalg_SockAddr(&sa, type, name);
+ return wc_Afalg_Accept(&sa, sizeof(sa), sock);
+}
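+
+/* Illustrative sketch of how the helpers above chain together for a one-shot
+ * AF_ALG hash (error handling omitted; "sha256" is assumed to be listed by
+ * the kernel in /proc/crypto, data/dataSz and digest are caller-supplied):
+ *
+ *     int sock = wc_Afalg_Socket();
+ *     int rdFd = wc_Afalg_CreateRead(sock, "hash", "sha256");
+ *     send(rdFd, data, dataSz, 0);
+ *     read(rdFd, digest, WC_SHA256_DIGEST_SIZE);
+ *     close(rdFd);
+ *     close(sock);
+ */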
+
+
+/* sets the IV in CMSG structure, returns 0 on success */
+int wc_Afalg_SetIv(struct cmsghdr* cmsg, byte* iv, word32 ivSz)
+{
+ struct af_alg_iv* afIv;
+
+ if (cmsg == NULL || iv == NULL) {
+ WOLFSSL_MSG("Null cmsg or iv passed in");
+ return BAD_FUNC_ARG;
+ }
+
+ cmsg->cmsg_level = SOL_ALG;
+ cmsg->cmsg_type = ALG_SET_IV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct af_alg_iv) + ivSz);
+ afIv = (void*)CMSG_DATA(cmsg);
+ afIv->ivlen = ivSz;
+ XMEMCPY(afIv->iv, iv, ivSz);
+
+ return 0;
+}
+
+
+/* sets the AAD size in CMSG structure, returns 0 on success */
+int wc_Afalg_SetAad(struct cmsghdr* cmsg, word32 sz)
+{
+ if (cmsg == NULL) {
+ WOLFSSL_MSG("Null cmsg passed in");
+ return BAD_FUNC_ARG;
+ }
+
+ cmsg->cmsg_level = SOL_ALG;
+ cmsg->cmsg_type = ALG_SET_AEAD_ASSOCLEN;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(word32));
+ *((word32*)CMSG_DATA(cmsg)) = sz;
+
+ return 0;
+}
+
+
+/* sets the operation type in CMSG structure, returns 0 on success
+ *
+ * dir: 0 for encryption, 1 for decryption
+ */
+int wc_Afalg_SetOp(struct cmsghdr* cmsg, int dir)
+{
+ if (cmsg == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ cmsg->cmsg_level = SOL_ALG;
+ cmsg->cmsg_type = ALG_SET_OP;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(word32));
+ *((word32*)CMSG_DATA(cmsg)) = (dir == 1)? ALG_OP_DECRYPT : ALG_OP_ENCRYPT;
+
+ return 0;
+}
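+
+/* Illustrative sketch (assumed sizes, error handling omitted; iv and opFd,
+ * an fd from wc_Afalg_CreateRead() for a cipher, are caller-supplied): the
+ * three setters above each expect a cmsghdr placed in a msghdr control
+ * buffer, typically laid out with the standard CMSG_* macros before
+ * sendmsg():
+ *
+ *     char cbuf[CMSG_SPACE(sizeof(word32)) +
+ *               CMSG_SPACE(sizeof(struct af_alg_iv) + AES_BLOCK_SIZE)] = {0};
+ *     struct msghdr msg = {0};
+ *     struct cmsghdr* cmsg;
+ *
+ *     msg.msg_control    = cbuf;
+ *     msg.msg_controllen = sizeof(cbuf);
+ *     cmsg = CMSG_FIRSTHDR(&msg);
+ *     wc_Afalg_SetOp(cmsg, 0);
+ *     cmsg = CMSG_NXTHDR(&msg, cmsg);
+ *     wc_Afalg_SetIv(cmsg, iv, AES_BLOCK_SIZE);
+ *     sendmsg(opFd, &msg, 0);
+ */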
+
+#endif /* WOLFSSL_AFALG || WOLFSSL_AFALG_XILINX */
+
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
new file mode 100644
index 0000000..6fd1ed3
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S
@@ -0,0 +1,6012 @@
+/* armv8-32-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S
+ */
+
+#ifdef WOLFSSL_ARMASM
+#ifndef __aarch64__
+ .text
+ .align 2
+ .globl fe_init
+ .type fe_init, %function
+fe_init:
+ bx lr
+ .size fe_init,.-fe_init
+ .text
+ .align 2
+ .globl fe_frombytes
+ .type fe_frombytes, %function
+fe_frombytes:
+ push {r4, r5, r6, r7, lr}
+ ldrd r2, r3, [r1]
+ ldr r12, [r1, #8]
+ ldr lr, [r1, #12]
+ ldrd r4, r5, [r1, #16]
+ ldrd r6, r7, [r1, #24]
+ and r7, r7, #0x7fffffff
+ strd r2, r3, [r0]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ strd r4, r5, [r0, #16]
+ strd r6, r7, [r0, #24]
+ pop {r4, r5, r6, r7, pc}
+ .size fe_frombytes,.-fe_frombytes
+ .text
+ .align 2
+ .globl fe_tobytes
+ .type fe_tobytes, %function
+fe_tobytes:
+ push {r4, r5, r6, r7, r8, lr}
+ ldrd r2, r3, [r1]
+ ldr r12, [r1, #8]
+ ldr lr, [r1, #12]
+ ldrd r4, r5, [r1, #16]
+ ldrd r6, r7, [r1, #24]
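+ # Canonical reduce: the carry out of a + 19 into bit 255 tells whether
+ # a >= p = 2^255 - 19; if so, 19 is added and the top bit masked off,
+ # which is exactly a - p (since a + 19 - 2^255 = a - p).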
+ adds r8, r2, #19
+ adcs r8, r3, #0
+ adcs r8, r12, #0
+ adcs r8, lr, #0
+ adcs r8, r4, #0
+ adcs r8, r5, #0
+ adcs r8, r6, #0
+ adc r8, r7, #0
+ asr r8, r8, #31
+ and r8, r8, #19
+ adds r2, r2, r8
+ adcs r3, r3, #0
+ adcs r12, r12, #0
+ adcs lr, lr, #0
+ adcs r4, r4, #0
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adc r7, r7, #0
+ and r7, r7, #0x7fffffff
+ strd r2, r3, [r0]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ strd r4, r5, [r0, #16]
+ strd r6, r7, [r0, #24]
+ pop {r4, r5, r6, r7, r8, pc}
+ .size fe_tobytes,.-fe_tobytes
+ .text
+ .align 2
+ .globl fe_1
+ .type fe_1, %function
+fe_1:
+ # Set one
+ mov r2, #1
+ mov r1, #0
+ str r2, [r0]
+ str r1, [r0, #4]
+ str r1, [r0, #8]
+ str r1, [r0, #12]
+ str r1, [r0, #16]
+ str r1, [r0, #20]
+ str r1, [r0, #24]
+ str r1, [r0, #28]
+ bx lr
+ .size fe_1,.-fe_1
+ .text
+ .align 2
+ .globl fe_0
+ .type fe_0, %function
+fe_0:
+ # Set zero
+ mov r1, #0
+ str r1, [r0]
+ str r1, [r0, #4]
+ str r1, [r0, #8]
+ str r1, [r0, #12]
+ str r1, [r0, #16]
+ str r1, [r0, #20]
+ str r1, [r0, #24]
+ str r1, [r0, #28]
+ bx lr
+ .size fe_0,.-fe_0
+ .text
+ .align 2
+ .globl fe_copy
+ .type fe_copy, %function
+fe_copy:
+ push {lr}
+ # Copy
+ ldrd r2, r3, [r1]
+ ldr r12, [r1, #8]
+ ldr lr, [r1, #12]
+ strd r2, r3, [r0]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ ldrd r2, r3, [r1, #16]
+ ldr r12, [r1, #24]
+ ldr lr, [r1, #28]
+ strd r2, r3, [r0, #16]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ pop {pc}
+ .size fe_copy,.-fe_copy
+ .text
+ .align 2
+ .globl fe_sub
+ .type fe_sub, %function
+fe_sub:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ # Sub
+ ldr r12, [r1]
+ ldr lr, [r1, #4]
+ ldrd r4, r5, [r1, #8]
+ ldrd r6, r7, [r2]
+ ldrd r8, r9, [r2, #8]
+ subs r6, r12, r6
+ sbcs r7, lr, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ strd r6, r7, [r0]
+ strd r8, r9, [r0, #8]
+ ldr r12, [r1, #16]
+ ldr lr, [r1, #20]
+ ldrd r4, r5, [r1, #24]
+ ldrd r6, r7, [r2, #16]
+ ldrd r8, r9, [r2, #24]
+ sbcs r6, r12, r6
+ sbcs r7, lr, r7
+ sbcs r8, r4, r8
+ sbc r9, r5, r9
+ mov r10, #-19
+ asr r3, r9, #31
+ # Mask the modulus
+ and r10, r3, r10
+ and r11, r3, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ adds r12, r12, r10
+ adcs lr, lr, r3
+ adcs r4, r4, r3
+ adcs r5, r5, r3
+ adcs r6, r6, r3
+ adcs r7, r7, r3
+ adcs r8, r8, r3
+ adc r9, r9, r11
+ str r12, [r0]
+ str lr, [r0, #4]
+ strd r4, r5, [r0, #8]
+ strd r6, r7, [r0, #16]
+ strd r8, r9, [r0, #24]
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_sub,.-fe_sub
+ .text
+ .align 2
+ .globl fe_add
+ .type fe_add, %function
+fe_add:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ # Add
+ ldr r12, [r1]
+ ldr lr, [r1, #4]
+ ldrd r4, r5, [r1, #8]
+ ldrd r6, r7, [r2]
+ ldrd r8, r9, [r2, #8]
+ adds r6, r12, r6
+ adcs r7, lr, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ strd r6, r7, [r0]
+ strd r8, r9, [r0, #8]
+ ldr r12, [r1, #16]
+ ldr lr, [r1, #20]
+ ldrd r4, r5, [r1, #24]
+ ldrd r6, r7, [r2, #16]
+ ldrd r8, r9, [r2, #24]
+ adcs r6, r12, r6
+ adcs r7, lr, r7
+ adcs r8, r4, r8
+ adc r9, r5, r9
+ mov r10, #-19
+ asr r3, r9, #31
+ # Mask the modulus
+ and r10, r3, r10
+ and r11, r3, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ subs r12, r12, r10
+ sbcs lr, lr, r3
+ sbcs r4, r4, r3
+ sbcs r5, r5, r3
+ sbcs r6, r6, r3
+ sbcs r7, r7, r3
+ sbcs r8, r8, r3
+ sbc r9, r9, r11
+ str r12, [r0]
+ str lr, [r0, #4]
+ strd r4, r5, [r0, #8]
+ strd r6, r7, [r0, #16]
+ strd r8, r9, [r0, #24]
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_add,.-fe_add
+ .text
+ .align 2
+ .globl fe_neg
+ .type fe_neg, %function
+fe_neg:
+ push {r4, r5, lr}
+ mov r5, #-1
+ mov r4, #-19
+ ldrd r2, r3, [r1]
+ ldr r12, [r1, #8]
+ ldr lr, [r1, #12]
+ subs r2, r4, r2
+ sbcs r3, r5, r3
+ sbcs r12, r5, r12
+ sbcs lr, r5, lr
+ strd r2, r3, [r0]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ mov r4, #0x7fffffff
+ ldrd r2, r3, [r1, #16]
+ ldr r12, [r1, #24]
+ ldr lr, [r1, #28]
+ sbcs r2, r5, r2
+ sbcs r3, r5, r3
+ sbcs r12, r5, r12
+ sbc lr, r4, lr
+ strd r2, r3, [r0, #16]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ pop {r4, r5, pc}
+ .size fe_neg,.-fe_neg
+ .text
+ .align 2
+ .globl fe_isnonzero
+ .type fe_isnonzero, %function
+fe_isnonzero:
+ push {r4, r5, r6, r7, r8, lr}
+ ldrd r2, r3, [r0]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [r0, #24]
+ adds r1, r2, #19
+ adcs r1, r3, #0
+ adcs r1, r12, #0
+ adcs r1, lr, #0
+ adcs r1, r4, #0
+ adcs r1, r5, #0
+ adcs r1, r6, #0
+ adc r1, r7, #0
+ asr r1, r1, #31
+ and r1, r1, #19
+ adds r2, r2, r1
+ adcs r3, r3, #0
+ adcs r12, r12, #0
+ adcs lr, lr, #0
+ adcs r4, r4, #0
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adc r7, r7, #0
+ and r7, r7, #0x7fffffff
+ orr r2, r2, r3
+ orr r12, r12, lr
+ orr r4, r4, r5
+ orr r6, r6, r7
+ orr r12, r12, r4
+ orr r2, r2, r6
+ orr r0, r2, r12
+ pop {r4, r5, r6, r7, r8, pc}
+ .size fe_isnonzero,.-fe_isnonzero
+ .text
+ .align 2
+ .globl fe_isnegative
+ .type fe_isnegative, %function
+fe_isnegative:
+ push {lr}
+ ldrd r2, r3, [r0]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ adds r1, r2, #19
+ adcs r1, r3, #0
+ adcs r1, r12, #0
+ adcs r1, lr, #0
+ ldrd r2, r3, [r0, #16]
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ adcs r1, r2, #0
+ adcs r1, r3, #0
+ adcs r1, r12, #0
+ ldr r2, [r0]
+ adc r1, lr, #0
+ and r0, r2, #1
+ lsr r1, r1, #31
+ eor r0, r0, r1
+ pop {pc}
+ .size fe_isnegative,.-fe_isnegative
+ .text
+ .align 2
+ .globl fe_cmov_table
+ .type fe_cmov_table, %function
+fe_cmov_table:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sxtb r2, r2
+ sbfx r7, r2, #7, #1
+ eor r10, r2, r7
+ sub r10, r10, r7
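+ # Constant-time table select (assumed table layout): r10 = |b|; each of
+ # the eight precomputed entries below is masked in only when its index
+ # matches |b|, and the sign of b later swaps the y+x / y-x halves and
+ # negates the xy term, so memory access does not depend on b.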
+ mov r3, #1
+ mov r12, #0
+ mov lr, #1
+ mov r4, #0
+ mov r5, #0
+ mov r6, #0
+ mov r7, #0x80000000
+ ror r7, r7, #31
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #30
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #29
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #28
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #27
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #26
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #25
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #24
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #32]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #64]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ sub r1, r1, #0x2a0
+ mov r8, #-19
+ mov r9, #-1
+ subs r8, r8, r5
+ sbcs r9, r9, r6
+ sbc r11, r11, r11
+ asr r10, r2, #31
+ eor r7, r3, lr
+ and r7, r7, r10
+ eor r3, r3, r7
+ eor lr, lr, r7
+ eor r7, r12, r4
+ and r7, r7, r10
+ eor r12, r12, r7
+ eor r4, r4, r7
+ eor r8, r8, r5
+ and r8, r8, r10
+ eor r5, r5, r8
+ eor r9, r9, r6
+ and r9, r9, r10
+ eor r6, r6, r9
+ str r3, [r0]
+ str r12, [r0, #4]
+ str lr, [r0, #32]
+ str r4, [r0, #36]
+ str r5, [r0, #64]
+ str r6, [r0, #68]
+ sbfx r7, r2, #7, #1
+ eor r10, r2, r7
+ sub r10, r10, r7
+ mov r3, #0
+ mov r12, #0
+ mov lr, #0
+ mov r4, #0
+ mov r5, #0
+ mov r6, #0
+ mov r7, #0x80000000
+ ror r7, r7, #31
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #30
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #29
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #28
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #27
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #26
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #25
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #24
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #8]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #40]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #72]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ sub r1, r1, #0x2a0
+ mov r8, #-1
+ mov r9, #-1
+ rsbs r11, r11, #0
+ sbcs r8, r8, r5
+ sbcs r9, r9, r6
+ sbc r11, r11, r11
+ asr r10, r2, #31
+ eor r7, r3, lr
+ and r7, r7, r10
+ eor r3, r3, r7
+ eor lr, lr, r7
+ eor r7, r12, r4
+ and r7, r7, r10
+ eor r12, r12, r7
+ eor r4, r4, r7
+ eor r8, r8, r5
+ and r8, r8, r10
+ eor r5, r5, r8
+ eor r9, r9, r6
+ and r9, r9, r10
+ eor r6, r6, r9
+ str r3, [r0, #8]
+ str r12, [r0, #12]
+ str lr, [r0, #40]
+ str r4, [r0, #44]
+ str r5, [r0, #72]
+ str r6, [r0, #76]
+ sbfx r7, r2, #7, #1
+ eor r10, r2, r7
+ sub r10, r10, r7
+ mov r3, #0
+ mov r12, #0
+ mov lr, #0
+ mov r4, #0
+ mov r5, #0
+ mov r6, #0
+ mov r7, #0x80000000
+ ror r7, r7, #31
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #30
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #29
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #28
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #27
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #26
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #25
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #24
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #16]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #48]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #80]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ sub r1, r1, #0x2a0
+ mov r8, #-1
+ mov r9, #-1
+ rsbs r11, r11, #0
+ sbcs r8, r8, r5
+ sbcs r9, r9, r6
+ sbc r11, r11, r11
+ asr r10, r2, #31
+ eor r7, r3, lr
+ and r7, r7, r10
+ eor r3, r3, r7
+ eor lr, lr, r7
+ eor r7, r12, r4
+ and r7, r7, r10
+ eor r12, r12, r7
+ eor r4, r4, r7
+ eor r8, r8, r5
+ and r8, r8, r10
+ eor r5, r5, r8
+ eor r9, r9, r6
+ and r9, r9, r10
+ eor r6, r6, r9
+ str r3, [r0, #16]
+ str r12, [r0, #20]
+ str lr, [r0, #48]
+ str r4, [r0, #52]
+ str r5, [r0, #80]
+ str r6, [r0, #84]
+ sbfx r7, r2, #7, #1
+ eor r10, r2, r7
+ sub r10, r10, r7
+ mov r3, #0
+ mov r12, #0
+ mov lr, #0
+ mov r4, #0
+ mov r5, #0
+ mov r6, #0
+ mov r7, #0x80000000
+ ror r7, r7, #31
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #30
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #29
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #28
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #27
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #26
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #25
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ add r1, r1, #0x60
+ mov r7, #0x80000000
+ ror r7, r7, #24
+ ror r7, r7, r10
+ asr r7, r7, #31
+ ldrd r8, r9, [r1, #24]
+ eor r8, r8, r3
+ eor r9, r9, r12
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r3, r3, r8
+ eor r12, r12, r9
+ ldrd r8, r9, [r1, #56]
+ eor r8, r8, lr
+ eor r9, r9, r4
+ and r8, r8, r7
+ and r9, r9, r7
+ eor lr, lr, r8
+ eor r4, r4, r9
+ ldrd r8, r9, [r1, #88]
+ eor r8, r8, r5
+ eor r9, r9, r6
+ and r8, r8, r7
+ and r9, r9, r7
+ eor r5, r5, r8
+ eor r6, r6, r9
+ sub r1, r1, #0x2a0
+ mov r8, #-1
+ mov r9, #0x7fffffff
+ rsbs r11, r11, #0
+ sbcs r8, r8, r5
+ sbc r9, r9, r6
+ asr r10, r2, #31
+ eor r7, r3, lr
+ and r7, r7, r10
+ eor r3, r3, r7
+ eor lr, lr, r7
+ eor r7, r12, r4
+ and r7, r7, r10
+ eor r12, r12, r7
+ eor r4, r4, r7
+ eor r8, r8, r5
+ and r8, r8, r10
+ eor r5, r5, r8
+ eor r9, r9, r6
+ and r9, r9, r10
+ eor r6, r6, r9
+ str r3, [r0, #24]
+ str r12, [r0, #28]
+ str lr, [r0, #56]
+ str r4, [r0, #60]
+ str r5, [r0, #88]
+ str r6, [r0, #92]
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_cmov_table,.-fe_cmov_table
+ .text
+ .align 2
+ .globl fe_mul
+ .type fe_mul, %function
+fe_mul:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0x40
+ # Multiply
+ ldr r7, [r1]
+ ldr r8, [r1, #4]
+ ldr r9, [r2]
+ ldr lr, [r2, #4]
+ # A[0] * B[0] = 0
+ umull r4, r5, r7, r9
+ str r4, [sp]
+ # A[0] * B[1] = 1
+ umull r3, r6, r7, lr
+ adds r5, r5, r3
+ adc r6, r6, #0
+ # A[1] * B[0] = 1
+ umull r3, r12, r8, r9
+ adds r5, r5, r3
+ mov r4, #0
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ str r5, [sp, #4]
+ # A[2] * B[0] = 2
+ ldr r10, [r1, #8]
+ umull r3, r12, r10, r9
+ adds r6, r6, r3
+ adc r4, r4, r12
+ # A[1] * B[1] = 2
+ umull r3, r12, r8, lr
+ adds r6, r6, r3
+ mov r5, #0
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[0] * B[2] = 2
+ ldr r11, [r2, #8]
+ umull r3, r12, r7, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ str r6, [sp, #8]
+ # A[0] * B[3] = 3
+ ldr r11, [r2, #12]
+ umull r3, r12, r7, r11
+ adds r4, r4, r3
+ mov r6, #0
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[1] * B[2] = 3
+ ldr r11, [r2, #8]
+ umull r3, r12, r8, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[2] * B[1] = 3
+ umull r3, r12, r10, lr
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[3] * B[0] = 3
+ ldr r10, [r1, #12]
+ umull r3, r12, r10, r9
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ str r4, [sp, #12]
+ # A[4] * B[0] = 4
+ ldr r10, [r1, #16]
+ umull r3, r12, r10, r9
+ adds r5, r5, r3
+ mov r4, #0
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[3] * B[1] = 4
+ ldr r10, [r1, #12]
+ umull r3, r12, r10, lr
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[2] * B[2] = 4
+ ldr r10, [r1, #8]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[1] * B[3] = 4
+ ldr r11, [r2, #12]
+ umull r3, r12, r8, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[0] * B[4] = 4
+ ldr r11, [r2, #16]
+ umull r3, r12, r7, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ str r5, [sp, #16]
+ # A[0] * B[5] = 5
+ ldr r11, [r2, #20]
+ umull r3, r12, r7, r11
+ adds r6, r6, r3
+ mov r5, #0
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[1] * B[4] = 5
+ ldr r11, [r2, #16]
+ umull r3, r12, r8, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[2] * B[3] = 5
+ ldr r11, [r2, #12]
+ umull r3, r12, r10, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[3] * B[2] = 5
+ ldr r10, [r1, #12]
+ ldr r11, [r2, #8]
+ umull r3, r12, r10, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[4] * B[1] = 5
+ ldr r10, [r1, #16]
+ umull r3, r12, r10, lr
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[5] * B[0] = 5
+ ldr r10, [r1, #20]
+ umull r3, r12, r10, r9
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ str r6, [sp, #20]
+ # A[6] * B[0] = 6
+ ldr r10, [r1, #24]
+ umull r3, r12, r10, r9
+ adds r4, r4, r3
+ mov r6, #0
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[5] * B[1] = 6
+ ldr r10, [r1, #20]
+ umull r3, r12, r10, lr
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[4] * B[2] = 6
+ ldr r10, [r1, #16]
+ umull r3, r12, r10, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[3] * B[3] = 6
+ ldr r10, [r1, #12]
+ ldr r11, [r2, #12]
+ umull r3, r12, r10, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[2] * B[4] = 6
+ ldr r10, [r1, #8]
+ ldr r11, [r2, #16]
+ umull r3, r12, r10, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[1] * B[5] = 6
+ ldr r11, [r2, #20]
+ umull r3, r12, r8, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[0] * B[6] = 6
+ ldr r11, [r2, #24]
+ umull r3, r12, r7, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ str r4, [sp, #24]
+ # A[0] * B[7] = 7
+ ldr r11, [r2, #28]
+ umull r3, r12, r7, r11
+ adds r5, r5, r3
+ mov r4, #0
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[1] * B[6] = 7
+ ldr r11, [r2, #24]
+ umull r3, r12, r8, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[2] * B[5] = 7
+ ldr r11, [r2, #20]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[3] * B[4] = 7
+ ldr r10, [r1, #12]
+ ldr r11, [r2, #16]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[4] * B[3] = 7
+ ldr r10, [r1, #16]
+ ldr r11, [r2, #12]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[5] * B[2] = 7
+ ldr r10, [r1, #20]
+ ldr r11, [r2, #8]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[6] * B[1] = 7
+ ldr r10, [r1, #24]
+ umull r3, r12, r10, lr
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[7] * B[0] = 7
+ ldr r10, [r1, #28]
+ umull r3, r12, r10, r9
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ str r5, [sp, #28]
+ ldr r7, [r1, #24]
+ ldr r9, [r2, #24]
+ # A[7] * B[1] = 8
+ umull r3, r12, r10, lr
+ adds r6, r6, r3
+ mov r5, #0
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[6] * B[2] = 8
+ umull r3, r12, r7, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[5] * B[3] = 8
+ ldr r10, [r1, #20]
+ ldr r11, [r2, #12]
+ umull r3, r12, r10, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[4] * B[4] = 8
+ ldr r10, [r1, #16]
+ ldr r11, [r2, #16]
+ umull r3, r12, r10, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[3] * B[5] = 8
+ ldr r10, [r1, #12]
+ ldr r11, [r2, #20]
+ umull r3, r12, r10, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[2] * B[6] = 8
+ ldr r10, [r1, #8]
+ umull r3, r12, r10, r9
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[1] * B[7] = 8
+ ldr r11, [r2, #28]
+ umull r3, r12, r8, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ str r6, [sp, #32]
+ ldr r8, [r1, #28]
+ mov lr, r11
+ # A[2] * B[7] = 9
+ umull r3, r12, r10, lr
+ adds r4, r4, r3
+ mov r6, #0
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[3] * B[6] = 9
+ ldr r10, [r1, #12]
+ umull r3, r12, r10, r9
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[4] * B[5] = 9
+ ldr r10, [r1, #16]
+ ldr r11, [r2, #20]
+ umull r3, r12, r10, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[5] * B[4] = 9
+ ldr r10, [r1, #20]
+ ldr r11, [r2, #16]
+ umull r3, r12, r10, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[6] * B[3] = 9
+ ldr r11, [r2, #12]
+ umull r3, r12, r7, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[7] * B[2] = 9
+ ldr r11, [r2, #8]
+ umull r3, r12, r8, r11
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ str r4, [sp, #36]
+ # A[7] * B[3] = 10
+ ldr r11, [r2, #12]
+ umull r3, r12, r8, r11
+ adds r5, r5, r3
+ mov r4, #0
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[6] * B[4] = 10
+ ldr r11, [r2, #16]
+ umull r3, r12, r7, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[5] * B[5] = 10
+ ldr r11, [r2, #20]
+ umull r3, r12, r10, r11
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[4] * B[6] = 10
+ ldr r10, [r1, #16]
+ umull r3, r12, r10, r9
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[3] * B[7] = 10
+ ldr r10, [r1, #12]
+ umull r3, r12, r10, lr
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ str r5, [sp, #40]
+ # A[4] * B[7] = 11
+ ldr r10, [r1, #16]
+ umull r3, r12, r10, lr
+ adds r6, r6, r3
+ mov r5, #0
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[5] * B[6] = 11
+ ldr r10, [r1, #20]
+ umull r3, r12, r10, r9
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[6] * B[5] = 11
+ umull r3, r12, r7, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ # A[7] * B[4] = 11
+ ldr r11, [r2, #16]
+ umull r3, r12, r8, r11
+ adds r6, r6, r3
+ adcs r4, r4, r12
+ adc r5, r5, #0
+ str r6, [sp, #44]
+ # A[7] * B[5] = 12
+ ldr r11, [r2, #20]
+ umull r3, r12, r8, r11
+ adds r4, r4, r3
+ mov r6, #0
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[6] * B[6] = 12
+ umull r3, r12, r7, r9
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ # A[5] * B[7] = 12
+ umull r3, r12, r10, lr
+ adds r4, r4, r3
+ adcs r5, r5, r12
+ adc r6, r6, #0
+ str r4, [sp, #48]
+ # A[6] * B[7] = 13
+ umull r3, r12, r7, lr
+ adds r5, r5, r3
+ mov r4, #0
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ # A[7] * B[6] = 13
+ umull r3, r12, r8, r9
+ adds r5, r5, r3
+ adcs r6, r6, r12
+ adc r4, r4, #0
+ str r5, [sp, #52]
+ # A[7] * B[7] = 14
+ umull r3, r12, r8, lr
+ adds r6, r6, r3
+ adc r4, r4, r12
+ str r6, [sp, #56]
+ str r4, [sp, #60]
+ # Reduce
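+ # Fold the 512-bit product into 255 bits: 2^255 == 19 (mod p), so the
+ # bits at and above position 255 are multiplied by 19 and added back
+ # into the low half; any remaining carry is folded once more below.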
+ # Load bottom half
+ ldrd r4, r5, [sp]
+ ldrd r6, r7, [sp, #8]
+ ldrd r8, r9, [sp, #16]
+ ldrd r10, r11, [sp, #24]
+ lsr r3, r11, #31
+ and r11, r11, #0x7fffffff
+ mov lr, #19
+ ldr r1, [sp, #32]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ adds r4, r4, r3
+ mov r2, #0
+ adcs r5, r5, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #36]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r5, r5, r3
+ mov r2, #0
+ adcs r6, r6, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #40]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r6, r6, r3
+ mov r2, #0
+ adcs r7, r7, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #44]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r7, r7, r3
+ mov r2, #0
+ adcs r8, r8, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #48]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r8, r8, r3
+ mov r2, #0
+ adcs r9, r9, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #52]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r9, r9, r3
+ mov r2, #0
+ adcs r10, r10, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #56]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ add r12, r12, r2
+ adds r10, r10, r3
+ mov r2, #0
+ adcs r11, r11, r12
+ adc r2, r2, #0
+ lsr r3, r1, #31
+ ldr r1, [sp, #60]
+ orr r3, r3, r1, lsl #1
+ umull r3, r12, lr, r3
+ adds r11, r11, r3
+ adc r3, r12, r2
+ # Overflow
+ lsl r3, r3, #1
+ orr r3, r3, r11, lsr #31
+ mul r3, r3, lr
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r3
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Reduce if top bit set
+ asr r3, r11, #31
+ and r3, r3, lr
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r3
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Store
+ strd r4, r5, [r0]
+ strd r6, r7, [r0, #8]
+ strd r8, r9, [r0, #16]
+ strd r10, r11, [r0, #24]
+ add sp, sp, #0x40
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_mul,.-fe_mul
+ .text
+ .align 2
+ .globl fe_sq
+ .type fe_sq, %function
+fe_sq:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0x40
+ # Square
+ ldr r7, [r1]
+ ldr r8, [r1, #4]
+ ldr r9, [r1, #8]
+ ldr r10, [r1, #12]
+ ldr r12, [r1, #16]
+ # A[0] * A[0] = 0
+ umull r4, r5, r7, r7
+ str r4, [sp]
+ # A[0] * A[1] = 1
+ umull r2, r3, r7, r8
+ mov r6, #0
+ adds r5, r5, r2
+ adc r6, r6, r3
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #4]
+ # A[1] * A[1] = 2
+ umull r2, r3, r8, r8
+ adds r6, r6, r2
+ adc r4, r4, r3
+ # A[0] * A[2] = 2
+ umull r2, r3, r7, r9
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #8]
+ # A[0] * A[3] = 3
+ umull r2, r3, r7, r10
+ adds r4, r4, r2
+ adc r5, r5, r3
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[1] * A[2] = 3
+ umull r2, r3, r8, r9
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #12]
+ # A[2] * A[2] = 4
+ umull r2, r3, r9, r9
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[1] * A[3] = 4
+ umull r2, r3, r8, r10
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[0] * A[4] = 4
+ umull r2, r3, r7, r12
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #16]
+ # A[0] * A[5] = 5
+ ldr r11, [r1, #20]
+ umull r2, r3, r7, r11
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[1] * A[4] = 5
+ umull r2, r3, r8, r12
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[2] * A[3] = 5
+ umull r2, r3, r9, r10
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #20]
+ # A[3] * A[3] = 6
+ umull r2, r3, r10, r10
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[2] * A[4] = 6
+ umull r2, r3, r9, r12
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[1] * A[5] = 6
+ umull r2, r3, r8, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[0] * A[6] = 6
+ ldr r11, [r1, #24]
+ umull r2, r3, r7, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #24]
+ # A[0] * A[7] = 7
+ ldr r11, [r1, #28]
+ umull r2, r3, r7, r11
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[1] * A[6] = 7
+ ldr r11, [r1, #24]
+ umull r2, r3, r8, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[2] * A[5] = 7
+ ldr r11, [r1, #20]
+ umull r2, r3, r9, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[3] * A[4] = 7
+ umull r2, r3, r10, r12
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #28]
+ # A[4] * A[4] = 8
+ umull r2, r3, r12, r12
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[3] * A[5] = 8
+ umull r2, r3, r10, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[2] * A[6] = 8
+ ldr r11, [r1, #24]
+ umull r2, r3, r9, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[1] * A[7] = 8
+ ldr r11, [r1, #28]
+ umull r2, r3, r8, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #32]
+ ldr r7, [r1, #20]
+ # A[2] * A[7] = 9
+ umull r2, r3, r9, r11
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[3] * A[6] = 9
+ ldr r11, [r1, #24]
+ umull r2, r3, r10, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[4] * A[5] = 9
+ umull r2, r3, r12, r7
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #36]
+ mov r8, r11
+ # A[5] * A[5] = 10
+ umull r2, r3, r7, r7
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[4] * A[6] = 10
+ umull r2, r3, r12, r8
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[3] * A[7] = 10
+ ldr r11, [r1, #28]
+ umull r2, r3, r10, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #40]
+ mov r9, r11
+ # A[4] * A[7] = 11
+ umull r2, r3, r12, r9
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[5] * A[6] = 11
+ umull r2, r3, r7, r8
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #44]
+ # A[6] * A[6] = 12
+ umull r2, r3, r8, r8
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[5] * A[7] = 12
+ umull r2, r3, r7, r9
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #48]
+ # A[6] * A[7] = 13
+ umull r2, r3, r8, r9
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #52]
+ # A[7] * A[7] = 14
+ umull r2, r3, r9, r9
+ adds r6, r6, r2
+ adc r4, r4, r3
+ str r6, [sp, #56]
+ str r4, [sp, #60]
+ # Reduce
+ # Load bottom half
+ ldrd r4, r5, [sp]
+ ldrd r6, r7, [sp, #8]
+ ldrd r8, r9, [sp, #16]
+ ldrd r10, r11, [sp, #24]
+ lsr r2, r11, #31
+ and r11, r11, #0x7fffffff
+ mov r12, #19
+ ldr r1, [sp, #32]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ adds r4, r4, r2
+ mov lr, #0
+ adcs r5, r5, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #36]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r5, r5, r2
+ mov lr, #0
+ adcs r6, r6, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #40]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r6, r6, r2
+ mov lr, #0
+ adcs r7, r7, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #44]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r7, r7, r2
+ mov lr, #0
+ adcs r8, r8, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #48]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r8, r8, r2
+ mov lr, #0
+ adcs r9, r9, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #52]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r9, r9, r2
+ mov lr, #0
+ adcs r10, r10, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #56]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r10, r10, r2
+ mov lr, #0
+ adcs r11, r11, r3
+ adc lr, lr, #0
+ lsr r2, r1, #31
+ ldr r1, [sp, #60]
+ orr r2, r2, r1, lsl #1
+ umull r2, r3, r12, r2
+ adds r11, r11, r2
+ adc r2, r3, lr
+ # Overflow
+ lsl r2, r2, #1
+ orr r2, r2, r11, lsr #31
+ mul r2, r2, r12
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r2
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Reduce if top bit set
+ asr r2, r11, #31
+ and r2, r2, r12
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r2
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Store
+ strd r4, r5, [r0]
+ strd r6, r7, [r0, #8]
+ strd r8, r9, [r0, #16]
+ strd r10, r11, [r0, #24]
+ add sp, sp, #0x40
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_sq,.-fe_sq
+ .text
+ .align 2
+ .globl fe_mul121666
+ .type fe_mul121666, %function
+fe_mul121666:
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ # Multiply by 121666
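+ # 121666 = 0x1db42 = (486662 + 2) / 4, the (A+2)/4 ladder constant for
+ # curve25519; loaded into lr below via movw/movt.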
+ ldrd r2, r3, [r1]
+ ldrd r4, r5, [r1, #8]
+ ldrd r6, r7, [r1, #16]
+ ldrd r8, r9, [r1, #24]
+ movw lr, #0xdb42
+ movt lr, #1
+ umull r2, r10, r2, lr
+ umull r3, r12, r3, lr
+ adds r3, r3, r10
+ adc r10, r12, #0
+ umull r4, r12, r4, lr
+ adds r4, r4, r10
+ adc r10, r12, #0
+ umull r5, r12, r5, lr
+ adds r5, r5, r10
+ adc r10, r12, #0
+ umull r6, r12, r6, lr
+ adds r6, r6, r10
+ adc r10, r12, #0
+ umull r7, r12, r7, lr
+ adds r7, r7, r10
+ adc r10, r12, #0
+ umull r8, r12, r8, lr
+ adds r8, r8, r10
+ adc r10, r12, #0
+ umull r9, r12, r9, lr
+ adds r9, r9, r10
+ adc r10, r12, #0
+ mov lr, #19
+ lsl r10, r10, #1
+ orr r10, r10, r9, lsr #31
+ mul r10, r10, lr
+ and r9, r9, #0x7fffffff
+ adds r2, r2, r10
+ adcs r3, r3, #0
+ adcs r4, r4, #0
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adc r9, r9, #0
+ strd r2, r3, [r0]
+ strd r4, r5, [r0, #8]
+ strd r6, r7, [r0, #16]
+ strd r8, r9, [r0, #24]
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+ .size fe_mul121666,.-fe_mul121666
+ .text
+ .align 2
+ .globl fe_sq2
+ .type fe_sq2, %function
+fe_sq2:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0x40
+ # Square * 2
+ ldr r7, [r1]
+ ldr r8, [r1, #4]
+ ldr r9, [r1, #8]
+ ldr r10, [r1, #12]
+ ldr r12, [r1, #16]
+ # A[0] * A[0] = 0
+ umull r4, r5, r7, r7
+ str r4, [sp]
+ # A[0] * A[1] = 1
+ umull r2, r3, r7, r8
+ mov r6, #0
+ adds r5, r5, r2
+ adc r6, r6, r3
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #4]
+ # A[1] * A[1] = 2
+ umull r2, r3, r8, r8
+ adds r6, r6, r2
+ adc r4, r4, r3
+ # A[0] * A[2] = 2
+ umull r2, r3, r7, r9
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #8]
+ # A[0] * A[3] = 3
+ umull r2, r3, r7, r10
+ adds r4, r4, r2
+ adc r5, r5, r3
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[1] * A[2] = 3
+ umull r2, r3, r8, r9
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #12]
+ # A[2] * A[2] = 4
+ umull r2, r3, r9, r9
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[1] * A[3] = 4
+ umull r2, r3, r8, r10
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[0] * A[4] = 4
+ umull r2, r3, r7, r12
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #16]
+ # A[0] * A[5] = 5
+ ldr r11, [r1, #20]
+ umull r2, r3, r7, r11
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[1] * A[4] = 5
+ umull r2, r3, r8, r12
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[2] * A[3] = 5
+ umull r2, r3, r9, r10
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #20]
+ # A[3] * A[3] = 6
+ umull r2, r3, r10, r10
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[2] * A[4] = 6
+ umull r2, r3, r9, r12
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[1] * A[5] = 6
+ umull r2, r3, r8, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[0] * A[6] = 6
+ ldr r11, [r1, #24]
+ umull r2, r3, r7, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #24]
+ # A[0] * A[7] = 7
+ ldr r11, [r1, #28]
+ umull r2, r3, r7, r11
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[1] * A[6] = 7
+ ldr r11, [r1, #24]
+ umull r2, r3, r8, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[2] * A[5] = 7
+ ldr r11, [r1, #20]
+ umull r2, r3, r9, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[3] * A[4] = 7
+ umull r2, r3, r10, r12
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #28]
+ # A[4] * A[4] = 8
+ umull r2, r3, r12, r12
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[3] * A[5] = 8
+ umull r2, r3, r10, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[2] * A[6] = 8
+ ldr r11, [r1, #24]
+ umull r2, r3, r9, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[1] * A[7] = 8
+ ldr r11, [r1, #28]
+ umull r2, r3, r8, r11
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #32]
+ ldr r7, [r1, #20]
+ # A[2] * A[7] = 9
+ umull r2, r3, r9, r11
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[3] * A[6] = 9
+ ldr r11, [r1, #24]
+ umull r2, r3, r10, r11
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[4] * A[5] = 9
+ umull r2, r3, r12, r7
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #36]
+ mov r8, r11
+ # A[5] * A[5] = 10
+ umull r2, r3, r7, r7
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[4] * A[6] = 10
+ umull r2, r3, r12, r8
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ # A[3] * A[7] = 10
+ ldr r11, [r1, #28]
+ umull r2, r3, r10, r11
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #40]
+ mov r9, r11
+ # A[4] * A[7] = 11
+ umull r2, r3, r12, r9
+ adds r6, r6, r2
+ mov r5, #0
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ # A[5] * A[6] = 11
+ umull r2, r3, r7, r8
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ adds r6, r6, r2
+ adcs r4, r4, r3
+ adc r5, r5, #0
+ str r6, [sp, #44]
+ # A[6] * A[6] = 12
+ umull r2, r3, r8, r8
+ adds r4, r4, r2
+ mov r6, #0
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ # A[5] * A[7] = 12
+ umull r2, r3, r7, r9
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ adds r4, r4, r2
+ adcs r5, r5, r3
+ adc r6, r6, #0
+ str r4, [sp, #48]
+ # A[6] * A[7] = 13
+ umull r2, r3, r8, r9
+ adds r5, r5, r2
+ mov r4, #0
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ adds r5, r5, r2
+ adcs r6, r6, r3
+ adc r4, r4, #0
+ str r5, [sp, #52]
+ # A[7] * A[7] = 14
+ umull r2, r3, r9, r9
+ adds r6, r6, r2
+ adc r4, r4, r3
+ str r6, [sp, #56]
+ str r4, [sp, #60]
+ # Double and Reduce
+ # Load bottom half
+ ldrd r4, r5, [sp]
+ ldrd r6, r7, [sp, #8]
+ ldrd r8, r9, [sp, #16]
+ ldrd r10, r11, [sp, #24]
+ lsr r2, r11, #30
+ lsl r11, r11, #1
+ orr r11, r11, r10, lsr #31
+ lsl r10, r10, #1
+ orr r10, r10, r9, lsr #31
+ lsl r9, r9, #1
+ orr r9, r9, r8, lsr #31
+ lsl r8, r8, #1
+ orr r8, r8, r7, lsr #31
+ lsl r7, r7, #1
+ orr r7, r7, r6, lsr #31
+ lsl r6, r6, #1
+ orr r6, r6, r5, lsr #31
+ lsl r5, r5, #1
+ orr r5, r5, r4, lsr #31
+ lsl r4, r4, #1
+ and r11, r11, #0x7fffffff
+ mov r12, #19
+ ldr r1, [sp, #32]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ adds r4, r4, r2
+ mov lr, #0
+ adcs r5, r5, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #36]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r5, r5, r2
+ mov lr, #0
+ adcs r6, r6, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #40]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r6, r6, r2
+ mov lr, #0
+ adcs r7, r7, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #44]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r7, r7, r2
+ mov lr, #0
+ adcs r8, r8, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #48]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r8, r8, r2
+ mov lr, #0
+ adcs r9, r9, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #52]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r9, r9, r2
+ mov lr, #0
+ adcs r10, r10, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #56]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ add r3, r3, lr
+ adds r10, r10, r2
+ mov lr, #0
+ adcs r11, r11, r3
+ adc lr, lr, #0
+ lsr r2, r1, #30
+ ldr r1, [sp, #60]
+ orr r2, r2, r1, lsl #2
+ umull r2, r3, r12, r2
+ adds r11, r11, r2
+ adc r2, r3, lr
+ # Overflow
+ lsl r2, r2, #1
+ orr r2, r2, r11, lsr #31
+ mul r2, r2, r12
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r2
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Reduce if top bit set
+ asr r2, r11, #31
+ and r2, r2, r12
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r2
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ # Store
+ strd r4, r5, [r0]
+ strd r6, r7, [r0, #8]
+ strd r8, r9, [r0, #16]
+ strd r10, r11, [r0, #24]
+ add sp, sp, #0x40
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_sq2,.-fe_sq2
+ .text
+ .align 2
+ .globl fe_invert
+ .type fe_invert, %function
+fe_invert:
+ push {r4, lr}
+ sub sp, sp, #0x88
+ # Invert
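+ # Inversion via Fermat's little theorem: raise to p - 2 with
+ # p = 2^255 - 19 using a fixed square-and-multiply chain; the loop
+ # counters below give the lengths of each run of squarings.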
+ str r0, [sp, #128]
+ str r1, [sp, #132]
+ mov r0, sp
+ ldr r1, [sp, #132]
+ bl fe_sq
+ add r0, sp, #32
+ mov r1, sp
+ bl fe_sq
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ add r0, sp, #32
+ ldr r1, [sp, #132]
+ add r2, sp, #32
+ bl fe_mul
+ mov r0, sp
+ mov r1, sp
+ add r2, sp, #32
+ bl fe_mul
+ add r0, sp, #0x40
+ mov r1, sp
+ bl fe_sq
+ add r0, sp, #32
+ add r1, sp, #32
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ mov r4, #4
+L_fe_invert1:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert1
+ add r0, sp, #32
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ mov r4, #9
+L_fe_invert2:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert2
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #0x40
+ bl fe_sq
+ mov r4, #19
+L_fe_invert3:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert3
+ add r0, sp, #0x40
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ mov r4, #10
+L_fe_invert4:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert4
+ add r0, sp, #32
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ mov r4, #49
+L_fe_invert5:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert5
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #0x40
+ bl fe_sq
+ mov r4, #0x63
+L_fe_invert6:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert6
+ add r0, sp, #0x40
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ mov r4, #50
+L_fe_invert7:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert7
+ add r0, sp, #32
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ mov r4, #5
+L_fe_invert8:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_invert8
+ ldr r0, [sp, #128]
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ ldr r1, [sp, #132]
+ ldr r0, [sp, #128]
+ add sp, sp, #0x88
+ pop {r4, pc}
+ .size fe_invert,.-fe_invert
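+	# curve25519(r, n, p) is the X25519 scalar multiply: it computes
+	# the x-coordinate of n*P on the Montgomery curve
+	# y^2 = x^3 + 486662*x^2 + x over GF(2^255 - 19).  A constant-time
+	# Montgomery ladder walks the scalar from bit 254 down to bit 0;
+	# each step performs conditional swaps, paired field additions and
+	# subtractions, five multiplications, four squarings and one
+	# multiplication by 121666.  The inlined inversion chain after the
+	# loop (a copy of fe_invert) converts the projective result back
+	# to affine x, and the function returns 0 in r0.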
+ .text
+ .align 2
+ .globl curve25519
+ .type curve25519, %function
+curve25519:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0xbc
+ str r0, [sp, #160]
+ str r1, [sp, #164]
+ str r2, [sp, #168]
+ mov r1, #0
+ str r1, [sp, #172]
+ # Set one
+ mov r11, #1
+ mov r10, #0
+ str r11, [r0]
+ str r10, [r0, #4]
+ str r10, [r0, #8]
+ str r10, [r0, #12]
+ str r10, [r0, #16]
+ str r10, [r0, #20]
+ str r10, [r0, #24]
+ str r10, [r0, #28]
+ # Set zero
+ mov r10, #0
+ str r10, [sp]
+ str r10, [sp, #4]
+ str r10, [sp, #8]
+ str r10, [sp, #12]
+ str r10, [sp, #16]
+ str r10, [sp, #20]
+ str r10, [sp, #24]
+ str r10, [sp, #28]
+ # Set one
+ mov r11, #1
+ mov r10, #0
+ str r11, [sp, #32]
+ str r10, [sp, #36]
+ str r10, [sp, #40]
+ str r10, [sp, #44]
+ str r10, [sp, #48]
+ str r10, [sp, #52]
+ str r10, [sp, #56]
+ str r10, [sp, #60]
+ # Copy
+ ldrd r4, r5, [r2]
+ ldrd r6, r7, [r2, #8]
+ strd r4, r5, [sp, #64]
+ strd r6, r7, [sp, #72]
+ ldrd r4, r5, [r2, #16]
+ ldrd r6, r7, [r2, #24]
+ strd r4, r5, [sp, #80]
+ strd r6, r7, [sp, #88]
+ mov r1, #30
+ str r1, [sp, #180]
+ mov r2, #28
+ str r2, [sp, #176]
+L_curve25519_words:
+L_curve25519_bits:
+ ldr r1, [sp, #164]
+ ldr r2, [r1, r2]
+ ldr r1, [sp, #180]
+ lsr r2, r2, r1
+ and r2, r2, #1
+ str r2, [sp, #184]
+ ldr r1, [sp, #172]
+ eor r1, r1, r2
+ str r1, [sp, #172]
+ ldr r0, [sp, #160]
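+	# The swap flag is the current scalar bit XORed with the previous
+	# one, so the two working points are exchanged only when the bit
+	# changes.  neg turns the 0/1 flag into an all-zeros or all-ones
+	# mask; the XOR-and-mask sequences below then swap (or leave) the
+	# field elements without any data-dependent branch.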
+ # Conditional Swap
+ neg r1, r1
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [sp, #64]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [r0]
+ strd r6, r7, [sp, #64]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [sp, #72]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [r0, #8]
+ strd r6, r7, [sp, #72]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [sp, #80]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [r0, #16]
+ strd r6, r7, [sp, #80]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [sp, #88]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [r0, #24]
+ strd r6, r7, [sp, #88]
+ ldr r1, [sp, #172]
+ # Conditional Swap
+ neg r1, r1
+ ldrd r4, r5, [sp]
+ ldrd r6, r7, [sp, #32]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [sp]
+ strd r6, r7, [sp, #32]
+ ldrd r4, r5, [sp, #8]
+ ldrd r6, r7, [sp, #40]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [sp, #8]
+ strd r6, r7, [sp, #40]
+ ldrd r4, r5, [sp, #16]
+ ldrd r6, r7, [sp, #48]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [sp, #16]
+ strd r6, r7, [sp, #48]
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #56]
+ eor r8, r4, r6
+ eor r9, r5, r7
+ and r8, r8, r1
+ and r9, r9, r1
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r8
+ eor r7, r7, r9
+ strd r4, r5, [sp, #24]
+ strd r6, r7, [sp, #56]
+ ldr r1, [sp, #184]
+ str r1, [sp, #172]
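+	# Add-Sub interleaves a 256-bit add and a 256-bit subtract over
+	# the same operands.  Each needs its own flag chain, so the carry
+	# (r3) and borrow (r12) are parked as 0/1 values between 64-bit
+	# chunks and the flag is re-created with "adds rX, rX, #-1"
+	# before the next adcs/sbcs pair.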
+ # Add-Sub
+ # Add
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [sp]
+ adds r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [r0]
+ # Sub
+ subs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #128]
+ # Add
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [sp, #8]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [r0, #8]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #136]
+ # Add
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [sp, #16]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [r0, #16]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #144]
+ # Add
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [sp, #24]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ adc r9, r5, r7
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ sbc r11, r5, r7
+ mov r3, #-19
+ asr r2, r9, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldrd r4, r5, [r0]
+ subs r4, r4, r3
+ sbcs r5, r5, r2
+ strd r4, r5, [r0]
+ ldrd r4, r5, [r0, #8]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [r0, #8]
+ ldrd r4, r5, [r0, #16]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [r0, #16]
+ sbcs r8, r8, r2
+ sbc r9, r9, r12
+ strd r8, r9, [r0, #24]
+ mov r3, #-19
+ asr r2, r11, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Add modulus (if underflow)
+ ldrd r4, r5, [sp, #128]
+ adds r4, r4, r3
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #128]
+ ldrd r4, r5, [sp, #136]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #136]
+ ldrd r4, r5, [sp, #144]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #144]
+ adcs r10, r10, r2
+ adc r11, r11, r12
+ strd r10, r11, [sp, #152]
+ # Add-Sub
+ # Add
+ ldrd r4, r5, [sp, #64]
+ ldrd r6, r7, [sp, #32]
+ adds r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp]
+ # Sub
+ subs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #96]
+ # Add
+ ldrd r4, r5, [sp, #72]
+ ldrd r6, r7, [sp, #40]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp, #8]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #104]
+ # Add
+ ldrd r4, r5, [sp, #80]
+ ldrd r6, r7, [sp, #48]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp, #16]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #112]
+ # Add
+ ldrd r4, r5, [sp, #88]
+ ldrd r6, r7, [sp, #56]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ adc r9, r5, r7
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ sbc r11, r5, r7
+ mov r3, #-19
+ asr r2, r9, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldrd r4, r5, [sp]
+ subs r4, r4, r3
+ sbcs r5, r5, r2
+ strd r4, r5, [sp]
+ ldrd r4, r5, [sp, #8]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [sp, #8]
+ ldrd r4, r5, [sp, #16]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [sp, #16]
+ sbcs r8, r8, r2
+ sbc r9, r9, r12
+ strd r8, r9, [sp, #24]
+ mov r3, #-19
+ asr r2, r11, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Add modulus (if underflow)
+ ldrd r4, r5, [sp, #96]
+ adds r4, r4, r3
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #96]
+ ldrd r4, r5, [sp, #104]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #104]
+ ldrd r4, r5, [sp, #112]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #112]
+ adcs r10, r10, r2
+ adc r11, r11, r12
+ strd r10, r11, [sp, #120]
+ ldr r2, [sp, #160]
+ add r1, sp, #0x60
+ add r0, sp, #32
+ bl fe_mul
+ add r2, sp, #0x80
+ add r1, sp, #0
+ add r0, sp, #0
+ bl fe_mul
+ add r1, sp, #0x80
+ add r0, sp, #0x60
+ bl fe_sq
+ ldr r1, [sp, #160]
+ add r0, sp, #0x80
+ bl fe_sq
+ # Add-Sub
+ # Add
+ ldrd r4, r5, [sp, #32]
+ ldrd r6, r7, [sp]
+ adds r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp, #64]
+ # Sub
+ subs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp]
+ # Add
+ ldrd r4, r5, [sp, #40]
+ ldrd r6, r7, [sp, #8]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp, #72]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #8]
+ # Add
+ ldrd r4, r5, [sp, #48]
+ ldrd r6, r7, [sp, #16]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ mov r3, #0
+ adcs r9, r5, r7
+ adc r3, r3, #0
+ strd r8, r9, [sp, #80]
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ mov r12, #0
+ sbcs r11, r5, r7
+ adc r12, r12, #0
+ strd r10, r11, [sp, #16]
+ # Add
+ ldrd r4, r5, [sp, #56]
+ ldrd r6, r7, [sp, #24]
+ adds r3, r3, #-1
+ adcs r8, r4, r6
+ adc r9, r5, r7
+ # Sub
+ adds r12, r12, #-1
+ sbcs r10, r4, r6
+ sbc r11, r5, r7
+ mov r3, #-19
+ asr r2, r9, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldrd r4, r5, [sp, #64]
+ subs r4, r4, r3
+ sbcs r5, r5, r2
+ strd r4, r5, [sp, #64]
+ ldrd r4, r5, [sp, #72]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [sp, #72]
+ ldrd r4, r5, [sp, #80]
+ sbcs r4, r4, r2
+ sbcs r5, r5, r2
+ strd r4, r5, [sp, #80]
+ sbcs r8, r8, r2
+ sbc r9, r9, r12
+ strd r8, r9, [sp, #88]
+ mov r3, #-19
+ asr r2, r11, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Add modulus (if underflow)
+ ldrd r4, r5, [sp]
+ adds r4, r4, r3
+ adcs r5, r5, r2
+ strd r4, r5, [sp]
+ ldrd r4, r5, [sp, #8]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #8]
+ ldrd r4, r5, [sp, #16]
+ adcs r4, r4, r2
+ adcs r5, r5, r2
+ strd r4, r5, [sp, #16]
+ adcs r10, r10, r2
+ adc r11, r11, r12
+ strd r10, r11, [sp, #24]
+ add r2, sp, #0x60
+ add r1, sp, #0x80
+ ldr r0, [sp, #160]
+ bl fe_mul
+ # Sub
+ ldrd r4, r5, [sp, #128]
+ ldrd r6, r7, [sp, #136]
+ ldrd r8, r9, [sp, #96]
+ ldrd r10, r11, [sp, #104]
+ subs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ sbcs r11, r7, r11
+ strd r8, r9, [sp, #128]
+ strd r10, r11, [sp, #136]
+ ldrd r4, r5, [sp, #144]
+ ldrd r6, r7, [sp, #152]
+ ldrd r8, r9, [sp, #112]
+ ldrd r10, r11, [sp, #120]
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ sbc r11, r7, r11
+ mov r3, #-19
+ asr r2, r11, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Add modulus (if underflow)
+ ldrd r4, r5, [sp, #128]
+ ldrd r6, r7, [sp, #136]
+ adds r4, r4, r3
+ adcs r5, r5, r2
+ adcs r6, r6, r2
+ adcs r7, r7, r2
+ adcs r8, r8, r2
+ adcs r9, r9, r2
+ adcs r10, r10, r2
+ adc r11, r11, r12
+ strd r4, r5, [sp, #128]
+ strd r6, r7, [sp, #136]
+ strd r8, r9, [sp, #144]
+ strd r10, r11, [sp, #152]
+ add r1, sp, #0
+ add r0, sp, #0
+ bl fe_sq
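+	# The ladder step multiplies by a24 = (486662 + 2)/4 = 121666,
+	# loaded below as the immediate 0x1db42 via movw/movt.  The
+	# product is reduced on the fly: the carry above bit 255 is
+	# multiplied by 19 and added back into the low words.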
+ # Multiply by 121666
+ ldrd r4, r5, [sp, #128]
+ ldrd r6, r7, [sp, #136]
+ ldrd r8, r9, [sp, #144]
+ ldrd r10, r11, [sp, #152]
+ movw r12, #0xdb42
+ movt r12, #1
+ umull r4, r2, r4, r12
+ umull r5, r3, r5, r12
+ adds r5, r5, r2
+ adc r2, r3, #0
+ umull r6, r3, r6, r12
+ adds r6, r6, r2
+ adc r2, r3, #0
+ umull r7, r3, r7, r12
+ adds r7, r7, r2
+ adc r2, r3, #0
+ umull r8, r3, r8, r12
+ adds r8, r8, r2
+ adc r2, r3, #0
+ umull r9, r3, r9, r12
+ adds r9, r9, r2
+ adc r2, r3, #0
+ umull r10, r3, r10, r12
+ adds r10, r10, r2
+ adc r2, r3, #0
+ umull r11, r3, r11, r12
+ adds r11, r11, r2
+ adc r2, r3, #0
+ mov r12, #19
+ lsl r2, r2, #1
+ orr r2, r2, r11, lsr #31
+ mul r2, r2, r12
+ and r11, r11, #0x7fffffff
+ adds r4, r4, r2
+ adcs r5, r5, #0
+ adcs r6, r6, #0
+ adcs r7, r7, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ adc r11, r11, #0
+ strd r4, r5, [sp, #32]
+ strd r6, r7, [sp, #40]
+ strd r8, r9, [sp, #48]
+ strd r10, r11, [sp, #56]
+ add r1, sp, #0x40
+ add r0, sp, #0x40
+ bl fe_sq
+ # Add
+ ldrd r4, r5, [sp, #96]
+ ldrd r6, r7, [sp, #104]
+ ldrd r8, r9, [sp, #32]
+ ldrd r10, r11, [sp, #40]
+ adds r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ adcs r11, r7, r11
+ strd r8, r9, [sp, #96]
+ strd r10, r11, [sp, #104]
+ ldrd r4, r5, [sp, #112]
+ ldrd r6, r7, [sp, #120]
+ ldrd r8, r9, [sp, #48]
+ ldrd r10, r11, [sp, #56]
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ adc r11, r7, r11
+ mov r3, #-19
+ asr r2, r11, #31
+ # Mask the modulus
+ and r3, r2, r3
+ and r12, r2, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldrd r4, r5, [sp, #96]
+ ldrd r6, r7, [sp, #104]
+ subs r4, r4, r3
+ sbcs r5, r5, r2
+ sbcs r6, r6, r2
+ sbcs r7, r7, r2
+ sbcs r8, r8, r2
+ sbcs r9, r9, r2
+ sbcs r10, r10, r2
+ sbc r11, r11, r12
+ strd r4, r5, [sp, #96]
+ strd r6, r7, [sp, #104]
+ strd r8, r9, [sp, #112]
+ strd r10, r11, [sp, #120]
+ add r2, sp, #0
+ ldr r1, [sp, #168]
+ add r0, sp, #32
+ bl fe_mul
+ add r2, sp, #0x60
+ add r1, sp, #0x80
+ add r0, sp, #0
+ bl fe_mul
+ ldr r2, [sp, #176]
+ ldr r1, [sp, #180]
+ subs r1, r1, #1
+ str r1, [sp, #180]
+ bge L_curve25519_bits
+ mov r1, #31
+ str r1, [sp, #180]
+ subs r2, r2, #4
+ str r2, [sp, #176]
+ bge L_curve25519_words
+ # Invert
+ add r0, sp, #32
+ add r1, sp, #0
+ bl fe_sq
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ add r0, sp, #0x40
+ add r1, sp, #0
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #32
+ add r1, sp, #32
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #32
+ bl fe_sq
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ add r2, sp, #0x60
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #0x40
+ bl fe_sq
+ mov r4, #4
+L_curve25519_inv_1:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_1
+ add r0, sp, #0x40
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #0x40
+ bl fe_sq
+ mov r4, #9
+L_curve25519_inv_2:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_2
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x80
+ add r1, sp, #0x60
+ bl fe_sq
+ mov r4, #19
+L_curve25519_inv_3:
+ add r0, sp, #0x80
+ add r1, sp, #0x80
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_3
+ add r0, sp, #0x60
+ add r1, sp, #0x80
+ add r2, sp, #0x60
+ bl fe_mul
+ mov r4, #10
+L_curve25519_inv_4:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_4
+ add r0, sp, #0x40
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x60
+ add r1, sp, #0x40
+ bl fe_sq
+ mov r4, #49
+L_curve25519_inv_5:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_5
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ add r0, sp, #0x80
+ add r1, sp, #0x60
+ bl fe_sq
+ mov r4, #0x63
+L_curve25519_inv_6:
+ add r0, sp, #0x80
+ add r1, sp, #0x80
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_6
+ add r0, sp, #0x60
+ add r1, sp, #0x80
+ add r2, sp, #0x60
+ bl fe_mul
+ mov r4, #50
+L_curve25519_inv_7:
+ add r0, sp, #0x60
+ add r1, sp, #0x60
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_7
+ add r0, sp, #0x40
+ add r1, sp, #0x60
+ add r2, sp, #0x40
+ bl fe_mul
+ mov r4, #5
+L_curve25519_inv_8:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_curve25519_inv_8
+ add r0, sp, #0
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ add r2, sp, #0
+ ldr r1, [sp, #160]
+ ldr r0, [sp, #160]
+ bl fe_mul
+ mov r0, #0
+ add sp, sp, #0xbc
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size curve25519,.-curve25519
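+	# fe_pow22523(r, a) computes r = a^((p-5)/8) = a^(2^252 - 3) mod
+	# p, the power at the heart of the combined square-root
+	# computation used for Ed25519 point decompression.  The squaring
+	# chain mirrors fe_invert up to a^(2^250 - 1), then finishes with
+	# two squarings and one multiplication by a.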
+ .text
+ .align 2
+ .globl fe_pow22523
+ .type fe_pow22523, %function
+fe_pow22523:
+ push {r4, lr}
+ sub sp, sp, #0x68
+ # pow22523
+ str r0, [sp, #96]
+ str r1, [sp, #100]
+ mov r0, sp
+ ldr r1, [sp, #100]
+ bl fe_sq
+ add r0, sp, #32
+ mov r1, sp
+ bl fe_sq
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ add r0, sp, #32
+ ldr r1, [sp, #100]
+ add r2, sp, #32
+ bl fe_mul
+ mov r0, sp
+ mov r1, sp
+ add r2, sp, #32
+ bl fe_mul
+ mov r0, sp
+ mov r1, sp
+ bl fe_sq
+ mov r0, sp
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ add r0, sp, #32
+ mov r1, sp
+ bl fe_sq
+ mov r4, #4
+L_fe_pow22523_1:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_1
+ mov r0, sp
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ add r0, sp, #32
+ mov r1, sp
+ bl fe_sq
+ mov r4, #9
+L_fe_pow22523_2:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_2
+ add r0, sp, #32
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ mov r4, #19
+L_fe_pow22523_3:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_3
+ add r0, sp, #32
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ mov r4, #10
+L_fe_pow22523_4:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_4
+ mov r0, sp
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ add r0, sp, #32
+ mov r1, sp
+ bl fe_sq
+ mov r4, #49
+L_fe_pow22523_5:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_5
+ add r0, sp, #32
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ add r0, sp, #0x40
+ add r1, sp, #32
+ bl fe_sq
+ mov r4, #0x63
+L_fe_pow22523_6:
+ add r0, sp, #0x40
+ add r1, sp, #0x40
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_6
+ add r0, sp, #32
+ add r1, sp, #0x40
+ add r2, sp, #32
+ bl fe_mul
+ mov r4, #50
+L_fe_pow22523_7:
+ add r0, sp, #32
+ add r1, sp, #32
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_7
+ mov r0, sp
+ add r1, sp, #32
+ mov r2, sp
+ bl fe_mul
+ mov r4, #2
+L_fe_pow22523_8:
+ mov r0, sp
+ mov r1, sp
+ bl fe_sq
+ sub r4, r4, #1
+ cmp r4, #0
+ bne L_fe_pow22523_8
+ ldr r0, [sp, #96]
+ mov r1, sp
+ ldr r2, [sp, #100]
+ bl fe_mul
+ ldr r1, [sp, #100]
+ ldr r0, [sp, #96]
+ add sp, sp, #0x68
+ pop {r4, pc}
+ .size fe_pow22523,.-fe_pow22523
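+	# fe_ge_to_p2 converts an Ed25519 group element from completed
+	# (P1P1) coordinates to projective (P2) form: X3 = X*T,
+	# Y3 = Y*Z, Z3 = Z*T.  r0-r2 carry the output pointers, r3 the
+	# input X, and the remaining input pointers arrive on the
+	# caller's stack.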
+ .text
+ .align 2
+ .globl fe_ge_to_p2
+ .type fe_ge_to_p2, %function
+fe_ge_to_p2:
+ push {lr}
+ sub sp, sp, #16
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r2, [sp, #28]
+ ldr r1, [sp, #12]
+ ldr r0, [sp]
+ bl fe_mul
+ ldr r2, [sp, #24]
+ ldr r1, [sp, #20]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #28]
+ ldr r1, [sp, #24]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ add sp, sp, #16
+ pop {pc}
+ .size fe_ge_to_p2,.-fe_ge_to_p2
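+	# fe_ge_to_p3 converts completed (P1P1) coordinates to the
+	# extended (P3) representation, which also carries T with
+	# X*Y = Z*T: X3 = X*T, Y3 = Y*Z, Z3 = Z*T, T3 = X*Y.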
+ .text
+ .align 2
+ .globl fe_ge_to_p3
+ .type fe_ge_to_p3, %function
+fe_ge_to_p3:
+ push {lr}
+ sub sp, sp, #16
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r2, [sp, #32]
+ ldr r1, [sp, #20]
+ ldr r0, [sp]
+ bl fe_mul
+ ldr r2, [sp, #28]
+ ldr r1, [sp, #24]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #32]
+ ldr r1, [sp, #28]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ ldr r2, [sp, #24]
+ ldr r1, [sp, #20]
+ ldr r0, [sp, #12]
+ bl fe_mul
+ add sp, sp, #16
+ pop {pc}
+ .size fe_ge_to_p3,.-fe_ge_to_p3
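+	# fe_ge_dbl doubles a point and returns it in completed (P1P1)
+	# form, following the usual ref10-style formulas: with XX = X^2,
+	# YY = Y^2 and A = (X+Y)^2, it forms Y3 = YY + XX, Z3 = YY - XX,
+	# X3 = A - Y3 and T3 = 2*Z^2 - Z3 (the 2*Z^2 coming from fe_sq2).
+	# The field adds and subs are inlined between the calls.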
+ .text
+ .align 2
+ .globl fe_ge_dbl
+ .type fe_ge_dbl, %function
+fe_ge_dbl:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #16
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r1, [sp, #52]
+ ldr r0, [sp]
+ bl fe_sq
+ ldr r1, [sp, #56]
+ ldr r0, [sp, #8]
+ bl fe_sq
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #52]
+ ldr r2, [sp, #56]
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ adds r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ adcs r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r1, [sp, #4]
+ ldr r0, [sp, #12]
+ bl fe_sq
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #8]
+ ldr r2, [sp]
+ # Add-Sub
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r2]
+ ldr r6, [r2, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ ldr r5, [r2, #8]
+ ldr r6, [r2, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r2, #16]
+ ldr r6, [r2, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r1, #24]
+ ldr r4, [r1, #28]
+ ldr r5, [r2, #24]
+ ldr r6, [r2, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ ldr r0, [sp]
+ ldr r1, [sp, #12]
+ ldr r2, [sp, #4]
+ # Sub
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r1, [sp, #60]
+ ldr r0, [sp, #12]
+ bl fe_sq2
+ ldr r0, [sp, #12]
+ ldr r1, [sp, #8]
+ # Sub
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ ldr r7, [r1]
+ ldr r8, [r1, #4]
+ ldr r9, [r1, #8]
+ ldr r10, [r1, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ ldr r7, [r1, #16]
+ ldr r8, [r1, #20]
+ ldr r9, [r1, #24]
+ ldr r10, [r1, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ add sp, sp, #16
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_ge_dbl,.-fe_ge_dbl
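+	# fe_ge_madd adds a precomputed point, stored as (y+x, y-x,
+	# 2*d*x*y), to an extended-coordinate point and yields a
+	# completed (P1P1) result.  As in ref10: A = (Y1+X1)*(y+x),
+	# B = (Y1-X1)*(y-x), C = 2*d*x*y*T1 and D = 2*Z1, then
+	# X3 = A-B, Y3 = A+B, Z3 = D+C and T3 = D-C.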
+ .text
+ .align 2
+ .globl fe_ge_madd
+ .type fe_ge_madd, %function
+fe_ge_madd:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #32
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r0, [sp]
+ ldr r1, [sp, #72]
+ ldr r2, [sp, #68]
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ adds r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ adcs r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #72]
+ ldr r2, [sp, #68]
+ # Sub
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r2, [sp, #88]
+ ldr r1, [sp]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ ldr r2, [sp, #92]
+ ldr r1, [sp, #4]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #80]
+ ldr r1, [sp, #84]
+ ldr r0, [sp, #12]
+ bl fe_mul
+ ldr r0, [sp, #4]
+ ldr r1, [sp]
+ ldr r2, [sp, #8]
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ ldr r0, [sp, #8]
+ ldr r1, [sp, #76]
+ # Double
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r1, #16]
+ ldr r8, [r1, #20]
+ ldr r9, [r1, #24]
+ ldr r10, [r1, #28]
+ adds r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ adcs r7, r7, r7
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ adc r10, r10, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #8]
+ ldr r1, [sp, #12]
+ # Add-Sub
+ # Add
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r1]
+ ldr r6, [r1, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ ldr r5, [r1, #16]
+ ldr r6, [r1, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r0, #24]
+ ldr r4, [r0, #28]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ add sp, sp, #32
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_ge_madd,.-fe_ge_madd
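+	# fe_ge_msub is fe_ge_madd with the precomputed point negated:
+	# the (y+x) and (y-x) inputs trade places in the first two
+	# multiplications and the final pair becomes Z3 = D-C, T3 = D+C.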
+ .text
+ .align 2
+ .globl fe_ge_msub
+ .type fe_ge_msub, %function
+fe_ge_msub:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #32
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r0, [sp]
+ ldr r1, [sp, #72]
+ ldr r2, [sp, #68]
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ adds r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ adcs r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #72]
+ ldr r2, [sp, #68]
+ # Sub
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r2, [sp, #92]
+ ldr r1, [sp]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ ldr r2, [sp, #88]
+ ldr r1, [sp, #4]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #80]
+ ldr r1, [sp, #84]
+ ldr r0, [sp, #12]
+ bl fe_mul
+ ldr r0, [sp, #4]
+ ldr r1, [sp]
+ ldr r2, [sp, #8]
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ ldr r0, [sp, #8]
+ ldr r1, [sp, #76]
+ # Double
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r1, #16]
+ ldr r8, [r1, #20]
+ ldr r9, [r1, #24]
+ ldr r10, [r1, #28]
+ adds r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ adcs r7, r7, r7
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ adc r10, r10, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #12]
+ ldr r1, [sp, #8]
+ # Add-Sub
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r1, #24]
+ ldr r4, [r1, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ add sp, sp, #32
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_ge_msub,.-fe_ge_msub
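+	# fe_ge_add is the general point addition (ref10 ge_add): the
+	# second operand is a cached point holding Y+X, Y-X, Z and 2*d*T,
+	# so an extra multiplication Z1*Z2 appears and its double D is
+	# kept in a stack temporary at sp+16 before the closing add/sub
+	# passes form X3 = A-B, Y3 = A+B, Z3 = D+C and T3 = D-C.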
+ .text
+ .align 2
+ .globl fe_ge_add
+ .type fe_ge_add, %function
+fe_ge_add:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0x60
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r0, [sp]
+ ldr r1, [sp, #136]
+ ldr r2, [sp, #132]
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ adds r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ adcs r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #136]
+ ldr r2, [sp, #132]
+ # Sub
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r2, [sp, #156]
+ ldr r1, [sp]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ ldr r2, [sp, #160]
+ ldr r1, [sp, #4]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #144]
+ ldr r1, [sp, #152]
+ ldr r0, [sp, #12]
+ bl fe_mul
+ ldr r2, [sp, #148]
+ ldr r1, [sp, #140]
+ ldr r0, [sp]
+ bl fe_mul
+ add r0, sp, #16
+ ldr r1, [sp]
+ # Double
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r1, #16]
+ ldr r8, [r1, #20]
+ ldr r9, [r1, #24]
+ ldr r10, [r1, #28]
+ adds r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ adcs r7, r7, r7
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ adc r10, r10, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp]
+ ldr r2, [sp, #8]
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ ldr r0, [sp, #8]
+ ldr r1, [sp, #12]
+ add r2, sp, #16
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r1]
+ ldr r6, [r1, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r1, #16]
+ ldr r6, [r1, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ add sp, sp, #0x60
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_ge_add,.-fe_ge_add
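+	# fe_ge_sub subtracts a cached point.  It is fe_ge_add with the
+	# cached Y+X and Y-X exchanged in the first two multiplications
+	# and the closing pair flipped to Z3 = D-C, T3 = D+C.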
+ .text
+ .align 2
+ .globl fe_ge_sub
+ .type fe_ge_sub, %function
+fe_ge_sub:
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ sub sp, sp, #0x60
+ str r0, [sp]
+ str r1, [sp, #4]
+ str r2, [sp, #8]
+ str r3, [sp, #12]
+ ldr r0, [sp]
+ ldr r1, [sp, #136]
+ ldr r2, [sp, #132]
+ # Add
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ adds r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ adcs r7, r3, r7
+ adcs r8, r4, r8
+ adcs r9, r5, r9
+ adc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp, #136]
+ ldr r2, [sp, #132]
+ # Sub
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r2]
+ ldr r8, [r2, #4]
+ ldr r9, [r2, #8]
+ ldr r10, [r2, #12]
+ subs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbcs r10, r6, r10
+ str r7, [r0]
+ str r8, [r0, #4]
+ str r9, [r0, #8]
+ str r10, [r0, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ ldr r5, [r1, #24]
+ ldr r6, [r1, #28]
+ ldr r7, [r2, #16]
+ ldr r8, [r2, #20]
+ ldr r9, [r2, #24]
+ ldr r10, [r2, #28]
+ sbcs r7, r3, r7
+ sbcs r8, r4, r8
+ sbcs r9, r5, r9
+ sbc r10, r6, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ adcs r5, r5, r11
+ adcs r6, r6, r11
+ adcs r7, r7, r11
+ adcs r8, r8, r11
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r2, [sp, #160]
+ ldr r1, [sp]
+ ldr r0, [sp, #8]
+ bl fe_mul
+ ldr r2, [sp, #156]
+ ldr r1, [sp, #4]
+ ldr r0, [sp, #4]
+ bl fe_mul
+ ldr r2, [sp, #144]
+ ldr r1, [sp, #152]
+ ldr r0, [sp, #12]
+ bl fe_mul
+ ldr r2, [sp, #148]
+ ldr r1, [sp, #140]
+ ldr r0, [sp]
+ bl fe_mul
+ add r0, sp, #16
+ ldr r1, [sp]
+ # Double
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ ldr r5, [r1, #8]
+ ldr r6, [r1, #12]
+ ldr r7, [r1, #16]
+ ldr r8, [r1, #20]
+ ldr r9, [r1, #24]
+ ldr r10, [r1, #28]
+ adds r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ adcs r7, r7, r7
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ adc r10, r10, r10
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ sbcs r5, r5, r11
+ sbcs r6, r6, r11
+ sbcs r7, r7, r11
+ sbcs r8, r8, r11
+ sbcs r9, r9, r11
+ sbc r10, r10, lr
+ str r3, [r0]
+ str r4, [r0, #4]
+ str r5, [r0, #8]
+ str r6, [r0, #12]
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ str r9, [r0, #24]
+ str r10, [r0, #28]
+ ldr r0, [sp, #4]
+ ldr r1, [sp]
+ ldr r2, [sp, #8]
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ ldr r0, [sp, #12]
+ ldr r1, [sp, #8]
+ add r2, sp, #16
+ # Add-Sub
+ # Add
+ ldr r3, [r2]
+ ldr r4, [r2, #4]
+ ldr r5, [r0]
+ ldr r6, [r0, #4]
+ adds r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0]
+ str r8, [r0, #4]
+ # Sub
+ subs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1]
+ str r10, [r1, #4]
+ # Add
+ ldr r3, [r2, #8]
+ ldr r4, [r2, #12]
+ ldr r5, [r0, #8]
+ ldr r6, [r0, #12]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #8]
+ str r8, [r0, #12]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #8]
+ str r10, [r1, #12]
+ # Add
+ ldr r3, [r2, #16]
+ ldr r4, [r2, #20]
+ ldr r5, [r0, #16]
+ ldr r6, [r0, #20]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ mov r12, #0
+ adcs r8, r4, r6
+ adc r12, r12, #0
+ str r7, [r0, #16]
+ str r8, [r0, #20]
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ mov lr, #0
+ sbcs r10, r4, r6
+ adc lr, lr, #0
+ str r9, [r1, #16]
+ str r10, [r1, #20]
+ # Add
+ ldr r3, [r2, #24]
+ ldr r4, [r2, #28]
+ ldr r5, [r0, #24]
+ ldr r6, [r0, #28]
+ adds r12, r12, #-1
+ adcs r7, r3, r5
+ adc r8, r4, r6
+ # Sub
+ adds lr, lr, #-1
+ sbcs r9, r3, r5
+ sbc r10, r4, r6
+ mov r12, #-19
+ asr r11, r8, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Sub modulus (if overflow)
+ ldr r3, [r0]
+ ldr r4, [r0, #4]
+ subs r3, r3, r12
+ sbcs r4, r4, r11
+ str r3, [r0]
+ str r4, [r0, #4]
+ ldr r3, [r0, #8]
+ ldr r4, [r0, #12]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #8]
+ str r4, [r0, #12]
+ ldr r3, [r0, #16]
+ ldr r4, [r0, #20]
+ sbcs r3, r3, r11
+ sbcs r4, r4, r11
+ str r3, [r0, #16]
+ str r4, [r0, #20]
+ sbcs r7, r7, r11
+ sbc r8, r8, lr
+ str r7, [r0, #24]
+ str r8, [r0, #28]
+ mov r12, #-19
+ asr r11, r10, #31
+ # Mask the modulus
+ and r12, r11, r12
+ and lr, r11, #0x7fffffff
+ # Add modulus (if underflow)
+ ldr r3, [r1]
+ ldr r4, [r1, #4]
+ adds r3, r3, r12
+ adcs r4, r4, r11
+ str r3, [r1]
+ str r4, [r1, #4]
+ ldr r3, [r1, #8]
+ ldr r4, [r1, #12]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #8]
+ str r4, [r1, #12]
+ ldr r3, [r1, #16]
+ ldr r4, [r1, #20]
+ adcs r3, r3, r11
+ adcs r4, r4, r11
+ str r3, [r1, #16]
+ str r4, [r1, #20]
+ adcs r9, r9, r11
+ adc r10, r10, lr
+ str r9, [r1, #24]
+ str r10, [r1, #28]
+ add sp, sp, #0x60
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .size fe_ge_sub,.-fe_ge_sub
+#endif /* !__aarch64__ */
+#endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.c b/wolfcrypt/src/port/arm/armv8-32-curve25519.c
new file mode 100644
index 0000000..f7ef379
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.c
@@ -0,0 +1,5581 @@
+/* armv8-32-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c
+ */
+
+#ifndef __aarch64__
+
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/fe_operations.h>
+
+void fe_init()
+{
+ __asm__ __volatile__ (
+ "\n\t"
+ :
+ :
+ : "memory"
+ );
+}
+
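+/* Load a 32-byte little-endian value, clearing bit 255 so the result is an
+ * element below 2^255. */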
+void fe_frombytes(fe out, const unsigned char* in)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[in]]\n\t"
+ "ldrd r12, lr, [%[in], #8]\n\t"
+ "ldrd r4, r5, [%[in], #16]\n\t"
+ "ldrd r6, r7, [%[in], #24]\n\t"
+ "and r7, r7, #0x7fffffff\n\t"
+ "strd r2, r3, [%[out]]\n\t"
+ "strd r12, lr, [%[out], #8]\n\t"
+ "strd r4, r5, [%[out], #16]\n\t"
+ "strd r6, r7, [%[out], #24]\n\t"
+ : [out] "+r" (out), [in] "+r" (in)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7"
+ );
+}
+
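+/* Store the canonical form of n: adding 19 carries into bit 255 exactly when
+ * n >= 2^255-19, so conditionally add 19 and clear bit 255. */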
+void fe_tobytes(unsigned char* out, const fe n)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[n]]\n\t"
+ "ldrd r12, lr, [%[n], #8]\n\t"
+ "ldrd r4, r5, [%[n], #16]\n\t"
+ "ldrd r6, r7, [%[n], #24]\n\t"
+ "adds r8, r2, #19\n\t"
+ "adcs r8, r3, #0\n\t"
+ "adcs r8, r12, #0\n\t"
+ "adcs r8, lr, #0\n\t"
+ "adcs r8, r4, #0\n\t"
+ "adcs r8, r5, #0\n\t"
+ "adcs r8, r6, #0\n\t"
+ "adc r8, r7, #0\n\t"
+ "asr r8, r8, #31\n\t"
+ "and r8, r8, #19\n\t"
+ "adds r2, r2, r8\n\t"
+ "adcs r3, r3, #0\n\t"
+ "adcs r12, r12, #0\n\t"
+ "adcs lr, lr, #0\n\t"
+ "adcs r4, r4, #0\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adc r7, r7, #0\n\t"
+ "and r7, r7, #0x7fffffff\n\t"
+ "strd r2, r3, [%[out]]\n\t"
+ "strd r12, lr, [%[out], #8]\n\t"
+ "strd r4, r5, [%[out], #16]\n\t"
+ "strd r6, r7, [%[out], #24]\n\t"
+ : [out] "+r" (out), [n] "+r" (n)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+ );
+}
+
+void fe_1(fe n)
+{
+ __asm__ __volatile__ (
+ /* Set one */
+ "mov r2, #1\n\t"
+ "mov r1, #0\n\t"
+ "strd r2, r1, [%[n]]\n\t"
+ "strd r1, r1, [%[n], #8]\n\t"
+ "strd r1, r1, [%[n], #16]\n\t"
+ "strd r1, r1, [%[n], #24]\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "r1", "r2"
+ );
+}
+
+void fe_0(fe n)
+{
+ __asm__ __volatile__ (
+ /* Set zero */
+ "mov r1, #0\n\t"
+ "strd r1, r1, [%[n]]\n\t"
+ "strd r1, r1, [%[n], #8]\n\t"
+ "strd r1, r1, [%[n], #16]\n\t"
+ "strd r1, r1, [%[n], #24]\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "r1"
+ );
+}
+
+void fe_copy(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ /* Copy */
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "strd r2, r3, [%[r]]\n\t"
+ "strd r12, lr, [%[r], #8]\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "strd r2, r3, [%[r], #16]\n\t"
+ "strd r12, lr, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr"
+ );
+}
+
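+/* r = a - b: a borrow out of the top word sign-extends into the mask, and
+ * 2^255-19 is added back exactly when the subtraction went negative. */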
+void fe_sub(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ /* Sub */
+ "ldrd r12, lr, [%[a]]\n\t"
+ "ldrd r4, r5, [%[a], #8]\n\t"
+ "ldrd r6, r7, [%[b]]\n\t"
+ "ldrd r8, r9, [%[b], #8]\n\t"
+ "subs r6, r12, r6\n\t"
+ "sbcs r7, lr, r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "strd r6, r7, [%[r]]\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ "ldrd r12, lr, [%[a], #16]\n\t"
+ "ldrd r4, r5, [%[a], #24]\n\t"
+ "ldrd r6, r7, [%[b], #16]\n\t"
+ "ldrd r8, r9, [%[b], #24]\n\t"
+ "sbcs r6, r12, r6\n\t"
+ "sbcs r7, lr, r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbc r9, r5, r9\n\t"
+ "mov r10, #-19\n\t"
+ "asr r3, r9, #31\n\t"
+ /* Mask the modulus */
+ "and r10, r3, r10\n\t"
+ "and r11, r3, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r12, lr, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "adds r12, r12, r10\n\t"
+ "adcs lr, lr, r3\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adcs r7, r7, r3\n\t"
+ "adcs r8, r8, r3\n\t"
+ "adc r9, r9, r11\n\t"
+ "strd r12, lr, [%[r]]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [%[r], #16]\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+void fe_add(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ /* Add */
+ "ldrd r12, lr, [%[a]]\n\t"
+ "ldrd r4, r5, [%[a], #8]\n\t"
+ "ldrd r6, r7, [%[b]]\n\t"
+ "ldrd r8, r9, [%[b], #8]\n\t"
+ "adds r6, r12, r6\n\t"
+ "adcs r7, lr, r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "strd r6, r7, [%[r]]\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ "ldrd r12, lr, [%[a], #16]\n\t"
+ "ldrd r4, r5, [%[a], #24]\n\t"
+ "ldrd r6, r7, [%[b], #16]\n\t"
+ "ldrd r8, r9, [%[b], #24]\n\t"
+ "adcs r6, r12, r6\n\t"
+ "adcs r7, lr, r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adc r9, r5, r9\n\t"
+ "mov r10, #-19\n\t"
+ "asr r3, r9, #31\n\t"
+ /* Mask the modulus */
+ "and r10, r3, r10\n\t"
+ "and r11, r3, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r12, lr, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "subs r12, r12, r10\n\t"
+ "sbcs lr, lr, r3\n\t"
+ "sbcs r4, r4, r3\n\t"
+ "sbcs r5, r5, r3\n\t"
+ "sbcs r6, r6, r3\n\t"
+ "sbcs r7, r7, r3\n\t"
+ "sbcs r8, r8, r3\n\t"
+ "sbc r9, r9, r11\n\t"
+ "strd r12, lr, [%[r]]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [%[r], #16]\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+void fe_neg(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "mov r5, #-1\n\t"
+ "mov r4, #-19\n\t"
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "subs r2, r4, r2\n\t"
+ "sbcs r3, r5, r3\n\t"
+ "sbcs r12, r5, r12\n\t"
+ "sbcs lr, r5, lr\n\t"
+ "strd r2, r3, [%[r]]\n\t"
+ "strd r12, lr, [%[r], #8]\n\t"
+ "mov r4, #0x7fffffff\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "sbcs r2, r5, r2\n\t"
+ "sbcs r3, r5, r3\n\t"
+ "sbcs r12, r5, r12\n\t"
+ "sbc lr, r4, lr\n\t"
+ "strd r2, r3, [%[r], #16]\n\t"
+ "strd r12, lr, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5"
+ );
+}
+
+int fe_isnonzero(const fe a)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "ldrd r4, r5, [%[a], #16]\n\t"
+ "ldrd r6, r7, [%[a], #24]\n\t"
+ "adds r1, r2, #19\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ "adcs r1, lr, #0\n\t"
+ "adcs r1, r4, #0\n\t"
+ "adcs r1, r5, #0\n\t"
+ "adcs r1, r6, #0\n\t"
+ "adc r1, r7, #0\n\t"
+ "asr r1, r1, #31\n\t"
+ "and r1, r1, #19\n\t"
+ "adds r2, r2, r1\n\t"
+ "adcs r3, r3, #0\n\t"
+ "adcs r12, r12, #0\n\t"
+ "adcs lr, lr, #0\n\t"
+ "adcs r4, r4, #0\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adc r7, r7, #0\n\t"
+ "and r7, r7, #0x7fffffff\n\t"
+ "orr r2, r2, r3\n\t"
+ "orr r12, r12, lr\n\t"
+ "orr r4, r4, r5\n\t"
+ "orr r6, r6, r7\n\t"
+ "orr r12, r12, r4\n\t"
+ "orr r2, r2, r6\n\t"
+ "orr %[a], r2, r12\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "r1", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
+ );
+ return (uint32_t)(size_t)a;
+}
+
+int fe_isnegative(const fe a)
+{
+ __asm__ __volatile__ (
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r12, lr, [%[a], #8]\n\t"
+ "adds r1, r2, #19\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ "adcs r1, lr, #0\n\t"
+ "ldrd r2, r3, [%[a], #16]\n\t"
+ "ldrd r12, lr, [%[a], #24]\n\t"
+ "adcs r1, r2, #0\n\t"
+ "adcs r1, r3, #0\n\t"
+ "adcs r1, r12, #0\n\t"
+ "ldr r2, [%[a]]\n\t"
+ "adc r1, lr, #0\n\t"
+ "and %[a], r2, #1\n\t"
+ "lsr r1, r1, #31\n\t"
+ "eor %[a], %[a], r1\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "r1", "r2", "r3", "r12", "lr"
+ );
+ return (uint32_t)(size_t)a;
+}
+
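+/* Constant-time lookup of entry |b| (1..8) from the 8-entry table at base
+ * (0x60 bytes per entry, three 32-byte fields); b == 0 yields the neutral
+ * (1, 1, 0). For negative b the first two fields are swapped and the third is
+ * negated mod 2^255-19. Every entry is read, so the memory access pattern is
+ * independent of b. */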
+void fe_cmov_table(fe* r, fe* base, signed char b)
+{
+ __asm__ __volatile__ (
+ "sxtb %[b], %[b]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #1\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #1\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base]]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #32]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #64]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
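+ /* Rewind to the first table entry (the 8 passes advanced base by 7 * 0x60);
+  * the next group reads the following 8-byte slice of each field */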
+ "mov r8, #-19\n\t"
+ "mov r9, #-1\n\t"
+ "subs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r]]\n\t"
+ "strd lr, r4, [%[r], #32]\n\t"
+ "strd r5, r6, [%[r], #64]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #8]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #40]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #72]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #-1\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #8]\n\t"
+ "strd lr, r4, [%[r], #40]\n\t"
+ "strd r5, r6, [%[r], #72]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #16]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #48]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #80]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #-1\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbcs r9, r9, r6\n\t"
+ "sbc r11, r11, r11\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #16]\n\t"
+ "strd lr, r4, [%[r], #48]\n\t"
+ "strd r5, r6, [%[r], #80]\n\t"
+ "sbfx r7, %[b], #7, #1\n\t"
+ "eor r10, %[b], r7\n\t"
+ "sub r10, r10, r7\n\t"
+ "mov r3, #0\n\t"
+ "mov r12, #0\n\t"
+ "mov lr, #0\n\t"
+ "mov r4, #0\n\t"
+ "mov r5, #0\n\t"
+ "mov r6, #0\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #31\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #30\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #29\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #28\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #27\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #26\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #25\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "add %[base], %[base], #0x60\n\t"
+ "mov r7, #0x80000000\n\t"
+ "ror r7, r7, #24\n\t"
+ "ror r7, r7, r10\n\t"
+ "asr r7, r7, #31\n\t"
+ "ldrd r8, r9, [%[base], #24]\n\t"
+ "eor r8, r8, r3\n\t"
+ "eor r9, r9, r12\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r3, r3, r8\n\t"
+ "eor r12, r12, r9\n\t"
+ "ldrd r8, r9, [%[base], #56]\n\t"
+ "eor r8, r8, lr\n\t"
+ "eor r9, r9, r4\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor lr, lr, r8\n\t"
+ "eor r4, r4, r9\n\t"
+ "ldrd r8, r9, [%[base], #88]\n\t"
+ "eor r8, r8, r5\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r8, r8, r7\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r6, r6, r9\n\t"
+ "sub %[base], %[base], #0x2a0\n\t"
+ "mov r8, #-1\n\t"
+ "mov r9, #0x7fffffff\n\t"
+ "rsbs r11, r11, #0\n\t"
+ "sbcs r8, r8, r5\n\t"
+ "sbc r9, r9, r6\n\t"
+ "asr r10, %[b], #31\n\t"
+ "eor r7, r3, lr\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r3, r3, r7\n\t"
+ "eor lr, lr, r7\n\t"
+ "eor r7, r12, r4\n\t"
+ "and r7, r7, r10\n\t"
+ "eor r12, r12, r7\n\t"
+ "eor r4, r4, r7\n\t"
+ "eor r8, r8, r5\n\t"
+ "and r8, r8, r10\n\t"
+ "eor r5, r5, r8\n\t"
+ "eor r9, r9, r6\n\t"
+ "and r9, r9, r10\n\t"
+ "eor r6, r6, r9\n\t"
+ "strd r3, r12, [%[r], #24]\n\t"
+ "strd lr, r4, [%[r], #56]\n\t"
+ "strd r5, r6, [%[r], #88]\n\t"
+ : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
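+/* Schoolbook 8x8 32-bit multiply into a 64-byte stack temporary, followed by
+ * reduction mod 2^255-19. */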
+void fe_mul(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x40\n\t"
+ /* Multiply */
+ "ldr r7, [%[a]]\n\t"
+ "ldr r8, [%[a], #4]\n\t"
+ "ldr r9, [%[b]]\n\t"
+ "ldr lr, [%[b], #4]\n\t"
+ /* A[0] * B[0] = 0 */
+ "umull r4, r5, r7, r9\n\t"
+ "str r4, [sp]\n\t"
+ /* A[0] * B[1] = 1 */
+ "umull r3, r6, r7, lr\n\t"
+ "adds r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * B[0] = 1 */
+ "umull r3, r12, r8, r9\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #4]\n\t"
+ /* A[2] * B[0] = 2 */
+ "ldr r10, [%[a], #8]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r6, r6, r3\n\t"
+ "adc r4, r4, r12\n\t"
+ /* A[1] * B[1] = 2 */
+ "umull r3, r12, r8, lr\n\t"
+ "adds r6, r6, r3\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[0] * B[2] = 2 */
+ "ldr r11, [%[b], #8]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #8]\n\t"
+ /* A[0] * B[3] = 3 */
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * B[2] = 3 */
+ "ldr r11, [%[b], #8]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[2] * B[1] = 3 */
+ "umull r3, r12, r10, lr\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * B[0] = 3 */
+ "ldr r10, [%[a], #12]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #12]\n\t"
+ /* A[4] * B[0] = 4 */
+ "ldr r10, [%[a], #16]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * B[1] = 4 */
+ "ldr r10, [%[a], #12]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[2] * B[2] = 4 */
+ "ldr r10, [%[a], #8]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * B[3] = 4 */
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[0] * B[4] = 4 */
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #16]\n\t"
+ /* A[0] * B[5] = 5 */
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * B[4] = 5 */
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * B[3] = 5 */
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[3] * B[2] = 5 */
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r11, [%[b], #8]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[4] * B[1] = 5 */
+ "ldr r10, [%[a], #16]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * B[0] = 5 */
+ "ldr r10, [%[a], #20]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #20]\n\t"
+ /* A[6] * B[0] = 6 */
+ "ldr r10, [%[a], #24]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r4, r4, r3\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * B[1] = 6 */
+ "ldr r10, [%[a], #20]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[4] * B[2] = 6 */
+ "ldr r10, [%[a], #16]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * B[3] = 6 */
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[2] * B[4] = 6 */
+ "ldr r10, [%[a], #8]\n\t"
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * B[5] = 6 */
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[0] * B[6] = 6 */
+ "ldr r11, [%[b], #24]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #24]\n\t"
+ /* A[0] * B[7] = 7 */
+ "ldr r11, [%[b], #28]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * B[6] = 7 */
+ "ldr r11, [%[b], #24]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[2] * B[5] = 7 */
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * B[4] = 7 */
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[4] * B[3] = 7 */
+ "ldr r10, [%[a], #16]\n\t"
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[5] * B[2] = 7 */
+ "ldr r10, [%[a], #20]\n\t"
+ "ldr r11, [%[b], #8]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[6] * B[1] = 7 */
+ "ldr r10, [%[a], #24]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[7] * B[0] = 7 */
+ "ldr r10, [%[a], #28]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #28]\n\t"
+ "ldr r7, [%[a], #24]\n\t"
+ "ldr r9, [%[b], #24]\n\t"
+ /* A[7] * B[1] = 8 */
+ "umull r3, r12, r10, lr\n\t"
+ "adds r6, r6, r3\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[6] * B[2] = 8 */
+ "umull r3, r12, r7, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * B[3] = 8 */
+ "ldr r10, [%[a], #20]\n\t"
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[4] * B[4] = 8 */
+ "ldr r10, [%[a], #16]\n\t"
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[3] * B[5] = 8 */
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * B[6] = 8 */
+ "ldr r10, [%[a], #8]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * B[7] = 8 */
+ "ldr r11, [%[b], #28]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #32]\n\t"
+ "ldr r8, [%[a], #28]\n\t"
+ "mov lr, r11\n\t"
+ /* A[2] * B[7] = 9 */
+ "umull r3, r12, r10, lr\n\t"
+ "adds r4, r4, r3\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * B[6] = 9 */
+ "ldr r10, [%[a], #12]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[4] * B[5] = 9 */
+ "ldr r10, [%[a], #16]\n\t"
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * B[4] = 9 */
+ "ldr r10, [%[a], #20]\n\t"
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[6] * B[3] = 9 */
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[7] * B[2] = 9 */
+ "ldr r11, [%[b], #8]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #36]\n\t"
+ /* A[7] * B[3] = 10 */
+ "ldr r11, [%[b], #12]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[6] * B[4] = 10 */
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r7, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[5] * B[5] = 10 */
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r10, r11\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[4] * B[6] = 10 */
+ "ldr r10, [%[a], #16]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * B[7] = 10 */
+ "ldr r10, [%[a], #12]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #40]\n\t"
+ /* A[4] * B[7] = 11 */
+ "ldr r10, [%[a], #16]\n\t"
+ "umull r3, r12, r10, lr\n\t"
+ "adds r6, r6, r3\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * B[6] = 11 */
+ "ldr r10, [%[a], #20]\n\t"
+ "umull r3, r12, r10, r9\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[6] * B[5] = 11 */
+ "umull r3, r12, r7, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[7] * B[4] = 11 */
+ "ldr r11, [%[b], #16]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r6, r6, r3\n\t"
+ "adcs r4, r4, r12\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #44]\n\t"
+ /* A[7] * B[5] = 12 */
+ "ldr r11, [%[b], #20]\n\t"
+ "umull r3, r12, r8, r11\n\t"
+ "adds r4, r4, r3\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[6] * B[6] = 12 */
+ "umull r3, r12, r7, r9\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * B[7] = 12 */
+ "umull r3, r12, r10, lr\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #48]\n\t"
+ /* A[6] * B[7] = 13 */
+ "umull r3, r12, r7, lr\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[7] * B[6] = 13 */
+ "umull r3, r12, r8, r9\n\t"
+ "adds r5, r5, r3\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #52]\n\t"
+ /* A[7] * B[7] = 14 */
+ "umull r3, r12, r8, lr\n\t"
+ "adds r6, r6, r3\n\t"
+ "adc r4, r4, r12\n\t"
+ "str r6, [sp, #56]\n\t"
+ "str r4, [sp, #60]\n\t"
+ /* Reduce */
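+ /* 2^255 = 19 (mod p), so fold (product >> 255) * 19 into the low 255 bits */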
+ /* Load bottom half */
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "ldrd r8, r9, [sp, #16]\n\t"
+ "ldrd r10, r11, [sp, #24]\n\t"
+ "lsr r3, r11, #31\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "mov lr, #19\n\t"
+ "ldr %[a], [sp, #32]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "adds r4, r4, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r5, r5, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #36]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r5, r5, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r6, r6, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #40]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r6, r6, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r7, r7, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #44]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r7, r7, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r8, r8, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #48]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r8, r8, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r9, r9, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #52]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r9, r9, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r10, r10, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #56]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "add r12, r12, %[b]\n\t"
+ "adds r10, r10, r3\n\t"
+ "mov %[b], #0\n\t"
+ "adcs r11, r11, r12\n\t"
+ "adc %[b], %[b], #0\n\t"
+ "lsr r3, %[a], #31\n\t"
+ "ldr %[a], [sp, #60]\n\t"
+ "orr r3, r3, %[a], lsl #1\n\t"
+ "umull r3, r12, lr, r3\n\t"
+ "adds r11, r11, r3\n\t"
+ "adc r3, r12, %[b]\n\t"
+ /* Overflow */
+ "lsl r3, r3, #1\n\t"
+ "orr r3, r3, r11, lsr #31\n\t"
+ "mul r3, r3, lr\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Reduce if top bit set */
+ "asr r3, r11, #31\n\t"
+ "and r3, r3, lr\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Store */
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [%[r], #8]\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ "strd r10, r11, [%[r], #24]\n\t"
+ "add sp, sp, #0x40\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
+
+void fe_sq(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x40\n\t"
+ /* Square */
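+ /* Each cross product A[i]*A[j] (i < j) is accumulated twice; squares once */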
+ "ldr r7, [%[a]]\n\t"
+ "ldr r8, [%[a], #4]\n\t"
+ "ldr r9, [%[a], #8]\n\t"
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r12, [%[a], #16]\n\t"
+ /* A[0] * A[0] = 0 */
+ "umull r4, r5, r7, r7\n\t"
+ "str r4, [sp]\n\t"
+ /* A[0] * A[1] = 1 */
+ "umull r2, r3, r7, r8\n\t"
+ "mov r6, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adc r6, r6, r3\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #4]\n\t"
+ /* A[1] * A[1] = 2 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ /* A[0] * A[2] = 2 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #8]\n\t"
+ /* A[0] * A[3] = 3 */
+ "umull r2, r3, r7, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "adc r5, r5, r3\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[2] = 3 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #12]\n\t"
+ /* A[2] * A[2] = 4 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[3] = 4 */
+ "umull r2, r3, r8, r10\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[0] * A[4] = 4 */
+ "umull r2, r3, r7, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #16]\n\t"
+ /* A[0] * A[5] = 5 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[4] = 5 */
+ "umull r2, r3, r8, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[3] = 5 */
+ "umull r2, r3, r9, r10\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #20]\n\t"
+ /* A[3] * A[3] = 6 */
+ "umull r2, r3, r10, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[2] * A[4] = 6 */
+ "umull r2, r3, r9, r12\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[5] = 6 */
+ "umull r2, r3, r8, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[0] * A[6] = 6 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #24]\n\t"
+ /* A[0] * A[7] = 7 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[6] = 7 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[2] * A[5] = 7 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[4] = 7 */
+ "umull r2, r3, r10, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #28]\n\t"
+ /* A[4] * A[4] = 8 */
+ "umull r2, r3, r12, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[3] * A[5] = 8 */
+ "umull r2, r3, r10, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[6] = 8 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[7] = 8 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #32]\n\t"
+ "ldr r7, [%[a], #20]\n\t"
+ /* A[2] * A[7] = 9 */
+ "umull r2, r3, r9, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * A[6] = 9 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[4] * A[5] = 9 */
+ "umull r2, r3, r12, r7\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #36]\n\t"
+ "mov r8, r11\n\t"
+ /* A[5] * A[5] = 10 */
+ "umull r2, r3, r7, r7\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[4] * A[6] = 10 */
+ "umull r2, r3, r12, r8\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[7] = 10 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #40]\n\t"
+ "mov r9, r11\n\t"
+ /* A[4] * A[7] = 11 */
+ "umull r2, r3, r12, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * A[6] = 11 */
+ "umull r2, r3, r7, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #44]\n\t"
+ /* A[6] * A[6] = 12 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * A[7] = 12 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #48]\n\t"
+ /* A[6] * A[7] = 13 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #52]\n\t"
+ /* A[7] * A[7] = 14 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ "str r6, [sp, #56]\n\t"
+ "str r4, [sp, #60]\n\t"
+ /* Reduce */
+ /* Load bottom half */
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "ldrd r8, r9, [sp, #16]\n\t"
+ "ldrd r10, r11, [sp, #24]\n\t"
+ "lsr r2, r11, #31\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "mov r12, #19\n\t"
+ "ldr %[a], [sp, #32]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #36]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #40]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r7, r7, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #44]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r7, r7, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r8, r8, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #48]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r8, r8, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r9, r9, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #52]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r9, r9, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r10, r10, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #56]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r10, r10, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r11, r11, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #31\n\t"
+ "ldr %[a], [sp, #60]\n\t"
+ "orr r2, r2, %[a], lsl #1\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r11, r11, r2\n\t"
+ "adc r2, r3, lr\n\t"
+ /* Overflow */
+ "lsl r2, r2, #1\n\t"
+ "orr r2, r2, r11, lsr #31\n\t"
+ "mul r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Reduce if top bit set */
+ "asr r2, r11, #31\n\t"
+ "and r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Store */
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [%[r], #8]\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ "strd r10, r11, [%[r], #24]\n\t"
+ "add sp, sp, #0x40\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
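+
+/* Reference sketch (illustrative only, not the generated code; assumes
+ * <stdint.h>): the "Reduce" block above folds the 512-bit square
+ * t[0..15] back to 256 bits using 2^255 == 19 (mod p = 2^255 - 19),
+ * i.e. t == (t mod 2^255) + 19 * (t >> 255).  Each 32-bit word of
+ * t >> 255 straddles two stored words, hence the lsr/orr pairs; the
+ * top bit of t[15] is assumed clear, which holds for squares of
+ * 255-bit inputs. */
+static void fe_reduce_sketch(uint32_t r[8], const uint32_t t[16])
+{
+    uint64_t c = 0;
+    uint32_t top;
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        uint32_t lo = (i < 7) ? t[i] : (t[7] & 0x7fffffffU);
+        uint32_t hi = (t[7 + i] >> 31) | (t[8 + i] << 1);
+        c += (uint64_t)lo + (uint64_t)19 * hi;  /* the umull/adds/adcs chain */
+        r[i] = (uint32_t)c;
+        c >>= 32;
+    }
+    /* "Overflow": fold the leftover carry and bit 255 once more */
+    top = ((uint32_t)c << 1) | (r[7] >> 31);
+    r[7] &= 0x7fffffffU;
+    c = (uint64_t)19 * top;
+    for (i = 0; i < 8; i++) {
+        c += r[i]; r[i] = (uint32_t)c; c >>= 32;
+    }
+    /* "Reduce if top bit set": a final single-bit fold */
+    c = (uint64_t)19 * (r[7] >> 31);
+    r[7] &= 0x7fffffffU;
+    for (i = 0; i < 8; i++) {
+        c += r[i]; r[i] = (uint32_t)c; c >>= 32;
+    }
+}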
+
+void fe_mul121666(fe r, fe a)
+{
+ __asm__ __volatile__ (
+ /* Multiply by 121666 */
+ "ldrd r2, r3, [%[a]]\n\t"
+ "ldrd r4, r5, [%[a], #8]\n\t"
+ "ldrd r6, r7, [%[a], #16]\n\t"
+ "ldrd r8, r9, [%[a], #24]\n\t"
+ "movw lr, #0xdb42\n\t"
+ "movt lr, #1\n\t"
+ "umull r2, r10, r2, lr\n\t"
+ "umull r3, r12, r3, lr\n\t"
+ "adds r3, r3, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r4, r12, r4, lr\n\t"
+ "adds r4, r4, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r5, r12, r5, lr\n\t"
+ "adds r5, r5, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r6, r12, r6, lr\n\t"
+ "adds r6, r6, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r7, r12, r7, lr\n\t"
+ "adds r7, r7, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r8, r12, r8, lr\n\t"
+ "adds r8, r8, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "umull r9, r12, r9, lr\n\t"
+ "adds r9, r9, r10\n\t"
+ "adc r10, r12, #0\n\t"
+ "mov lr, #19\n\t"
+ "lsl r10, r10, #1\n\t"
+ "orr r10, r10, r9, lsr #31\n\t"
+ "mul r10, r10, lr\n\t"
+ "and r9, r9, #0x7fffffff\n\t"
+ "adds r2, r2, r10\n\t"
+ "adcs r3, r3, #0\n\t"
+ "adcs r4, r4, #0\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adc r9, r9, #0\n\t"
+ "strd r2, r3, [%[r]]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [%[r], #16]\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+ );
+}
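+
+/* Reference sketch (illustrative only; assumes <stdint.h>): the scalar
+ * multiply above is one carry chain of 32x32->64 products by
+ * 121666 = 0x1db42 (the movw/movt pair), followed by a single fold of
+ * everything above bit 255 back in via 2^255 == 19 (mod p). */
+static void fe_mul121666_sketch(uint32_t r[8], const uint32_t a[8])
+{
+    uint64_t c = 0;
+    uint32_t top;
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        c += (uint64_t)a[i] * 121666;  /* one umull plus carry in the asm */
+        r[i] = (uint32_t)c;
+        c >>= 32;
+    }
+    /* fold (carry : bit 255) * 19 back into the low words */
+    top = ((uint32_t)c << 1) | (r[7] >> 31);
+    r[7] &= 0x7fffffffU;
+    c = (uint64_t)19 * top;
+    for (i = 0; i < 8; i++) {
+        c += r[i]; r[i] = (uint32_t)c; c >>= 32;
+    }
+}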
+
+void fe_sq2(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x40\n\t"
+ /* Square * 2 */
+ "ldr r7, [%[a]]\n\t"
+ "ldr r8, [%[a], #4]\n\t"
+ "ldr r9, [%[a], #8]\n\t"
+ "ldr r10, [%[a], #12]\n\t"
+ "ldr r12, [%[a], #16]\n\t"
+ /* A[0] * A[0] = 0 */
+ "umull r4, r5, r7, r7\n\t"
+ "str r4, [sp]\n\t"
+ /* A[0] * A[1] = 1 */
+ "umull r2, r3, r7, r8\n\t"
+ "mov r6, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adc r6, r6, r3\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #4]\n\t"
+ /* A[1] * A[1] = 2 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ /* A[0] * A[2] = 2 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #8]\n\t"
+ /* A[0] * A[3] = 3 */
+ "umull r2, r3, r7, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "adc r5, r5, r3\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[2] = 3 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #12]\n\t"
+ /* A[2] * A[2] = 4 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[3] = 4 */
+ "umull r2, r3, r8, r10\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[0] * A[4] = 4 */
+ "umull r2, r3, r7, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #16]\n\t"
+ /* A[0] * A[5] = 5 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[4] = 5 */
+ "umull r2, r3, r8, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[3] = 5 */
+ "umull r2, r3, r9, r10\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #20]\n\t"
+ /* A[3] * A[3] = 6 */
+ "umull r2, r3, r10, r10\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[2] * A[4] = 6 */
+ "umull r2, r3, r9, r12\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[1] * A[5] = 6 */
+ "umull r2, r3, r8, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[0] * A[6] = 6 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #24]\n\t"
+ /* A[0] * A[7] = 7 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r7, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[1] * A[6] = 7 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[2] * A[5] = 7 */
+ "ldr r11, [%[a], #20]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[4] = 7 */
+ "umull r2, r3, r10, r12\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #28]\n\t"
+ /* A[4] * A[4] = 8 */
+ "umull r2, r3, r12, r12\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[3] * A[5] = 8 */
+ "umull r2, r3, r10, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[2] * A[6] = 8 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r9, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[1] * A[7] = 8 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r8, r11\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #32]\n\t"
+ "ldr r7, [%[a], #20]\n\t"
+ /* A[2] * A[7] = 9 */
+ "umull r2, r3, r9, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[3] * A[6] = 9 */
+ "ldr r11, [%[a], #24]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[4] * A[5] = 9 */
+ "umull r2, r3, r12, r7\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #36]\n\t"
+ "mov r8, r11\n\t"
+ /* A[5] * A[5] = 10 */
+ "umull r2, r3, r7, r7\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[4] * A[6] = 10 */
+ "umull r2, r3, r12, r8\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ /* A[3] * A[7] = 10 */
+ "ldr r11, [%[a], #28]\n\t"
+ "umull r2, r3, r10, r11\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #40]\n\t"
+ "mov r9, r11\n\t"
+ /* A[4] * A[7] = 11 */
+ "umull r2, r3, r12, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov r5, #0\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ /* A[5] * A[6] = 11 */
+ "umull r2, r3, r7, r8\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "adds r6, r6, r2\n\t"
+ "adcs r4, r4, r3\n\t"
+ "adc r5, r5, #0\n\t"
+ "str r6, [sp, #44]\n\t"
+ /* A[6] * A[6] = 12 */
+ "umull r2, r3, r8, r8\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov r6, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ /* A[5] * A[7] = 12 */
+ "umull r2, r3, r7, r9\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc r6, r6, #0\n\t"
+ "str r4, [sp, #48]\n\t"
+ /* A[6] * A[7] = 13 */
+ "umull r2, r3, r8, r9\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov r4, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "adds r5, r5, r2\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc r4, r4, #0\n\t"
+ "str r5, [sp, #52]\n\t"
+ /* A[7] * A[7] = 14 */
+ "umull r2, r3, r9, r9\n\t"
+ "adds r6, r6, r2\n\t"
+ "adc r4, r4, r3\n\t"
+ "str r6, [sp, #56]\n\t"
+ "str r4, [sp, #60]\n\t"
+ /* Double and Reduce */
+ /* Load bottom half */
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "ldrd r8, r9, [sp, #16]\n\t"
+ "ldrd r10, r11, [sp, #24]\n\t"
+ "lsr r2, r11, #30\n\t"
+ "lsl r11, r11, #1\n\t"
+ "orr r11, r11, r10, lsr #31\n\t"
+ "lsl r10, r10, #1\n\t"
+ "orr r10, r10, r9, lsr #31\n\t"
+ "lsl r9, r9, #1\n\t"
+ "orr r9, r9, r8, lsr #31\n\t"
+ "lsl r8, r8, #1\n\t"
+ "orr r8, r8, r7, lsr #31\n\t"
+ "lsl r7, r7, #1\n\t"
+ "orr r7, r7, r6, lsr #31\n\t"
+ "lsl r6, r6, #1\n\t"
+ "orr r6, r6, r5, lsr #31\n\t"
+ "lsl r5, r5, #1\n\t"
+ "orr r5, r5, r4, lsr #31\n\t"
+ "lsl r4, r4, #1\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "mov r12, #19\n\t"
+ "ldr %[a], [sp, #32]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r4, r4, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r5, r5, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #36]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r5, r5, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r6, r6, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #40]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r6, r6, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r7, r7, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #44]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r7, r7, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r8, r8, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #48]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r8, r8, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r9, r9, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #52]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r9, r9, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r10, r10, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #56]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "add r3, r3, lr\n\t"
+ "adds r10, r10, r2\n\t"
+ "mov lr, #0\n\t"
+ "adcs r11, r11, r3\n\t"
+ "adc lr, lr, #0\n\t"
+ "lsr r2, %[a], #30\n\t"
+ "ldr %[a], [sp, #60]\n\t"
+ "orr r2, r2, %[a], lsl #2\n\t"
+ "umull r2, r3, r12, r2\n\t"
+ "adds r11, r11, r2\n\t"
+ "adc r2, r3, lr\n\t"
+ /* Overflow */
+ "lsl r2, r2, #1\n\t"
+ "orr r2, r2, r11, lsr #31\n\t"
+ "mul r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Reduce if top bit set */
+ "asr r2, r11, #31\n\t"
+ "and r2, r2, r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, r2\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ /* Store */
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [%[r], #8]\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ "strd r10, r11, [%[r], #24]\n\t"
+ "add sp, sp, #0x40\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
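+
+/* Note: fe_sq2 computes 2*a^2.  The column products above are identical
+ * to fe_sq; "Double and Reduce" then doubles the low half in registers
+ * (the lsl/orr chain) and folds 19 * (2t >> 255), which is why the high
+ * words are extracted here with lsl #2 / lsr #30 instead of the
+ * lsl #1 / lsr #31 used in fe_sq. */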
+
+void fe_invert(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x88\n\t"
+ /* Invert */
+ "str %[r], [sp, #128]\n\t"
+ "str %[a], [sp, #132]\n\t"
+ "mov r0, sp\n\t"
+ "ldr r1, [sp, #132]\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "ldr r1, [sp, #132]\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_fe_invert1_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert1_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_fe_invert2_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert2_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_fe_invert3_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert3_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_fe_invert4_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert4_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_fe_invert5_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert5_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_fe_invert6_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert6_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_fe_invert7_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert7_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #5\n\t"
+ "\n"
+ "L_fe_invert8_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_invert8_%=\n\t"
+ "ldr r0, [sp, #128]\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #132]\n\t"
+ "ldr %[r], [sp, #128]\n\t"
+ "add sp, sp, #0x88\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "lr", "r4"
+ );
+}
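+
+/* Reference sketch (illustrative; mirrors the bl fe_sq / bl fe_mul
+ * sequence hand-scheduled above): fe_invert raises a to the power
+ * p - 2 = 2^255 - 21 with the standard Curve25519 addition chain.
+ * t0..t3 stand for the four 32-byte stack temporaries; the _sketch
+ * name is hypothetical. */
+static void fe_invert_sketch(fe r, const fe a)
+{
+    fe t0, t1, t2, t3;
+    int i;
+
+    fe_sq(t0, a);                                     /* a^2  */
+    fe_sq(t1, t0); fe_sq(t1, t1);                     /* a^8  */
+    fe_mul(t1, a, t1);                                /* a^9  */
+    fe_mul(t0, t0, t1);                               /* a^11 */
+    fe_sq(t2, t0);                                    /* a^22 */
+    fe_mul(t1, t1, t2);                               /* a^(2^5 - 1)   */
+    fe_sq(t2, t1); for (i = 0; i < 4; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                               /* a^(2^10 - 1)  */
+    fe_sq(t2, t1); for (i = 0; i < 9; i++) fe_sq(t2, t2);
+    fe_mul(t2, t2, t1);                               /* a^(2^20 - 1)  */
+    fe_sq(t3, t2); for (i = 0; i < 19; i++) fe_sq(t3, t3);
+    fe_mul(t2, t3, t2);                               /* a^(2^40 - 1)  */
+    for (i = 0; i < 10; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                               /* a^(2^50 - 1)  */
+    fe_sq(t2, t1); for (i = 0; i < 49; i++) fe_sq(t2, t2);
+    fe_mul(t2, t2, t1);                               /* a^(2^100 - 1) */
+    fe_sq(t3, t2); for (i = 0; i < 99; i++) fe_sq(t3, t3);
+    fe_mul(t2, t3, t2);                               /* a^(2^200 - 1) */
+    for (i = 0; i < 50; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                               /* a^(2^250 - 1) */
+    for (i = 0; i < 5; i++) fe_sq(t1, t1);
+    fe_mul(r, t1, t0);                                /* a^(2^255 - 21) = a^(p-2) */
+}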
+
+int curve25519(byte* r, byte* n, byte* a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0xbc\n\t"
+ "str %[r], [sp, #160]\n\t"
+ "str %[n], [sp, #164]\n\t"
+ "str %[a], [sp, #168]\n\t"
+ "mov %[n], #0\n\t"
+ "str %[n], [sp, #172]\n\t"
+ /* Set one */
+ "mov r11, #1\n\t"
+ "mov r10, #0\n\t"
+ "strd r11, r10, [%[r]]\n\t"
+ "strd r10, r10, [%[r], #8]\n\t"
+ "strd r10, r10, [%[r], #16]\n\t"
+ "strd r10, r10, [%[r], #24]\n\t"
+ /* Set zero */
+ "mov r10, #0\n\t"
+ "strd r10, r10, [sp]\n\t"
+ "strd r10, r10, [sp, #8]\n\t"
+ "strd r10, r10, [sp, #16]\n\t"
+ "strd r10, r10, [sp, #24]\n\t"
+ /* Set one */
+ "mov r11, #1\n\t"
+ "mov r10, #0\n\t"
+ "strd r11, r10, [sp, #32]\n\t"
+ "strd r10, r10, [sp, #40]\n\t"
+ "strd r10, r10, [sp, #48]\n\t"
+ "strd r10, r10, [sp, #56]\n\t"
+ /* Copy */
+ "ldrd r4, r5, [%[a]]\n\t"
+ "ldrd r6, r7, [%[a], #8]\n\t"
+ "strd r4, r5, [sp, #64]\n\t"
+ "strd r6, r7, [sp, #72]\n\t"
+ "ldrd r4, r5, [%[a], #16]\n\t"
+ "ldrd r6, r7, [%[a], #24]\n\t"
+ "strd r4, r5, [sp, #80]\n\t"
+ "strd r6, r7, [sp, #88]\n\t"
+ "mov %[n], #30\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "mov %[a], #28\n\t"
+ "str %[a], [sp, #176]\n\t"
+ "\n"
+ "L_curve25519_words_%=: \n\t"
+ "\n"
+ "L_curve25519_bits_%=: \n\t"
+ "ldr %[n], [sp, #164]\n\t"
+ "ldr %[a], [%[n], r2]\n\t"
+ "ldr %[n], [sp, #180]\n\t"
+ "lsr %[a], %[a], %[n]\n\t"
+ "and %[a], %[a], #1\n\t"
+ "str %[a], [sp, #184]\n\t"
+ "ldr %[n], [sp, #172]\n\t"
+ "eor %[n], %[n], %[a]\n\t"
+ "str %[n], [sp, #172]\n\t"
+ "ldr %[r], [sp, #160]\n\t"
+ /* Conditional Swap */
+ "neg %[n], %[n]\n\t"
+ "ldrd r4, r5, [%[r]]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r]]\n\t"
+ "strd r6, r7, [sp, #64]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "strd r6, r7, [sp, #72]\n\t"
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #16]\n\t"
+ "strd r6, r7, [sp, #80]\n\t"
+ "ldrd r4, r5, [%[r], #24]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [%[r], #24]\n\t"
+ "strd r6, r7, [sp, #88]\n\t"
+ "ldr %[n], [sp, #172]\n\t"
+ /* Conditional Swap */
+ "neg %[n], %[n]\n\t"
+ "ldrd r4, r5, [sp]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "strd r6, r7, [sp, #32]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "strd r6, r7, [sp, #40]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "strd r6, r7, [sp, #48]\n\t"
+ "ldrd r4, r5, [sp, #24]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "eor r8, r4, r6\n\t"
+ "eor r9, r5, r7\n\t"
+ "and r8, r8, %[n]\n\t"
+ "and r9, r9, %[n]\n\t"
+ "eor r4, r4, r8\n\t"
+ "eor r5, r5, r9\n\t"
+ "eor r6, r6, r8\n\t"
+ "eor r7, r7, r9\n\t"
+ "strd r4, r5, [sp, #24]\n\t"
+ "strd r6, r7, [sp, #56]\n\t"
+ "ldr %[n], [sp, #184]\n\t"
+ "str %[n], [sp, #172]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [%[r]]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r]]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #128]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r], #8]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #136]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [%[r], #16]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #144]\n\t"
+ /* Add */
+ "ldrd r4, r5, [%[r], #24]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [%[r]]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r]]\n\t"
+ "ldrd r4, r5, [%[r], #8]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r], #8]\n\t"
+ "ldrd r4, r5, [%[r], #16]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [%[r], #16]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [%[r], #24]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #128]\n\t"
+ "ldrd r4, r5, [sp, #136]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "ldrd r4, r5, [sp, #144]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #144]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #152]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [sp, #64]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #96]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #72]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #8]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #104]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #80]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #16]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #112]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #88]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [sp, #24]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #96]\n\t"
+ "ldrd r4, r5, [sp, #104]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #104]\n\t"
+ "ldrd r4, r5, [sp, #112]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #112]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #120]\n\t"
+ "ldr r2, [sp, #160]\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r0, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0x80\n\t"
+ "add r1, sp, #0\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_mul\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r0, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "ldr r1, [sp, #160]\n\t"
+ "add r0, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd r4, r5, [sp, #32]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #64]\n\t"
+ /* Sub */
+ "subs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #40]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #72]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #8]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #48]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "mov r3, #0\n\t"
+ "adcs r9, r5, r7\n\t"
+ "adc r3, r3, #0\n\t"
+ "strd r8, r9, [sp, #80]\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "mov r12, #0\n\t"
+ "sbcs r11, r5, r7\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r10, r11, [sp, #16]\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #56]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r3, r3, #-1\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r9, r5, r7\n\t"
+ /* Sub */
+ "adds r12, r12, #-1\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "sbc r11, r5, r7\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r9, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp, #64]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #64]\n\t"
+ "ldrd r4, r5, [sp, #72]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #72]\n\t"
+ "ldrd r4, r5, [sp, #80]\n\t"
+ "sbcs r4, r4, %[a]\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #80]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbc r9, r9, r12\n\t"
+ "strd r8, r9, [sp, #88]\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp]\n\t"
+ "ldrd r4, r5, [sp, #8]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #8]\n\t"
+ "ldrd r4, r5, [sp, #16]\n\t"
+ "adcs r4, r4, %[a]\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "strd r4, r5, [sp, #16]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r10, r11, [sp, #24]\n\t"
+ "add r2, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "ldr r0, [sp, #160]\n\t"
+ "bl fe_mul\n\t"
+ /* Sub */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "ldrd r8, r9, [sp, #96]\n\t"
+ "ldrd r10, r11, [sp, #104]\n\t"
+ "subs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "sbcs r11, r7, r11\n\t"
+ "strd r8, r9, [sp, #128]\n\t"
+ "strd r10, r11, [sp, #136]\n\t"
+ "ldrd r4, r5, [sp, #144]\n\t"
+ "ldrd r6, r7, [sp, #152]\n\t"
+ "ldrd r8, r9, [sp, #112]\n\t"
+ "ldrd r10, r11, [sp, #120]\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "sbc r11, r7, r11\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "adds r4, r4, r3\n\t"
+ "adcs r5, r5, %[a]\n\t"
+ "adcs r6, r6, %[a]\n\t"
+ "adcs r7, r7, %[a]\n\t"
+ "adcs r8, r8, %[a]\n\t"
+ "adcs r9, r9, %[a]\n\t"
+ "adcs r10, r10, %[a]\n\t"
+ "adc r11, r11, r12\n\t"
+ "strd r4, r5, [sp, #128]\n\t"
+ "strd r6, r7, [sp, #136]\n\t"
+ "strd r8, r9, [sp, #144]\n\t"
+ "strd r10, r11, [sp, #152]\n\t"
+ "add r1, sp, #0\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_sq\n\t"
+ /* Multiply by 121666 */
+ "ldrd r4, r5, [sp, #128]\n\t"
+ "ldrd r6, r7, [sp, #136]\n\t"
+ "ldrd r8, r9, [sp, #144]\n\t"
+ "ldrd r10, r11, [sp, #152]\n\t"
+ "movw r12, #0xdb42\n\t"
+ "movt r12, #1\n\t"
+ "umull r4, %[a], r4, r12\n\t"
+ "umull r5, r3, r5, r12\n\t"
+ "adds r5, r5, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r6, r3, r6, r12\n\t"
+ "adds r6, r6, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r7, r3, r7, r12\n\t"
+ "adds r7, r7, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r8, r3, r8, r12\n\t"
+ "adds r8, r8, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r9, r3, r9, r12\n\t"
+ "adds r9, r9, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r10, r3, r10, r12\n\t"
+ "adds r10, r10, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "umull r11, r3, r11, r12\n\t"
+ "adds r11, r11, %[a]\n\t"
+ "adc %[a], r3, #0\n\t"
+ "mov r12, #19\n\t"
+ "lsl %[a], %[a], #1\n\t"
+ "orr %[a], %[a], r11, lsr #31\n\t"
+ "mul %[a], %[a], r12\n\t"
+ "and r11, r11, #0x7fffffff\n\t"
+ "adds r4, r4, %[a]\n\t"
+ "adcs r5, r5, #0\n\t"
+ "adcs r6, r6, #0\n\t"
+ "adcs r7, r7, #0\n\t"
+ "adcs r8, r8, #0\n\t"
+ "adcs r9, r9, #0\n\t"
+ "adcs r10, r10, #0\n\t"
+ "adc r11, r11, #0\n\t"
+ "strd r4, r5, [sp, #32]\n\t"
+ "strd r6, r7, [sp, #40]\n\t"
+ "strd r8, r9, [sp, #48]\n\t"
+ "strd r10, r11, [sp, #56]\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r0, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ /* Add */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "ldrd r8, r9, [sp, #32]\n\t"
+ "ldrd r10, r11, [sp, #40]\n\t"
+ "adds r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "adcs r11, r7, r11\n\t"
+ "strd r8, r9, [sp, #96]\n\t"
+ "strd r10, r11, [sp, #104]\n\t"
+ "ldrd r4, r5, [sp, #112]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "ldrd r8, r9, [sp, #48]\n\t"
+ "ldrd r10, r11, [sp, #56]\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "adc r11, r7, r11\n\t"
+ "mov r3, #-19\n\t"
+ "asr %[a], r11, #31\n\t"
+ /* Mask the modulus */
+ "and r3, %[a], r3\n\t"
+ "and r12, %[a], #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd r4, r5, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "subs r4, r4, r3\n\t"
+ "sbcs r5, r5, %[a]\n\t"
+ "sbcs r6, r6, %[a]\n\t"
+ "sbcs r7, r7, %[a]\n\t"
+ "sbcs r8, r8, %[a]\n\t"
+ "sbcs r9, r9, %[a]\n\t"
+ "sbcs r10, r10, %[a]\n\t"
+ "sbc r11, r11, r12\n\t"
+ "strd r4, r5, [sp, #96]\n\t"
+ "strd r6, r7, [sp, #104]\n\t"
+ "strd r8, r9, [sp, #112]\n\t"
+ "strd r10, r11, [sp, #120]\n\t"
+ "add r2, sp, #0\n\t"
+ "ldr r1, [sp, #168]\n\t"
+ "add r0, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r0, sp, #0\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #176]\n\t"
+ "ldr %[n], [sp, #180]\n\t"
+ "subs %[n], %[n], #1\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "bge L_curve25519_bits_%=\n\t"
+ "mov %[n], #31\n\t"
+ "str %[n], [sp, #180]\n\t"
+ "subs %[a], %[a], #4\n\t"
+ "str %[a], [sp, #176]\n\t"
+ "bge L_curve25519_words_%=\n\t"
+ /* Invert */
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_curve25519_inv_1_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_1_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_curve25519_inv_2_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_2_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_curve25519_inv_3_%=: \n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_3_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_curve25519_inv_4_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_4_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_curve25519_inv_5_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_5_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_curve25519_inv_6_%=: \n\t"
+ "add r0, sp, #0x80\n\t"
+ "add r1, sp, #0x80\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_6_%=\n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x80\n\t"
+ "add r2, sp, #0x60\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_curve25519_inv_7_%=: \n\t"
+ "add r0, sp, #0x60\n\t"
+ "add r1, sp, #0x60\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_7_%=\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x60\n\t"
+ "add r2, sp, #0x40\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #5\n\t"
+ "\n"
+ "L_curve25519_inv_8_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_curve25519_inv_8_%=\n\t"
+ "add r0, sp, #0\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "add r2, sp, #0\n\t"
+ "ldr r1, [sp, #160]\n\t"
+ "ldr r0, [sp, #160]\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, #0\n\t"
+ "add sp, sp, #0xbc\n\t"
+ : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
+ :
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ return (uint32_t)(size_t)r; /* r was zeroed inside the asm: 0 on success */
+}
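+
+/* Reference sketch (illustrative; assumes <stdint.h>): the ladder above
+ * scans scalar bits 254..0 (the first word starts at bit 30 because
+ * clamped X25519 scalars have bit 255 clear) and exchanges the two
+ * working point pairs with an XOR mask, so the memory access pattern
+ * is independent of the secret bit.  The constant-time swap in C: */
+static void cswap_sketch(uint32_t x2[8], uint32_t x3[8], uint32_t swap)
+{
+    uint32_t mask = (uint32_t)0 - swap;  /* the "neg": 0 or all-ones */
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        uint32_t d = (x2[i] ^ x3[i]) & mask;
+        x2[i] ^= d;
+        x3[i] ^= d;
+    }
+}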
+
+void fe_pow22523(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x68\n\t"
+ /* pow22523 */
+ "str %[r], [sp, #96]\n\t"
+ "str %[a], [sp, #100]\n\t"
+ "mov r0, sp\n\t"
+ "ldr r1, [sp, #100]\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "add r0, sp, #32\n\t"
+ "ldr r1, [sp, #100]\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #4\n\t"
+ "\n"
+ "L_fe_pow22523_1_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_1_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #9\n\t"
+ "\n"
+ "L_fe_pow22523_2_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_2_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #19\n\t"
+ "\n"
+ "L_fe_pow22523_3_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_3_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #10\n\t"
+ "\n"
+ "L_fe_pow22523_4_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_4_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #32\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #49\n\t"
+ "\n"
+ "L_fe_pow22523_5_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_5_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "mov r4, #0x63\n\t"
+ "\n"
+ "L_fe_pow22523_6_%=: \n\t"
+ "add r0, sp, #0x40\n\t"
+ "add r1, sp, #0x40\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_6_%=\n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #0x40\n\t"
+ "add r2, sp, #32\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #50\n\t"
+ "\n"
+ "L_fe_pow22523_7_%=: \n\t"
+ "add r0, sp, #32\n\t"
+ "add r1, sp, #32\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_7_%=\n\t"
+ "mov r0, sp\n\t"
+ "add r1, sp, #32\n\t"
+ "mov r2, sp\n\t"
+ "bl fe_mul\n\t"
+ "mov r4, #2\n\t"
+ "\n"
+ "L_fe_pow22523_8_%=: \n\t"
+ "mov r0, sp\n\t"
+ "mov r1, sp\n\t"
+ "bl fe_sq\n\t"
+ "sub r4, r4, #1\n\t"
+ "cmp r4, #0\n\t"
+ "bne L_fe_pow22523_8_%=\n\t"
+ "ldr r0, [sp, #96]\n\t"
+ "mov r1, sp\n\t"
+ "ldr r2, [sp, #100]\n\t"
+ "bl fe_mul\n\t"
+ "ldr %[a], [sp, #100]\n\t"
+ "ldr %[r], [sp, #96]\n\t"
+ "add sp, sp, #0x68\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "lr", "r4"
+ );
+}
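+
+/* Note: fe_pow22523 follows the same addition chain as fe_invert up to
+ * a^(2^250 - 1), then squares twice and multiplies by a once, giving
+ * a^(2^252 - 3) = a^((p - 5) / 8), the exponent used when recovering
+ * square roots during point decompression. */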
+
+void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #16\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[px], [sp, #12]\n\t"
+ "ldr r2, [sp, #32]\n\t"
+ "ldr r1, [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #28]\n\t"
+ "ldr r1, [sp, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #32]\n\t"
+ "ldr r1, [sp, #28]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "add sp, sp, #16\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "lr"
+ );
+}
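+
+/* Note: converts a completed point (P1xP1 representation X:Y:Z:T) to
+ * projective P2 coordinates: rx = X*T, ry = Y*Z, rz = Z*T.  The
+ * stack-passed py/pz/pt arguments are reloaded from fixed sp offsets
+ * inside the asm. */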
+
+void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #16\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r2, [sp, #36]\n\t"
+ "ldr r1, [sp, #24]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #32]\n\t"
+ "ldr r1, [sp, #28]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #36]\n\t"
+ "ldr r1, [sp, #32]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #28]\n\t"
+ "ldr r1, [sp, #24]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "add sp, sp, #16\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "lr"
+ );
+}
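+
+/* Note: the P3 (extended) variant additionally keeps the T coordinate:
+ * rx = X*T, ry = Y*Z, rz = Z*T, rt = X*Y. */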
+
+void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #16\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r1, [sp, #88]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_sq\n\t"
+ "ldr r1, [sp, #92]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_sq\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #88]\n\t"
+ "ldr r2, [sp, #92]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_sq\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #8]\n\t"
+ "ldr r2, [sp]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r2]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "ldrd r5, r6, [r2, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r2, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #24]\n\t"
+ "ldrd r5, r6, [r2, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #12]\n\t"
+ "ldr r2, [sp, #4]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r1, [sp, #96]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_sq2\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "ldr r1, [sp, #8]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "ldrd r7, r8, [r1]\n\t"
+ "ldrd r9, r10, [r1, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "add sp, sp, #16\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+}
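+
+/* Note: extended-coordinate doubling (dbl-2008-hwcd): the asm computes
+ * ry = Y1^2 + X1^2, rz = Y1^2 - X1^2, rx = (X1+Y1)^2 - ry (= 2*X1*Y1)
+ * and rt = 2*Z1^2 - rz (via fe_sq2), leaving a P1xP1 result.  Every
+ * add/sub block folds the modulus back in with the sign-bit mask trick
+ * seen above. */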
+
+void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #32\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #108]\n\t"
+ "ldr r2, [sp, #104]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #108]\n\t"
+ "ldr r2, [sp, #104]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r2, [sp, #124]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #128]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #116]\n\t"
+ "ldr r1, [sp, #120]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "ldr r1, [sp, #112]\n\t"
+ /* Double */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "ldr r1, [sp, #12]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r1]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "ldrd r5, r6, [r1, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r0, #24]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #32\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
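+ /* The precomputed-point arguments are loaded from the stack frame
+  * at fixed offsets inside the asm block; the casts below suppress
+  * unused-parameter warnings. */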
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
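+/* Mixed subtraction of a precomputed point. Tracing the asm:
+ *   A = (py + px) * qyminusx, B = (py - px) * qyplusx,
+ *   C = pt * qxy2d,           D = 2 * pz,
+ *   rx = A - B, ry = A + B, rz = D - C, rt = D + C,
+ * with every result conditionally reduced modulo 2^255 - 19;
+ * fe_mul is invoked via bl with its arguments staged in r0-r2. */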
+void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #32\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #108]\n\t"
+ "ldr r2, [sp, #104]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #108]\n\t"
+ "ldr r2, [sp, #104]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r2, [sp, #128]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #124]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #116]\n\t"
+ "ldr r1, [sp, #120]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "ldr r1, [sp, #112]\n\t"
+ /* Double */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "ldr r1, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #32\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
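+/* Point addition r = p + q with q in cached coordinates. Tracing
+ * the asm:
+ *   A = (py + px) * qyplusx, B = (py - px) * qyminusx,
+ *   C = pt * qt2d,           D = 2 * pz * qz,
+ *   rx = A - B, ry = A + B, rz = D + C, rt = D - C,
+ * with every result conditionally reduced modulo 2^255 - 19. The
+ * doubled D is kept in a stack temporary at sp+16. */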
+void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x60\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r2, [sp, #192]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #196]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #180]\n\t"
+ "ldr r1, [sp, #188]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #184]\n\t"
+ "ldr r1, [sp, #176]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #16\n\t"
+ "ldr r1, [sp]\n\t"
+ /* Double */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "ldr r1, [sp, #12]\n\t"
+ "add r2, sp, #16\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r1]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r1, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #0x60\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
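+/* Point subtraction r = p - q with q in cached coordinates: the
+ * same flow as fe_ge_add with qyplusx/qyminusx exchanged, i.e.
+ *   A = (py + px) * qyminusx, B = (py - px) * qyplusx,
+ *   C = pt * qt2d,            D = 2 * pz * qz,
+ *   rx = A - B, ry = A + B, rz = D - C, rt = D + C,
+ * with every result conditionally reduced modulo 2^255 - 19. */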
+void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0x60\n\t"
+ "str %[rx], [sp]\n\t"
+ "str %[ry], [sp, #4]\n\t"
+ "str %[rz], [sp, #8]\n\t"
+ "str %[rt], [sp, #12]\n\t"
+ "ldr r0, [sp]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "adds r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "adcs r7, %[rt], r7\n\t"
+ "adcs r8, r4, r8\n\t"
+ "adcs r9, r5, r9\n\t"
+ "adc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp, #172]\n\t"
+ "ldr r2, [sp, #168]\n\t"
+ /* Sub */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r2]\n\t"
+ "ldrd r9, r10, [r2, #8]\n\t"
+ "subs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbcs r10, r6, r10\n\t"
+ "strd r7, r8, [r0]\n\t"
+ "strd r9, r10, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "ldrd r5, r6, [r1, #24]\n\t"
+ "ldrd r7, r8, [r2, #16]\n\t"
+ "ldrd r9, r10, [r2, #24]\n\t"
+ "sbcs r7, %[rt], r7\n\t"
+ "sbcs r8, r4, r8\n\t"
+ "sbcs r9, r5, r9\n\t"
+ "sbc r10, r6, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "adcs r5, r5, r11\n\t"
+ "adcs r6, r6, r11\n\t"
+ "adcs r7, r7, r11\n\t"
+ "adcs r8, r8, r11\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r2, [sp, #196]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r0, [sp, #8]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #192]\n\t"
+ "ldr r1, [sp, #4]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #180]\n\t"
+ "ldr r1, [sp, #188]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "bl fe_mul\n\t"
+ "ldr r2, [sp, #184]\n\t"
+ "ldr r1, [sp, #176]\n\t"
+ "ldr r0, [sp]\n\t"
+ "bl fe_mul\n\t"
+ "add r0, sp, #16\n\t"
+ "ldr r1, [sp]\n\t"
+ /* Double */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "ldrd r5, r6, [r1, #8]\n\t"
+ "ldrd r7, r8, [r1, #16]\n\t"
+ "ldrd r9, r10, [r1, #24]\n\t"
+ "adds %[rt], %[rt], %[rt]\n\t"
+ "adcs r4, r4, r4\n\t"
+ "adcs r5, r5, r5\n\t"
+ "adcs r6, r6, r6\n\t"
+ "adcs r7, r7, r7\n\t"
+ "adcs r8, r8, r8\n\t"
+ "adcs r9, r9, r9\n\t"
+ "adc r10, r10, r10\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "sbcs r5, r5, r11\n\t"
+ "sbcs r6, r6, r11\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbcs r8, r8, r11\n\t"
+ "sbcs r9, r9, r11\n\t"
+ "sbc r10, r10, lr\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "strd r5, r6, [r0, #8]\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ "strd r9, r10, [r0, #24]\n\t"
+ "ldr r0, [sp, #4]\n\t"
+ "ldr r1, [sp]\n\t"
+ "ldr r2, [sp, #8]\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "ldr r0, [sp, #12]\n\t"
+ "ldr r1, [sp, #8]\n\t"
+ "add r2, sp, #16\n\t"
+ /* Add-Sub */
+ /* Add */
+ "ldrd %[rt], r4, [r2]\n\t"
+ "ldrd r5, r6, [r0]\n\t"
+ "adds r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0]\n\t"
+ /* Sub */
+ "subs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #8]\n\t"
+ "ldrd r5, r6, [r0, #8]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #8]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #8]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #16]\n\t"
+ "ldrd r5, r6, [r0, #16]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "mov r12, #0\n\t"
+ "adcs r8, r4, r6\n\t"
+ "adc r12, r12, #0\n\t"
+ "strd r7, r8, [r0, #16]\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "mov lr, #0\n\t"
+ "sbcs r10, r4, r6\n\t"
+ "adc lr, lr, #0\n\t"
+ "strd r9, r10, [r1, #16]\n\t"
+ /* Add */
+ "ldrd %[rt], r4, [r2, #24]\n\t"
+ "ldrd r5, r6, [r0, #24]\n\t"
+ "adds r12, r12, #-1\n\t"
+ "adcs r7, %[rt], r5\n\t"
+ "adc r8, r4, r6\n\t"
+ /* Sub */
+ "adds lr, lr, #-1\n\t"
+ "sbcs r9, %[rt], r5\n\t"
+ "sbc r10, r4, r6\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r8, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "ldrd %[rt], r4, [r0]\n\t"
+ "subs %[rt], %[rt], r12\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0]\n\t"
+ "ldrd %[rt], r4, [r0, #8]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #8]\n\t"
+ "ldrd %[rt], r4, [r0, #16]\n\t"
+ "sbcs %[rt], %[rt], r11\n\t"
+ "sbcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r0, #16]\n\t"
+ "sbcs r7, r7, r11\n\t"
+ "sbc r8, r8, lr\n\t"
+ "strd r7, r8, [r0, #24]\n\t"
+ "mov r12, #-19\n\t"
+ "asr r11, r10, #31\n\t"
+ /* Mask the modulus */
+ "and r12, r11, r12\n\t"
+ "and lr, r11, #0x7fffffff\n\t"
+ /* Add modulus (if underflow) */
+ "ldrd %[rt], r4, [r1]\n\t"
+ "adds %[rt], %[rt], r12\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1]\n\t"
+ "ldrd %[rt], r4, [r1, #8]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #8]\n\t"
+ "ldrd %[rt], r4, [r1, #16]\n\t"
+ "adcs %[rt], %[rt], r11\n\t"
+ "adcs r4, r4, r11\n\t"
+ "strd %[rt], r4, [r1, #16]\n\t"
+ "adcs r9, r9, r11\n\t"
+ "adc r10, r10, lr\n\t"
+ "strd r9, r10, [r1, #24]\n\t"
+ "add sp, sp, #0x60\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
+#endif /* WOLFSSL_ARMASM */
+#endif /* !__aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
new file mode 100644
index 0000000..d2b899c
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
@@ -0,0 +1,5335 @@
+/* armv8-32-sha512-asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
+ */
+
+#ifdef WOLFSSL_ARMASM
+#ifndef __aarch64__
+#ifdef WOLFSSL_ARMASM_NO_NEON
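+/* Each 64-bit round constant K[i] is stored as two 32-bit words,
+ * low half first, so the round code can fetch both halves of K[i]
+ * into a register pair with a single ldrd. */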
+ .text
+ .type L_SHA512_transform_len_k, %object
+ .size L_SHA512_transform_len_k, 640
+ .align 3
+L_SHA512_transform_len_k:
+ .word 0xd728ae22
+ .word 0x428a2f98
+ .word 0x23ef65cd
+ .word 0x71374491
+ .word 0xec4d3b2f
+ .word 0xb5c0fbcf
+ .word 0x8189dbbc
+ .word 0xe9b5dba5
+ .word 0xf348b538
+ .word 0x3956c25b
+ .word 0xb605d019
+ .word 0x59f111f1
+ .word 0xaf194f9b
+ .word 0x923f82a4
+ .word 0xda6d8118
+ .word 0xab1c5ed5
+ .word 0xa3030242
+ .word 0xd807aa98
+ .word 0x45706fbe
+ .word 0x12835b01
+ .word 0x4ee4b28c
+ .word 0x243185be
+ .word 0xd5ffb4e2
+ .word 0x550c7dc3
+ .word 0xf27b896f
+ .word 0x72be5d74
+ .word 0x3b1696b1
+ .word 0x80deb1fe
+ .word 0x25c71235
+ .word 0x9bdc06a7
+ .word 0xcf692694
+ .word 0xc19bf174
+ .word 0x9ef14ad2
+ .word 0xe49b69c1
+ .word 0x384f25e3
+ .word 0xefbe4786
+ .word 0x8b8cd5b5
+ .word 0xfc19dc6
+ .word 0x77ac9c65
+ .word 0x240ca1cc
+ .word 0x592b0275
+ .word 0x2de92c6f
+ .word 0x6ea6e483
+ .word 0x4a7484aa
+ .word 0xbd41fbd4
+ .word 0x5cb0a9dc
+ .word 0x831153b5
+ .word 0x76f988da
+ .word 0xee66dfab
+ .word 0x983e5152
+ .word 0x2db43210
+ .word 0xa831c66d
+ .word 0x98fb213f
+ .word 0xb00327c8
+ .word 0xbeef0ee4
+ .word 0xbf597fc7
+ .word 0x3da88fc2
+ .word 0xc6e00bf3
+ .word 0x930aa725
+ .word 0xd5a79147
+ .word 0xe003826f
+ .word 0x6ca6351
+ .word 0xa0e6e70
+ .word 0x14292967
+ .word 0x46d22ffc
+ .word 0x27b70a85
+ .word 0x5c26c926
+ .word 0x2e1b2138
+ .word 0x5ac42aed
+ .word 0x4d2c6dfc
+ .word 0x9d95b3df
+ .word 0x53380d13
+ .word 0x8baf63de
+ .word 0x650a7354
+ .word 0x3c77b2a8
+ .word 0x766a0abb
+ .word 0x47edaee6
+ .word 0x81c2c92e
+ .word 0x1482353b
+ .word 0x92722c85
+ .word 0x4cf10364
+ .word 0xa2bfe8a1
+ .word 0xbc423001
+ .word 0xa81a664b
+ .word 0xd0f89791
+ .word 0xc24b8b70
+ .word 0x654be30
+ .word 0xc76c51a3
+ .word 0xd6ef5218
+ .word 0xd192e819
+ .word 0x5565a910
+ .word 0xd6990624
+ .word 0x5771202a
+ .word 0xf40e3585
+ .word 0x32bbd1b8
+ .word 0x106aa070
+ .word 0xb8d2d0c8
+ .word 0x19a4c116
+ .word 0x5141ab53
+ .word 0x1e376c08
+ .word 0xdf8eeb99
+ .word 0x2748774c
+ .word 0xe19b48a8
+ .word 0x34b0bcb5
+ .word 0xc5c95a63
+ .word 0x391c0cb3
+ .word 0xe3418acb
+ .word 0x4ed8aa4a
+ .word 0x7763e373
+ .word 0x5b9cca4f
+ .word 0xd6b2b8a3
+ .word 0x682e6ff3
+ .word 0x5defb2fc
+ .word 0x748f82ee
+ .word 0x43172f60
+ .word 0x78a5636f
+ .word 0xa1f0ab72
+ .word 0x84c87814
+ .word 0x1a6439ec
+ .word 0x8cc70208
+ .word 0x23631e28
+ .word 0x90befffa
+ .word 0xde82bde9
+ .word 0xa4506ceb
+ .word 0xb2c67915
+ .word 0xbef9a3f7
+ .word 0xe372532b
+ .word 0xc67178f2
+ .word 0xea26619c
+ .word 0xca273ece
+ .word 0x21c0c207
+ .word 0xd186b8c7
+ .word 0xcde0eb1e
+ .word 0xeada7dd6
+ .word 0xee6ed178
+ .word 0xf57d4f7f
+ .word 0x72176fba
+ .word 0x6f067aa
+ .word 0xa2c898a6
+ .word 0xa637dc5
+ .word 0xbef90dae
+ .word 0x113f9804
+ .word 0x131c471b
+ .word 0x1b710b35
+ .word 0x23047d84
+ .word 0x28db77f5
+ .word 0x40c72493
+ .word 0x32caab7b
+ .word 0x15c9bebc
+ .word 0x3c9ebe0a
+ .word 0x9c100d4c
+ .word 0x431d67c4
+ .word 0xcb3e42b6
+ .word 0x4cc5d4be
+ .word 0xfc657e2a
+ .word 0x597f299c
+ .word 0x3ad6faec
+ .word 0x5fcb6fab
+ .word 0x4a475817
+ .word 0x6c44198c
+ .text
+ .align 2
+ .globl Transform_Sha512_Len
+ .type Transform_Sha512_Len, %function
+Transform_Sha512_Len:
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ sub sp, sp, #0xc0
+ adr r3, L_SHA512_transform_len_k
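+ # Frame: [sp..sp+127] holds the 16-entry 64-bit schedule W (used
+ # circularly), [sp+128..sp+191] a copy of the digest kept for the
+ # add-in at the end; r3 points at the K constants.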
+ # Copy digest to add in at end
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [r0, #16]
+ ldrd r8, r9, [r0, #24]
+ str r12, [sp, #128]
+ str lr, [sp, #132]
+ strd r4, r5, [sp, #136]
+ strd r6, r7, [sp, #144]
+ strd r8, r9, [sp, #152]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r0, #48]
+ ldrd r8, r9, [r0, #56]
+ str r12, [sp, #160]
+ str lr, [sp, #164]
+ strd r4, r5, [sp, #168]
+ strd r6, r7, [sp, #176]
+ strd r8, r9, [sp, #184]
+ # Start of loop processing a block
+L_sha512_len_neon_begin:
+ # Load, Reverse and Store W
+ ldr r12, [r1]
+ ldr lr, [r1, #4]
+ ldrd r4, r5, [r1, #8]
+ ldrd r6, r7, [r1, #16]
+ ldrd r8, r9, [r1, #24]
+ rev r12, r12
+ rev lr, lr
+ rev r4, r4
+ rev r5, r5
+ rev r6, r6
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ str lr, [sp]
+ str r12, [sp, #4]
+ str r5, [sp, #8]
+ str r4, [sp, #12]
+ str r7, [sp, #16]
+ str r6, [sp, #20]
+ str r9, [sp, #24]
+ str r8, [sp, #28]
+ ldr r12, [r1, #32]
+ ldr lr, [r1, #36]
+ ldrd r4, r5, [r1, #40]
+ ldrd r6, r7, [r1, #48]
+ ldrd r8, r9, [r1, #56]
+ rev r12, r12
+ rev lr, lr
+ rev r4, r4
+ rev r5, r5
+ rev r6, r6
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ str lr, [sp, #32]
+ str r12, [sp, #36]
+ str r5, [sp, #40]
+ str r4, [sp, #44]
+ str r7, [sp, #48]
+ str r6, [sp, #52]
+ str r9, [sp, #56]
+ str r8, [sp, #60]
+ ldr r12, [r1, #64]
+ ldr lr, [r1, #68]
+ ldrd r4, r5, [r1, #72]
+ ldrd r6, r7, [r1, #80]
+ ldrd r8, r9, [r1, #88]
+ rev r12, r12
+ rev lr, lr
+ rev r4, r4
+ rev r5, r5
+ rev r6, r6
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ str lr, [sp, #64]
+ str r12, [sp, #68]
+ str r5, [sp, #72]
+ str r4, [sp, #76]
+ str r7, [sp, #80]
+ str r6, [sp, #84]
+ str r9, [sp, #88]
+ str r8, [sp, #92]
+ ldr r12, [r1, #96]
+ ldr lr, [r1, #100]
+ ldrd r4, r5, [r1, #104]
+ ldrd r6, r7, [r1, #112]
+ ldrd r8, r9, [r1, #120]
+ rev r12, r12
+ rev lr, lr
+ rev r4, r4
+ rev r5, r5
+ rev r6, r6
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ str lr, [sp, #96]
+ str r12, [sp, #100]
+ str r5, [sp, #104]
+ str r4, [sp, #108]
+ str r7, [sp, #112]
+ str r6, [sp, #116]
+ str r9, [sp, #120]
+ str r8, [sp, #124]
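+ # The message is big-endian: rev byte-swaps each 32-bit half, and
+ # the halves are stored exchanged, byte-reversing each 64-bit word.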
+ # Pre-calc: b ^ c
+ ldrd r8, r9, [r0, #8]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r8, r8, r12
+ eor r9, r9, lr
+ mov r10, #4
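+ # Maj(a,b,c) is folded to ((a^b) & (b^c)) ^ b; r8:r9 carry b^c
+ # between rounds, since this round's a^b becomes the next round's
+ # b^c as the working variables shift.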
+ # Start of 16 rounds
+L_sha512_len_neon_start:
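+ # Per round: Sigma1(e) and Sigma0(a) are built from paired 32-bit
+ # shift/orr sequences (64-bit rotates on a register pair), Ch is
+ # folded to ((f^g) & e) ^ g, and h += Sigma1 + Ch + W[i] + K[i]
+ # before d += h and h += Sigma0 + Maj.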
+ # Round 0
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r0, #48]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r6, r7, [sp]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ strd r6, r7, [r0, #24]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0]
+ ldrd r4, r5, [r0, #8]
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #56]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #56]
+ mov r8, r6
+ mov r9, r7
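+ # Schedule update over the circular 16-word buffer:
+ # W[i] += sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]), with
+ # sigma1 = ror19 ^ ror61 ^ shr6 and sigma0 = ror1 ^ ror8 ^ shr7.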
+ # Calc new W[0]
+ ldr r12, [sp, #112]
+ ldr lr, [sp, #116]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp]
+ ldr lr, [sp, #4]
+ ldrd r6, r7, [sp, #72]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp]
+ str lr, [sp, #4]
+ ldr r12, [sp, #8]
+ ldr lr, [sp, #12]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp]
+ ldr lr, [sp, #4]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp]
+ str lr, [sp, #4]
+ # Round 1
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r4, r5, [r0, #32]
+ ldrd r6, r7, [r0, #40]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r6, r7, [sp, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #8]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ strd r6, r7, [r0, #16]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #56]
+ ldrd r4, r5, [r0]
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #48]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #48]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[1]
+ ldr r12, [sp, #120]
+ ldr lr, [sp, #124]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #8]
+ ldr lr, [sp, #12]
+ ldrd r6, r7, [sp, #80]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #8]
+ str lr, [sp, #12]
+ ldr r12, [sp, #16]
+ ldr lr, [sp, #20]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #8]
+ ldr lr, [sp, #12]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #8]
+ str lr, [sp, #12]
+ # Round 2
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r0, #32]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r6, r7, [sp, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #16]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ strd r6, r7, [r0, #8]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #48]
+ ldrd r4, r5, [r0, #56]
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #40]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #40]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[2]
+ ldr r12, [sp]
+ ldr lr, [sp, #4]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #16]
+ ldr lr, [sp, #20]
+ ldrd r6, r7, [sp, #88]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #16]
+ str lr, [sp, #20]
+ ldr r12, [sp, #24]
+ ldr lr, [sp, #28]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #16]
+ ldr lr, [sp, #20]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #16]
+ str lr, [sp, #20]
+ # Round 3
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [r0, #24]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r6, r7, [sp, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #24]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ strd r6, r7, [r0]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #40]
+ ldrd r4, r5, [r0, #48]
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #32]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #32]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[3]
+ ldr r12, [sp, #8]
+ ldr lr, [sp, #12]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #24]
+ ldr lr, [sp, #28]
+ ldrd r6, r7, [sp, #96]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #24]
+ str lr, [sp, #28]
+ ldr r12, [sp, #32]
+ ldr lr, [sp, #36]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #24]
+ ldr lr, [sp, #28]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #24]
+ str lr, [sp, #28]
+ # Round 4
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [r0, #16]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r6, r7, [sp, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #32]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ strd r6, r7, [r0, #56]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #32]
+ ldrd r4, r5, [r0, #40]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #24]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #24]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[4]
+ ldr r12, [sp, #16]
+ ldr lr, [sp, #20]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #32]
+ ldr lr, [sp, #36]
+ ldrd r6, r7, [sp, #104]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #32]
+ str lr, [sp, #36]
+ ldr r12, [sp, #40]
+ ldr lr, [sp, #44]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #32]
+ ldr lr, [sp, #36]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #32]
+ str lr, [sp, #36]
+ # Round 5
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [r0, #8]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r6, r7, [sp, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #40]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ strd r6, r7, [r0, #48]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #24]
+ ldrd r4, r5, [r0, #32]
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #16]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #16]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[5]
+ ldr r12, [sp, #24]
+ ldr lr, [sp, #28]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #40]
+ ldr lr, [sp, #44]
+ ldrd r6, r7, [sp, #112]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #40]
+ str lr, [sp, #44]
+ ldr r12, [sp, #48]
+ ldr lr, [sp, #52]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #40]
+ ldr lr, [sp, #44]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #40]
+ str lr, [sp, #44]
+ # Round 6
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [r0]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r6, r7, [sp, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #48]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ strd r6, r7, [r0, #40]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #16]
+ ldrd r4, r5, [r0, #24]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #8]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #8]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[6]
+ ldr r12, [sp, #32]
+ ldr lr, [sp, #36]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #48]
+ ldr lr, [sp, #52]
+ ldrd r6, r7, [sp, #120]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #48]
+ str lr, [sp, #52]
+ ldr r12, [sp, #56]
+ ldr lr, [sp, #60]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #48]
+ ldr lr, [sp, #52]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #48]
+ str lr, [sp, #52]
+ # Round 7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r4, r5, [r0, #48]
+ ldrd r6, r7, [r0, #56]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r6, r7, [sp, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #56]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ strd r6, r7, [r0, #32]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #8]
+ ldrd r4, r5, [r0, #16]
+ str r12, [r0]
+ str lr, [r0, #4]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[7]
+ ldr r12, [sp, #40]
+ ldr lr, [sp, #44]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #56]
+ ldr lr, [sp, #60]
+ ldrd r6, r7, [sp]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #56]
+ str lr, [sp, #60]
+ ldr r12, [sp, #64]
+ ldr lr, [sp, #68]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #56]
+ ldr lr, [sp, #60]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #56]
+ str lr, [sp, #60]
+ # Round 8
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r0, #48]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r6, r7, [sp, #64]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #64]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ strd r6, r7, [r0, #24]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0]
+ ldrd r4, r5, [r0, #8]
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #56]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #56]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[8]
+ ldr r12, [sp, #48]
+ ldr lr, [sp, #52]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #64]
+ ldr lr, [sp, #68]
+ ldrd r6, r7, [sp, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #64]
+ str lr, [sp, #68]
+ ldr r12, [sp, #72]
+ ldr lr, [sp, #76]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #64]
+ ldr lr, [sp, #68]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #64]
+ str lr, [sp, #68]
+ # Round 9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r4, r5, [r0, #32]
+ ldrd r6, r7, [r0, #40]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r6, r7, [sp, #72]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #72]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ strd r6, r7, [r0, #16]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #56]
+ ldrd r4, r5, [r0]
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #48]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #48]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[9]
+ ldr r12, [sp, #56]
+ ldr lr, [sp, #60]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #72]
+ ldr lr, [sp, #76]
+ ldrd r6, r7, [sp, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #72]
+ str lr, [sp, #76]
+ ldr r12, [sp, #80]
+ ldr lr, [sp, #84]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #72]
+ ldr lr, [sp, #76]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #72]
+ str lr, [sp, #76]
+ # Round 10
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r0, #32]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r6, r7, [sp, #80]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #80]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ strd r6, r7, [r0, #8]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #48]
+ ldrd r4, r5, [r0, #56]
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #40]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #40]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[10]
+ ldr r12, [sp, #64]
+ ldr lr, [sp, #68]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #80]
+ ldr lr, [sp, #84]
+ ldrd r6, r7, [sp, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #80]
+ str lr, [sp, #84]
+ ldr r12, [sp, #88]
+ ldr lr, [sp, #92]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #80]
+ ldr lr, [sp, #84]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #80]
+ str lr, [sp, #84]
+ # Round 11
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [r0, #24]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r6, r7, [sp, #88]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #88]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ strd r6, r7, [r0]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #40]
+ ldrd r4, r5, [r0, #48]
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #32]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #32]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[11]
+ ldr r12, [sp, #72]
+ ldr lr, [sp, #76]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #88]
+ ldr lr, [sp, #92]
+ ldrd r6, r7, [sp, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #88]
+ str lr, [sp, #92]
+ ldr r12, [sp, #96]
+ ldr lr, [sp, #100]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #88]
+ ldr lr, [sp, #92]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #88]
+ str lr, [sp, #92]
+ # Round 12
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [r0, #16]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r6, r7, [sp, #96]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #96]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ strd r6, r7, [r0, #56]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #32]
+ ldrd r4, r5, [r0, #40]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #24]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #24]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[12]
+ ldr r12, [sp, #80]
+ ldr lr, [sp, #84]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #96]
+ ldr lr, [sp, #100]
+ ldrd r6, r7, [sp, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #96]
+ str lr, [sp, #100]
+ ldr r12, [sp, #104]
+ ldr lr, [sp, #108]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #96]
+ ldr lr, [sp, #100]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #96]
+ str lr, [sp, #100]
+ # Round 13
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [r0, #8]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r6, r7, [sp, #104]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #104]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ strd r6, r7, [r0, #48]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #24]
+ ldrd r4, r5, [r0, #32]
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #16]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #16]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[13]
+ ldr r12, [sp, #88]
+ ldr lr, [sp, #92]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #104]
+ ldr lr, [sp, #108]
+ ldrd r6, r7, [sp, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #104]
+ str lr, [sp, #108]
+ ldr r12, [sp, #112]
+ ldr lr, [sp, #116]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #104]
+ ldr lr, [sp, #108]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #104]
+ str lr, [sp, #108]
+ # Round 14
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [r0]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r6, r7, [sp, #112]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #112]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ strd r6, r7, [r0, #40]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #16]
+ ldrd r4, r5, [r0, #24]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #8]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #8]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[14]
+ ldr r12, [sp, #96]
+ ldr lr, [sp, #100]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #112]
+ ldr lr, [sp, #116]
+ ldrd r6, r7, [sp, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #112]
+ str lr, [sp, #116]
+ ldr r12, [sp, #120]
+ ldr lr, [sp, #124]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #112]
+ ldr lr, [sp, #116]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #112]
+ str lr, [sp, #116]
+ # Round 15
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r4, r5, [r0, #48]
+ ldrd r6, r7, [r0, #56]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r6, r7, [sp, #120]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #120]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ strd r6, r7, [r0, #32]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #8]
+ ldrd r4, r5, [r0, #16]
+ str r12, [r0]
+ str lr, [r0, #4]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0]
+ mov r8, r6
+ mov r9, r7
+ # Calc new W[15]
+ ldr r12, [sp, #104]
+ ldr lr, [sp, #108]
+ lsrs r4, r12, #19
+ lsrs r5, lr, #19
+ orr r5, r5, r12, lsl #13
+ orr r4, r4, lr, lsl #13
+ lsls r6, r12, #3
+ lsls r7, lr, #3
+ orr r7, r7, r12, lsr #29
+ orr r6, r6, lr, lsr #29
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #6
+ lsrs r7, lr, #6
+ orr r6, r6, lr, lsl #26
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #120]
+ ldr lr, [sp, #124]
+ ldrd r6, r7, [sp, #64]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ adds r12, r12, r6
+ adc lr, lr, r7
+ str r12, [sp, #120]
+ str lr, [sp, #124]
+ ldr r12, [sp]
+ ldr lr, [sp, #4]
+ lsrs r4, r12, #1
+ lsrs r5, lr, #1
+ orr r5, r5, r12, lsl #31
+ orr r4, r4, lr, lsl #31
+ lsrs r6, r12, #8
+ lsrs r7, lr, #8
+ orr r7, r7, r12, lsl #24
+ orr r6, r6, lr, lsl #24
+ eor r5, r5, r7
+ eor r4, r4, r6
+ lsrs r6, r12, #7
+ lsrs r7, lr, #7
+ orr r6, r6, lr, lsl #25
+ eor r5, r5, r7
+ eor r4, r4, r6
+ ldr r12, [sp, #120]
+ ldr lr, [sp, #124]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [sp, #120]
+ str lr, [sp, #124]
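+ # One 16-round pass done: step past the 16 K constants just used
+ # (16 * 8 = 0x80 bytes) and loop while the pass counter r10 is non-zero.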
+ add r3, r3, #0x80
+ subs r10, r10, #1
+ bne L_sha512_len_neon_start
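+ # Final 16 rounds (64-79): same round function, but W is no longer
+ # extended, so the "Calc new W" blocks are absent.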
+ # Round 0
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r0, #48]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r6, r7, [sp]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ strd r6, r7, [r0, #24]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0]
+ ldrd r4, r5, [r0, #8]
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #56]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #56]
+ mov r8, r6
+ mov r9, r7
+ # Round 1
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r4, r5, [r0, #32]
+ ldrd r6, r7, [r0, #40]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r6, r7, [sp, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #8]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ strd r6, r7, [r0, #16]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #56]
+ ldrd r4, r5, [r0]
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #48]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #48]
+ mov r8, r6
+ mov r9, r7
+ # Round 2
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r0, #32]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r6, r7, [sp, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #16]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ strd r6, r7, [r0, #8]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #48]
+ ldrd r4, r5, [r0, #56]
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #40]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #40]
+ mov r8, r6
+ mov r9, r7
+ # Round 3
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [r0, #24]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r6, r7, [sp, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #24]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ strd r6, r7, [r0]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #40]
+ ldrd r4, r5, [r0, #48]
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #32]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #32]
+ mov r8, r6
+ mov r9, r7
+ # Round 4
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [r0, #16]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r6, r7, [sp, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #32]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ strd r6, r7, [r0, #56]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #32]
+ ldrd r4, r5, [r0, #40]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #24]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #24]
+ mov r8, r6
+ mov r9, r7
+ # Round 5
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [r0, #8]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r6, r7, [sp, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #40]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ strd r6, r7, [r0, #48]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #24]
+ ldrd r4, r5, [r0, #32]
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #16]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #16]
+ mov r8, r6
+ mov r9, r7
+ # Round 6
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [r0]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r6, r7, [sp, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #48]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ strd r6, r7, [r0, #40]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #16]
+ ldrd r4, r5, [r0, #24]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #8]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #8]
+ mov r8, r6
+ mov r9, r7
+ # Round 7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r4, r5, [r0, #48]
+ ldrd r6, r7, [r0, #56]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r6, r7, [sp, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #56]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ strd r6, r7, [r0, #32]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #8]
+ ldrd r4, r5, [r0, #16]
+ str r12, [r0]
+ str lr, [r0, #4]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0]
+ mov r8, r6
+ mov r9, r7
+ # Round 8
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r0, #48]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r6, r7, [sp, #64]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #64]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #24]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ strd r6, r7, [r0, #24]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0]
+ ldrd r4, r5, [r0, #8]
+ str r12, [r0, #56]
+ str lr, [r0, #60]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #56]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #56]
+ mov r8, r6
+ mov r9, r7
+ # Round 9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r4, r5, [r0, #32]
+ ldrd r6, r7, [r0, #40]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r6, r7, [sp, #72]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #72]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #16]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ strd r6, r7, [r0, #16]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #56]
+ ldrd r4, r5, [r0]
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #48]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #48]
+ mov r8, r6
+ mov r9, r7
+ # Round 10
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r0, #32]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r6, r7, [sp, #80]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #80]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #8]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ strd r6, r7, [r0, #8]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #48]
+ ldrd r4, r5, [r0, #56]
+ str r12, [r0, #40]
+ str lr, [r0, #44]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #40]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #40]
+ mov r8, r6
+ mov r9, r7
+ # Round 11
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r4, r5, [r0, #16]
+ ldrd r6, r7, [r0, #24]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r6, r7, [sp, #88]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #88]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ strd r6, r7, [r0]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #40]
+ ldrd r4, r5, [r0, #48]
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #32]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #32]
+ mov r8, r6
+ mov r9, r7
+ # Round 12
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [r0, #16]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ ldrd r6, r7, [sp, #96]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #96]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #56]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ strd r6, r7, [r0, #56]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #32]
+ ldrd r4, r5, [r0, #40]
+ str r12, [r0, #24]
+ str lr, [r0, #28]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #24]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #24]
+ mov r8, r6
+ mov r9, r7
+ # Round 13
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ ldr r12, [r0, #56]
+ ldr lr, [r0, #60]
+ ldrd r4, r5, [r0]
+ ldrd r6, r7, [r0, #8]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r6, r7, [sp, #104]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #104]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #48]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #24]
+ ldr lr, [r0, #28]
+ strd r6, r7, [r0, #48]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #24]
+ ldrd r4, r5, [r0, #32]
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #16]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #16]
+ mov r8, r6
+ mov r9, r7
+ # Round 14
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [r0]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ ldrd r6, r7, [sp, #112]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #112]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #40]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ strd r6, r7, [r0, #40]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #16]
+ ldrd r4, r5, [r0, #24]
+ str r12, [r0, #8]
+ str lr, [r0, #12]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0, #8]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0, #8]
+ mov r8, r6
+ mov r9, r7
+ # Round 15
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ lsrs r4, r12, #14
+ lsrs r5, lr, #14
+ orr r5, r5, r12, lsl #18
+ orr r4, r4, lr, lsl #18
+ lsrs r6, r12, #18
+ lsrs r7, lr, #18
+ orr r7, r7, r12, lsl #14
+ orr r6, r6, lr, lsl #14
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #23
+ lsls r7, lr, #23
+ orr r7, r7, r12, lsr #9
+ orr r6, r6, lr, lsr #9
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ ldr r12, [r0, #40]
+ ldr lr, [r0, #44]
+ ldrd r4, r5, [r0, #48]
+ ldrd r6, r7, [r0, #56]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ and r4, r4, r12
+ and r5, r5, lr
+ eor r4, r4, r6
+ eor r5, r5, r7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r6, r7, [sp, #120]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r4, r5, [r3, #120]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ ldrd r6, r7, [r0, #32]
+ adds r12, r12, r4
+ adc lr, lr, r5
+ str r12, [r0]
+ str lr, [r0, #4]
+ adds r6, r6, r12
+ adc r7, r7, lr
+ ldr r12, [r0, #8]
+ ldr lr, [r0, #12]
+ strd r6, r7, [r0, #32]
+ lsrs r4, r12, #28
+ lsrs r5, lr, #28
+ orr r5, r5, r12, lsl #4
+ orr r4, r4, lr, lsl #4
+ lsls r6, r12, #30
+ lsls r7, lr, #30
+ orr r7, r7, r12, lsr #2
+ orr r6, r6, lr, lsr #2
+ eor r4, r4, r6
+ eor r5, r5, r7
+ lsls r6, r12, #25
+ lsls r7, lr, #25
+ orr r7, r7, r12, lsr #7
+ orr r6, r6, lr, lsr #7
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ eor r4, r4, r6
+ eor r5, r5, r7
+ adds r12, r12, r4
+ adc lr, lr, r5
+ ldrd r6, r7, [r0, #8]
+ ldrd r4, r5, [r0, #16]
+ str r12, [r0]
+ str lr, [r0, #4]
+ eor r6, r6, r4
+ eor r7, r7, r5
+ and r8, r8, r6
+ and r9, r9, r7
+ eor r8, r8, r4
+ eor r9, r9, r5
+ ldrd r4, r5, [r0]
+ adds r4, r4, r8
+ adc r5, r5, r9
+ strd r4, r5, [r0]
+ mov r8, r6
+ mov r9, r7
+ # Add in digest from start
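+ # Add the working state to the digest saved at [sp, #128..191], then
+ # store the sums both to the hash state at [r0] and back to the stack
+ # copy, which seeds the next block.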
+ ldr r12, [r0]
+ ldr lr, [r0, #4]
+ ldrd r4, r5, [r0, #8]
+ ldrd r6, r7, [sp, #128]
+ ldrd r8, r9, [sp, #136]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ adds r4, r4, r8
+ adc r5, r5, r9
+ str r12, [r0]
+ str lr, [r0, #4]
+ strd r4, r5, [r0, #8]
+ str r12, [sp, #128]
+ str lr, [sp, #132]
+ strd r4, r5, [sp, #136]
+ ldr r12, [r0, #16]
+ ldr lr, [r0, #20]
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [sp, #144]
+ ldrd r8, r9, [sp, #152]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ adds r4, r4, r8
+ adc r5, r5, r9
+ str r12, [r0, #16]
+ str lr, [r0, #20]
+ strd r4, r5, [r0, #24]
+ str r12, [sp, #144]
+ str lr, [sp, #148]
+ strd r4, r5, [sp, #152]
+ ldr r12, [r0, #32]
+ ldr lr, [r0, #36]
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [sp, #160]
+ ldrd r8, r9, [sp, #168]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ adds r4, r4, r8
+ adc r5, r5, r9
+ str r12, [r0, #32]
+ str lr, [r0, #36]
+ strd r4, r5, [r0, #40]
+ str r12, [sp, #160]
+ str lr, [sp, #164]
+ strd r4, r5, [sp, #168]
+ ldr r12, [r0, #48]
+ ldr lr, [r0, #52]
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [sp, #176]
+ ldrd r8, r9, [sp, #184]
+ adds r12, r12, r6
+ adc lr, lr, r7
+ adds r4, r4, r8
+ adc r5, r5, r9
+ str r12, [r0, #48]
+ str lr, [r0, #52]
+ strd r4, r5, [r0, #56]
+ str r12, [sp, #176]
+ str lr, [sp, #180]
+ strd r4, r5, [sp, #184]
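+ # Next block: count down the remaining length (block size 0x80), rewind
+ # K to the table start (the four schedule passes advanced r3 by 0x200;
+ # the final rounds read K[64..79] via offsets) and step the data pointer.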
+ subs r2, r2, #0x80
+ sub r3, r3, #0x200
+ add r1, r1, #0x80
+ bne L_sha512_len_neon_begin
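+ # All data processed: zero the return value, release the 192-byte
+ # (0xc0) frame and restore callee-saved registers.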
+ eor r0, r0, r0
+ add sp, sp, #0xc0
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+ .size Transform_Sha512_Len,.-Transform_Sha512_Len
+#endif /* WOLFSSL_ARMASM_NO_NEON */
+#ifndef WOLFSSL_ARMASM_NO_NEON
+ .text
+ .type L_SHA512_transform_neon_len_k, %object
+ .size L_SHA512_transform_neon_len_k, 640
+ .align 3
+L_SHA512_transform_neon_len_k:
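+ # SHA-512 round constants K[0..79]; each 64-bit constant is emitted as
+ # two .word entries, low 32 bits first (little-endian doubleword layout).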
+ .word 0xd728ae22
+ .word 0x428a2f98
+ .word 0x23ef65cd
+ .word 0x71374491
+ .word 0xec4d3b2f
+ .word 0xb5c0fbcf
+ .word 0x8189dbbc
+ .word 0xe9b5dba5
+ .word 0xf348b538
+ .word 0x3956c25b
+ .word 0xb605d019
+ .word 0x59f111f1
+ .word 0xaf194f9b
+ .word 0x923f82a4
+ .word 0xda6d8118
+ .word 0xab1c5ed5
+ .word 0xa3030242
+ .word 0xd807aa98
+ .word 0x45706fbe
+ .word 0x12835b01
+ .word 0x4ee4b28c
+ .word 0x243185be
+ .word 0xd5ffb4e2
+ .word 0x550c7dc3
+ .word 0xf27b896f
+ .word 0x72be5d74
+ .word 0x3b1696b1
+ .word 0x80deb1fe
+ .word 0x25c71235
+ .word 0x9bdc06a7
+ .word 0xcf692694
+ .word 0xc19bf174
+ .word 0x9ef14ad2
+ .word 0xe49b69c1
+ .word 0x384f25e3
+ .word 0xefbe4786
+ .word 0x8b8cd5b5
+ .word 0xfc19dc6
+ .word 0x77ac9c65
+ .word 0x240ca1cc
+ .word 0x592b0275
+ .word 0x2de92c6f
+ .word 0x6ea6e483
+ .word 0x4a7484aa
+ .word 0xbd41fbd4
+ .word 0x5cb0a9dc
+ .word 0x831153b5
+ .word 0x76f988da
+ .word 0xee66dfab
+ .word 0x983e5152
+ .word 0x2db43210
+ .word 0xa831c66d
+ .word 0x98fb213f
+ .word 0xb00327c8
+ .word 0xbeef0ee4
+ .word 0xbf597fc7
+ .word 0x3da88fc2
+ .word 0xc6e00bf3
+ .word 0x930aa725
+ .word 0xd5a79147
+ .word 0xe003826f
+ .word 0x6ca6351
+ .word 0xa0e6e70
+ .word 0x14292967
+ .word 0x46d22ffc
+ .word 0x27b70a85
+ .word 0x5c26c926
+ .word 0x2e1b2138
+ .word 0x5ac42aed
+ .word 0x4d2c6dfc
+ .word 0x9d95b3df
+ .word 0x53380d13
+ .word 0x8baf63de
+ .word 0x650a7354
+ .word 0x3c77b2a8
+ .word 0x766a0abb
+ .word 0x47edaee6
+ .word 0x81c2c92e
+ .word 0x1482353b
+ .word 0x92722c85
+ .word 0x4cf10364
+ .word 0xa2bfe8a1
+ .word 0xbc423001
+ .word 0xa81a664b
+ .word 0xd0f89791
+ .word 0xc24b8b70
+ .word 0x654be30
+ .word 0xc76c51a3
+ .word 0xd6ef5218
+ .word 0xd192e819
+ .word 0x5565a910
+ .word 0xd6990624
+ .word 0x5771202a
+ .word 0xf40e3585
+ .word 0x32bbd1b8
+ .word 0x106aa070
+ .word 0xb8d2d0c8
+ .word 0x19a4c116
+ .word 0x5141ab53
+ .word 0x1e376c08
+ .word 0xdf8eeb99
+ .word 0x2748774c
+ .word 0xe19b48a8
+ .word 0x34b0bcb5
+ .word 0xc5c95a63
+ .word 0x391c0cb3
+ .word 0xe3418acb
+ .word 0x4ed8aa4a
+ .word 0x7763e373
+ .word 0x5b9cca4f
+ .word 0xd6b2b8a3
+ .word 0x682e6ff3
+ .word 0x5defb2fc
+ .word 0x748f82ee
+ .word 0x43172f60
+ .word 0x78a5636f
+ .word 0xa1f0ab72
+ .word 0x84c87814
+ .word 0x1a6439ec
+ .word 0x8cc70208
+ .word 0x23631e28
+ .word 0x90befffa
+ .word 0xde82bde9
+ .word 0xa4506ceb
+ .word 0xb2c67915
+ .word 0xbef9a3f7
+ .word 0xe372532b
+ .word 0xc67178f2
+ .word 0xea26619c
+ .word 0xca273ece
+ .word 0x21c0c207
+ .word 0xd186b8c7
+ .word 0xcde0eb1e
+ .word 0xeada7dd6
+ .word 0xee6ed178
+ .word 0xf57d4f7f
+ .word 0x72176fba
+ .word 0x6f067aa
+ .word 0xa2c898a6
+ .word 0xa637dc5
+ .word 0xbef90dae
+ .word 0x113f9804
+ .word 0x131c471b
+ .word 0x1b710b35
+ .word 0x23047d84
+ .word 0x28db77f5
+ .word 0x40c72493
+ .word 0x32caab7b
+ .word 0x15c9bebc
+ .word 0x3c9ebe0a
+ .word 0x9c100d4c
+ .word 0x431d67c4
+ .word 0xcb3e42b6
+ .word 0x4cc5d4be
+ .word 0xfc657e2a
+ .word 0x597f299c
+ .word 0x3ad6faec
+ .word 0x5fcb6fab
+ .word 0x4a475817
+ .word 0x6c44198c
+ .text
+ .align 2
+ .globl Transform_Sha512_Len
+ .type Transform_Sha512_Len, %function
+Transform_Sha512_Len:
+ vpush {d8-d15}
+ # Load digest into working vars
+ vldm.64 r0, {d0-d7}
+ # Start of loop processing a block
+L_sha512_len_neon_begin:
+ # Load W
+ vldm.64 r1!, {d16-d31}
+ vrev64.8 q8, q8
+ vrev64.8 q9, q9
+ vrev64.8 q10, q10
+ vrev64.8 q11, q11
+ vrev64.8 q12, q12
+ vrev64.8 q13, q13
+ vrev64.8 q14, q14
+ vrev64.8 q15, q15
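+ # (message words are big-endian per the SHA-512 specification; vrev64.8
+ # byte-swaps each 64-bit lane into host order)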
+ adr r3, L_SHA512_transform_neon_len_k
+ mov r12, #4
+ # Start of 16 rounds, run 4 times (rounds 0-63); rounds 64-79 are unrolled below
+L_sha512_len_neon_start:
+ # Round 0
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d4, #50
+ vsri.u64 d8, d4, #14
+ vshl.u64 d9, d0, #36
+ vsri.u64 d9, d0, #28
+ vshl.u64 d10, d4, #46
+ vsri.u64 d10, d4, #18
+ vshl.u64 d11, d0, #30
+ vsri.u64 d11, d0, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d4, #23
+ vsri.u64 d10, d4, #41
+ vshl.u64 d11, d0, #25
+ vsri.u64 d11, d0, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d7, d8
+ vadd.i64 d12, d16
+ vmov d8, d4
+ veor d10, d1, d2
+ vadd.i64 d7, d12
+ vbsl d8, d5, d6
+ vbsl d10, d0, d2
+ vadd.i64 d7, d8
+ vadd.i64 d10, d9
+ vadd.i64 d3, d7
+ vadd.i64 d7, d10
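+ # (each round above: the vshl.u64/vsri.u64 pairs form the 64-bit rotations
+ # of Sigma1(e) and Sigma0(a), vbsl computes Ch(e,f,g) and Maj(a,b,c), and
+ # the vadd.i64 chain accumulates h + Sigma1 + Ch + K[t] + W[t])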
+ # Round 1
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d3, #50
+ vsri.u64 d8, d3, #14
+ vshl.u64 d9, d7, #36
+ vsri.u64 d9, d7, #28
+ vshl.u64 d10, d3, #46
+ vsri.u64 d10, d3, #18
+ vshl.u64 d11, d7, #30
+ vsri.u64 d11, d7, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d3, #23
+ vsri.u64 d10, d3, #41
+ vshl.u64 d11, d7, #25
+ vsri.u64 d11, d7, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d6, d8
+ vadd.i64 d12, d17
+ vmov d8, d3
+ veor d10, d0, d1
+ vadd.i64 d6, d12
+ vbsl d8, d4, d5
+ vbsl d10, d7, d1
+ vadd.i64 d6, d8
+ vadd.i64 d10, d9
+ vadd.i64 d2, d6
+ vadd.i64 d6, d10
+ # Calc new W[0]-W[1]
+ vext.8 q6, q8, q9, #8
+ vshl.u64 q4, q15, #45
+ vsri.u64 q4, q15, #19
+ vshl.u64 q5, q15, #3
+ vsri.u64 q5, q15, #61
+ veor q5, q4
+ vshr.u64 q4, q15, #6
+ veor q5, q4
+ vadd.i64 q8, q5
+ vext.8 q7, q12, q13, #8
+ vadd.i64 q8, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q8, q5
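+ # (two schedule words at a time: W[t] += sigma1(W[t-2]) + W[t-7] +
+ # sigma0(W[t-15]), with sigma1 = ror19 ^ ror61 ^ shr6 and
+ # sigma0 = ror1 ^ ror8 ^ shr7)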
+ # Round 2
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d2, #50
+ vsri.u64 d8, d2, #14
+ vshl.u64 d9, d6, #36
+ vsri.u64 d9, d6, #28
+ vshl.u64 d10, d2, #46
+ vsri.u64 d10, d2, #18
+ vshl.u64 d11, d6, #30
+ vsri.u64 d11, d6, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d2, #23
+ vsri.u64 d10, d2, #41
+ vshl.u64 d11, d6, #25
+ vsri.u64 d11, d6, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d5, d8
+ vadd.i64 d12, d18
+ vmov d8, d2
+ veor d10, d7, d0
+ vadd.i64 d5, d12
+ vbsl d8, d3, d4
+ vbsl d10, d6, d0
+ vadd.i64 d5, d8
+ vadd.i64 d10, d9
+ vadd.i64 d1, d5
+ vadd.i64 d5, d10
+ # Round 3
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d1, #50
+ vsri.u64 d8, d1, #14
+ vshl.u64 d9, d5, #36
+ vsri.u64 d9, d5, #28
+ vshl.u64 d10, d1, #46
+ vsri.u64 d10, d1, #18
+ vshl.u64 d11, d5, #30
+ vsri.u64 d11, d5, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d1, #23
+ vsri.u64 d10, d1, #41
+ vshl.u64 d11, d5, #25
+ vsri.u64 d11, d5, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d4, d8
+ vadd.i64 d12, d19
+ vmov d8, d1
+ veor d10, d6, d7
+ vadd.i64 d4, d12
+ vbsl d8, d2, d3
+ vbsl d10, d5, d7
+ vadd.i64 d4, d8
+ vadd.i64 d10, d9
+ vadd.i64 d0, d4
+ vadd.i64 d4, d10
+ # Calc new W[2]-W[3]
+ vext.8 q6, q9, q10, #8
+ vshl.u64 q4, q8, #45
+ vsri.u64 q4, q8, #19
+ vshl.u64 q5, q8, #3
+ vsri.u64 q5, q8, #61
+ veor q5, q4
+ vshr.u64 q4, q8, #6
+ veor q5, q4
+ vadd.i64 q9, q5
+ vext.8 q7, q13, q14, #8
+ vadd.i64 q9, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q9, q5
+ # Round 4
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d0, #50
+ vsri.u64 d8, d0, #14
+ vshl.u64 d9, d4, #36
+ vsri.u64 d9, d4, #28
+ vshl.u64 d10, d0, #46
+ vsri.u64 d10, d0, #18
+ vshl.u64 d11, d4, #30
+ vsri.u64 d11, d4, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d0, #23
+ vsri.u64 d10, d0, #41
+ vshl.u64 d11, d4, #25
+ vsri.u64 d11, d4, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d3, d8
+ vadd.i64 d12, d20
+ vmov d8, d0
+ veor d10, d5, d6
+ vadd.i64 d3, d12
+ vbsl d8, d1, d2
+ vbsl d10, d4, d6
+ vadd.i64 d3, d8
+ vadd.i64 d10, d9
+ vadd.i64 d7, d3
+ vadd.i64 d3, d10
+ # Round 5
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d7, #50
+ vsri.u64 d8, d7, #14
+ vshl.u64 d9, d3, #36
+ vsri.u64 d9, d3, #28
+ vshl.u64 d10, d7, #46
+ vsri.u64 d10, d7, #18
+ vshl.u64 d11, d3, #30
+ vsri.u64 d11, d3, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d7, #23
+ vsri.u64 d10, d7, #41
+ vshl.u64 d11, d3, #25
+ vsri.u64 d11, d3, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d2, d8
+ vadd.i64 d12, d21
+ vmov d8, d7
+ veor d10, d4, d5
+ vadd.i64 d2, d12
+ vbsl d8, d0, d1
+ vbsl d10, d3, d5
+ vadd.i64 d2, d8
+ vadd.i64 d10, d9
+ vadd.i64 d6, d2
+ vadd.i64 d2, d10
+ # Calc new W[4]-W[5]
+ vext.8 q6, q10, q11, #8
+ vshl.u64 q4, q9, #45
+ vsri.u64 q4, q9, #19
+ vshl.u64 q5, q9, #3
+ vsri.u64 q5, q9, #61
+ veor q5, q4
+ vshr.u64 q4, q9, #6
+ veor q5, q4
+ vadd.i64 q10, q5
+ vext.8 q7, q14, q15, #8
+ vadd.i64 q10, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q10, q5
+ # Round 6
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d6, #50
+ vsri.u64 d8, d6, #14
+ vshl.u64 d9, d2, #36
+ vsri.u64 d9, d2, #28
+ vshl.u64 d10, d6, #46
+ vsri.u64 d10, d6, #18
+ vshl.u64 d11, d2, #30
+ vsri.u64 d11, d2, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d6, #23
+ vsri.u64 d10, d6, #41
+ vshl.u64 d11, d2, #25
+ vsri.u64 d11, d2, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d1, d8
+ vadd.i64 d12, d22
+ vmov d8, d6
+ veor d10, d3, d4
+ vadd.i64 d1, d12
+ vbsl d8, d7, d0
+ vbsl d10, d2, d4
+ vadd.i64 d1, d8
+ vadd.i64 d10, d9
+ vadd.i64 d5, d1
+ vadd.i64 d1, d10
+ # Round 7
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d5, #50
+ vsri.u64 d8, d5, #14
+ vshl.u64 d9, d1, #36
+ vsri.u64 d9, d1, #28
+ vshl.u64 d10, d5, #46
+ vsri.u64 d10, d5, #18
+ vshl.u64 d11, d1, #30
+ vsri.u64 d11, d1, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d5, #23
+ vsri.u64 d10, d5, #41
+ vshl.u64 d11, d1, #25
+ vsri.u64 d11, d1, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d0, d8
+ vadd.i64 d12, d23
+ vmov d8, d5
+ veor d10, d2, d3
+ vadd.i64 d0, d12
+ vbsl d8, d6, d7
+ vbsl d10, d1, d3
+ vadd.i64 d0, d8
+ vadd.i64 d10, d9
+ vadd.i64 d4, d0
+ vadd.i64 d0, d10
+ # Calc new W[6]-W[7]
+ vext.8 q6, q11, q12, #8
+ vshl.u64 q4, q10, #45
+ vsri.u64 q4, q10, #19
+ vshl.u64 q5, q10, #3
+ vsri.u64 q5, q10, #61
+ veor q5, q4
+ vshr.u64 q4, q10, #6
+ veor q5, q4
+ vadd.i64 q11, q5
+ vext.8 q7, q15, q8, #8
+ vadd.i64 q11, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q11, q5
+ # Round 8
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d4, #50
+ vsri.u64 d8, d4, #14
+ vshl.u64 d9, d0, #36
+ vsri.u64 d9, d0, #28
+ vshl.u64 d10, d4, #46
+ vsri.u64 d10, d4, #18
+ vshl.u64 d11, d0, #30
+ vsri.u64 d11, d0, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d4, #23
+ vsri.u64 d10, d4, #41
+ vshl.u64 d11, d0, #25
+ vsri.u64 d11, d0, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d7, d8
+ vadd.i64 d12, d24
+ vmov d8, d4
+ veor d10, d1, d2
+ vadd.i64 d7, d12
+ vbsl d8, d5, d6
+ vbsl d10, d0, d2
+ vadd.i64 d7, d8
+ vadd.i64 d10, d9
+ vadd.i64 d3, d7
+ vadd.i64 d7, d10
+ # Round 9
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d3, #50
+ vsri.u64 d8, d3, #14
+ vshl.u64 d9, d7, #36
+ vsri.u64 d9, d7, #28
+ vshl.u64 d10, d3, #46
+ vsri.u64 d10, d3, #18
+ vshl.u64 d11, d7, #30
+ vsri.u64 d11, d7, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d3, #23
+ vsri.u64 d10, d3, #41
+ vshl.u64 d11, d7, #25
+ vsri.u64 d11, d7, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d6, d8
+ vadd.i64 d12, d25
+ vmov d8, d3
+ veor d10, d0, d1
+ vadd.i64 d6, d12
+ vbsl d8, d4, d5
+ vbsl d10, d7, d1
+ vadd.i64 d6, d8
+ vadd.i64 d10, d9
+ vadd.i64 d2, d6
+ vadd.i64 d6, d10
+ # Calc new W[8]-W[9]
+ vext.8 q6, q12, q13, #8
+ vshl.u64 q4, q11, #45
+ vsri.u64 q4, q11, #19
+ vshl.u64 q5, q11, #3
+ vsri.u64 q5, q11, #61
+ veor q5, q4
+ vshr.u64 q4, q11, #6
+ veor q5, q4
+ vadd.i64 q12, q5
+ vext.8 q7, q8, q9, #8
+ vadd.i64 q12, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q12, q5
+ # Round 10
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d2, #50
+ vsri.u64 d8, d2, #14
+ vshl.u64 d9, d6, #36
+ vsri.u64 d9, d6, #28
+ vshl.u64 d10, d2, #46
+ vsri.u64 d10, d2, #18
+ vshl.u64 d11, d6, #30
+ vsri.u64 d11, d6, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d2, #23
+ vsri.u64 d10, d2, #41
+ vshl.u64 d11, d6, #25
+ vsri.u64 d11, d6, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d5, d8
+ vadd.i64 d12, d26
+ vmov d8, d2
+ veor d10, d7, d0
+ vadd.i64 d5, d12
+ vbsl d8, d3, d4
+ vbsl d10, d6, d0
+ vadd.i64 d5, d8
+ vadd.i64 d10, d9
+ vadd.i64 d1, d5
+ vadd.i64 d5, d10
+ # Round 11
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d1, #50
+ vsri.u64 d8, d1, #14
+ vshl.u64 d9, d5, #36
+ vsri.u64 d9, d5, #28
+ vshl.u64 d10, d1, #46
+ vsri.u64 d10, d1, #18
+ vshl.u64 d11, d5, #30
+ vsri.u64 d11, d5, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d1, #23
+ vsri.u64 d10, d1, #41
+ vshl.u64 d11, d5, #25
+ vsri.u64 d11, d5, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d4, d8
+ vadd.i64 d12, d27
+ vmov d8, d1
+ veor d10, d6, d7
+ vadd.i64 d4, d12
+ vbsl d8, d2, d3
+ vbsl d10, d5, d7
+ vadd.i64 d4, d8
+ vadd.i64 d10, d9
+ vadd.i64 d0, d4
+ vadd.i64 d4, d10
+ # Calc new W[10]-W[11]
+ vext.8 q6, q13, q14, #8
+ vshl.u64 q4, q12, #45
+ vsri.u64 q4, q12, #19
+ vshl.u64 q5, q12, #3
+ vsri.u64 q5, q12, #61
+ veor q5, q4
+ vshr.u64 q4, q12, #6
+ veor q5, q4
+ vadd.i64 q13, q5
+ vext.8 q7, q9, q10, #8
+ vadd.i64 q13, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q13, q5
+ # Round 12
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d0, #50
+ vsri.u64 d8, d0, #14
+ vshl.u64 d9, d4, #36
+ vsri.u64 d9, d4, #28
+ vshl.u64 d10, d0, #46
+ vsri.u64 d10, d0, #18
+ vshl.u64 d11, d4, #30
+ vsri.u64 d11, d4, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d0, #23
+ vsri.u64 d10, d0, #41
+ vshl.u64 d11, d4, #25
+ vsri.u64 d11, d4, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d3, d8
+ vadd.i64 d12, d28
+ vmov d8, d0
+ veor d10, d5, d6
+ vadd.i64 d3, d12
+ vbsl d8, d1, d2
+ vbsl d10, d4, d6
+ vadd.i64 d3, d8
+ vadd.i64 d10, d9
+ vadd.i64 d7, d3
+ vadd.i64 d3, d10
+ # Round 13
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d7, #50
+ vsri.u64 d8, d7, #14
+ vshl.u64 d9, d3, #36
+ vsri.u64 d9, d3, #28
+ vshl.u64 d10, d7, #46
+ vsri.u64 d10, d7, #18
+ vshl.u64 d11, d3, #30
+ vsri.u64 d11, d3, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d7, #23
+ vsri.u64 d10, d7, #41
+ vshl.u64 d11, d3, #25
+ vsri.u64 d11, d3, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d2, d8
+ vadd.i64 d12, d29
+ vmov d8, d7
+ veor d10, d4, d5
+ vadd.i64 d2, d12
+ vbsl d8, d0, d1
+ vbsl d10, d3, d5
+ vadd.i64 d2, d8
+ vadd.i64 d10, d9
+ vadd.i64 d6, d2
+ vadd.i64 d2, d10
+ # Calc new W[12]-W[13]
+ vext.8 q6, q14, q15, #8
+ vshl.u64 q4, q13, #45
+ vsri.u64 q4, q13, #19
+ vshl.u64 q5, q13, #3
+ vsri.u64 q5, q13, #61
+ veor q5, q4
+ vshr.u64 q4, q13, #6
+ veor q5, q4
+ vadd.i64 q14, q5
+ vext.8 q7, q10, q11, #8
+ vadd.i64 q14, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q14, q5
+ # Round 14
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d6, #50
+ vsri.u64 d8, d6, #14
+ vshl.u64 d9, d2, #36
+ vsri.u64 d9, d2, #28
+ vshl.u64 d10, d6, #46
+ vsri.u64 d10, d6, #18
+ vshl.u64 d11, d2, #30
+ vsri.u64 d11, d2, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d6, #23
+ vsri.u64 d10, d6, #41
+ vshl.u64 d11, d2, #25
+ vsri.u64 d11, d2, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d1, d8
+ vadd.i64 d12, d30
+ vmov d8, d6
+ veor d10, d3, d4
+ vadd.i64 d1, d12
+ vbsl d8, d7, d0
+ vbsl d10, d2, d4
+ vadd.i64 d1, d8
+ vadd.i64 d10, d9
+ vadd.i64 d5, d1
+ vadd.i64 d1, d10
+ # Round 15
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d5, #50
+ vsri.u64 d8, d5, #14
+ vshl.u64 d9, d1, #36
+ vsri.u64 d9, d1, #28
+ vshl.u64 d10, d5, #46
+ vsri.u64 d10, d5, #18
+ vshl.u64 d11, d1, #30
+ vsri.u64 d11, d1, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d5, #23
+ vsri.u64 d10, d5, #41
+ vshl.u64 d11, d1, #25
+ vsri.u64 d11, d1, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d0, d8
+ vadd.i64 d12, d31
+ vmov d8, d5
+ veor d10, d2, d3
+ vadd.i64 d0, d12
+ vbsl d8, d6, d7
+ vbsl d10, d1, d3
+ vadd.i64 d0, d8
+ vadd.i64 d10, d9
+ vadd.i64 d4, d0
+ vadd.i64 d0, d10
+ # Calc new W[14]-W[15]
+ vext.8 q6, q15, q8, #8
+ vshl.u64 q4, q14, #45
+ vsri.u64 q4, q14, #19
+ vshl.u64 q5, q14, #3
+ vsri.u64 q5, q14, #61
+ veor q5, q4
+ vshr.u64 q4, q14, #6
+ veor q5, q4
+ vadd.i64 q15, q5
+ vext.8 q7, q11, q12, #8
+ vadd.i64 q15, q7
+ vshl.u64 q4, q6, #63
+ vsri.u64 q4, q6, #1
+ vshl.u64 q5, q6, #56
+ vsri.u64 q5, q6, #8
+ veor q5, q4
+ vshr.u64 q6, #7
+ veor q5, q6
+ vadd.i64 q15, q5
+ subs r12, r12, #1
+ bne L_sha512_len_neon_start
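+ # Rounds 64-79: the same round function, with no further schedule expansion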
+ # Round 0
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d4, #50
+ vsri.u64 d8, d4, #14
+ vshl.u64 d9, d0, #36
+ vsri.u64 d9, d0, #28
+ vshl.u64 d10, d4, #46
+ vsri.u64 d10, d4, #18
+ vshl.u64 d11, d0, #30
+ vsri.u64 d11, d0, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d4, #23
+ vsri.u64 d10, d4, #41
+ vshl.u64 d11, d0, #25
+ vsri.u64 d11, d0, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d7, d8
+ vadd.i64 d12, d16
+ vmov d8, d4
+ veor d10, d1, d2
+ vadd.i64 d7, d12
+ vbsl d8, d5, d6
+ vbsl d10, d0, d2
+ vadd.i64 d7, d8
+ vadd.i64 d10, d9
+ vadd.i64 d3, d7
+ vadd.i64 d7, d10
+ # Round 1
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d3, #50
+ vsri.u64 d8, d3, #14
+ vshl.u64 d9, d7, #36
+ vsri.u64 d9, d7, #28
+ vshl.u64 d10, d3, #46
+ vsri.u64 d10, d3, #18
+ vshl.u64 d11, d7, #30
+ vsri.u64 d11, d7, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d3, #23
+ vsri.u64 d10, d3, #41
+ vshl.u64 d11, d7, #25
+ vsri.u64 d11, d7, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d6, d8
+ vadd.i64 d12, d17
+ vmov d8, d3
+ veor d10, d0, d1
+ vadd.i64 d6, d12
+ vbsl d8, d4, d5
+ vbsl d10, d7, d1
+ vadd.i64 d6, d8
+ vadd.i64 d10, d9
+ vadd.i64 d2, d6
+ vadd.i64 d6, d10
+ # Round 2
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d2, #50
+ vsri.u64 d8, d2, #14
+ vshl.u64 d9, d6, #36
+ vsri.u64 d9, d6, #28
+ vshl.u64 d10, d2, #46
+ vsri.u64 d10, d2, #18
+ vshl.u64 d11, d6, #30
+ vsri.u64 d11, d6, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d2, #23
+ vsri.u64 d10, d2, #41
+ vshl.u64 d11, d6, #25
+ vsri.u64 d11, d6, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d5, d8
+ vadd.i64 d12, d18
+ vmov d8, d2
+ veor d10, d7, d0
+ vadd.i64 d5, d12
+ vbsl d8, d3, d4
+ vbsl d10, d6, d0
+ vadd.i64 d5, d8
+ vadd.i64 d10, d9
+ vadd.i64 d1, d5
+ vadd.i64 d5, d10
+ # Round 3
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d1, #50
+ vsri.u64 d8, d1, #14
+ vshl.u64 d9, d5, #36
+ vsri.u64 d9, d5, #28
+ vshl.u64 d10, d1, #46
+ vsri.u64 d10, d1, #18
+ vshl.u64 d11, d5, #30
+ vsri.u64 d11, d5, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d1, #23
+ vsri.u64 d10, d1, #41
+ vshl.u64 d11, d5, #25
+ vsri.u64 d11, d5, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d4, d8
+ vadd.i64 d12, d19
+ vmov d8, d1
+ veor d10, d6, d7
+ vadd.i64 d4, d12
+ vbsl d8, d2, d3
+ vbsl d10, d5, d7
+ vadd.i64 d4, d8
+ vadd.i64 d10, d9
+ vadd.i64 d0, d4
+ vadd.i64 d4, d10
+ # Round 4
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d0, #50
+ vsri.u64 d8, d0, #14
+ vshl.u64 d9, d4, #36
+ vsri.u64 d9, d4, #28
+ vshl.u64 d10, d0, #46
+ vsri.u64 d10, d0, #18
+ vshl.u64 d11, d4, #30
+ vsri.u64 d11, d4, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d0, #23
+ vsri.u64 d10, d0, #41
+ vshl.u64 d11, d4, #25
+ vsri.u64 d11, d4, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d3, d8
+ vadd.i64 d12, d20
+ vmov d8, d0
+ veor d10, d5, d6
+ vadd.i64 d3, d12
+ vbsl d8, d1, d2
+ vbsl d10, d4, d6
+ vadd.i64 d3, d8
+ vadd.i64 d10, d9
+ vadd.i64 d7, d3
+ vadd.i64 d3, d10
+ # Round 5
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d7, #50
+ vsri.u64 d8, d7, #14
+ vshl.u64 d9, d3, #36
+ vsri.u64 d9, d3, #28
+ vshl.u64 d10, d7, #46
+ vsri.u64 d10, d7, #18
+ vshl.u64 d11, d3, #30
+ vsri.u64 d11, d3, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d7, #23
+ vsri.u64 d10, d7, #41
+ vshl.u64 d11, d3, #25
+ vsri.u64 d11, d3, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d2, d8
+ vadd.i64 d12, d21
+ vmov d8, d7
+ veor d10, d4, d5
+ vadd.i64 d2, d12
+ vbsl d8, d0, d1
+ vbsl d10, d3, d5
+ vadd.i64 d2, d8
+ vadd.i64 d10, d9
+ vadd.i64 d6, d2
+ vadd.i64 d2, d10
+ # Round 6
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d6, #50
+ vsri.u64 d8, d6, #14
+ vshl.u64 d9, d2, #36
+ vsri.u64 d9, d2, #28
+ vshl.u64 d10, d6, #46
+ vsri.u64 d10, d6, #18
+ vshl.u64 d11, d2, #30
+ vsri.u64 d11, d2, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d6, #23
+ vsri.u64 d10, d6, #41
+ vshl.u64 d11, d2, #25
+ vsri.u64 d11, d2, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d1, d8
+ vadd.i64 d12, d22
+ vmov d8, d6
+ veor d10, d3, d4
+ vadd.i64 d1, d12
+ vbsl d8, d7, d0
+ vbsl d10, d2, d4
+ vadd.i64 d1, d8
+ vadd.i64 d10, d9
+ vadd.i64 d5, d1
+ vadd.i64 d1, d10
+ # Round 7
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d5, #50
+ vsri.u64 d8, d5, #14
+ vshl.u64 d9, d1, #36
+ vsri.u64 d9, d1, #28
+ vshl.u64 d10, d5, #46
+ vsri.u64 d10, d5, #18
+ vshl.u64 d11, d1, #30
+ vsri.u64 d11, d1, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d5, #23
+ vsri.u64 d10, d5, #41
+ vshl.u64 d11, d1, #25
+ vsri.u64 d11, d1, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d0, d8
+ vadd.i64 d12, d23
+ vmov d8, d5
+ veor d10, d2, d3
+ vadd.i64 d0, d12
+ vbsl d8, d6, d7
+ vbsl d10, d1, d3
+ vadd.i64 d0, d8
+ vadd.i64 d10, d9
+ vadd.i64 d4, d0
+ vadd.i64 d0, d10
+ # Round 8
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d4, #50
+ vsri.u64 d8, d4, #14
+ vshl.u64 d9, d0, #36
+ vsri.u64 d9, d0, #28
+ vshl.u64 d10, d4, #46
+ vsri.u64 d10, d4, #18
+ vshl.u64 d11, d0, #30
+ vsri.u64 d11, d0, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d4, #23
+ vsri.u64 d10, d4, #41
+ vshl.u64 d11, d0, #25
+ vsri.u64 d11, d0, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d7, d8
+ vadd.i64 d12, d24
+ vmov d8, d4
+ veor d10, d1, d2
+ vadd.i64 d7, d12
+ vbsl d8, d5, d6
+ vbsl d10, d0, d2
+ vadd.i64 d7, d8
+ vadd.i64 d10, d9
+ vadd.i64 d3, d7
+ vadd.i64 d7, d10
+ # Round 9
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d3, #50
+ vsri.u64 d8, d3, #14
+ vshl.u64 d9, d7, #36
+ vsri.u64 d9, d7, #28
+ vshl.u64 d10, d3, #46
+ vsri.u64 d10, d3, #18
+ vshl.u64 d11, d7, #30
+ vsri.u64 d11, d7, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d3, #23
+ vsri.u64 d10, d3, #41
+ vshl.u64 d11, d7, #25
+ vsri.u64 d11, d7, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d6, d8
+ vadd.i64 d12, d25
+ vmov d8, d3
+ veor d10, d0, d1
+ vadd.i64 d6, d12
+ vbsl d8, d4, d5
+ vbsl d10, d7, d1
+ vadd.i64 d6, d8
+ vadd.i64 d10, d9
+ vadd.i64 d2, d6
+ vadd.i64 d6, d10
+ # Round 10
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d2, #50
+ vsri.u64 d8, d2, #14
+ vshl.u64 d9, d6, #36
+ vsri.u64 d9, d6, #28
+ vshl.u64 d10, d2, #46
+ vsri.u64 d10, d2, #18
+ vshl.u64 d11, d6, #30
+ vsri.u64 d11, d6, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d2, #23
+ vsri.u64 d10, d2, #41
+ vshl.u64 d11, d6, #25
+ vsri.u64 d11, d6, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d5, d8
+ vadd.i64 d12, d26
+ vmov d8, d2
+ veor d10, d7, d0
+ vadd.i64 d5, d12
+ vbsl d8, d3, d4
+ vbsl d10, d6, d0
+ vadd.i64 d5, d8
+ vadd.i64 d10, d9
+ vadd.i64 d1, d5
+ vadd.i64 d5, d10
+ # Round 11
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d1, #50
+ vsri.u64 d8, d1, #14
+ vshl.u64 d9, d5, #36
+ vsri.u64 d9, d5, #28
+ vshl.u64 d10, d1, #46
+ vsri.u64 d10, d1, #18
+ vshl.u64 d11, d5, #30
+ vsri.u64 d11, d5, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d1, #23
+ vsri.u64 d10, d1, #41
+ vshl.u64 d11, d5, #25
+ vsri.u64 d11, d5, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d4, d8
+ vadd.i64 d12, d27
+ vmov d8, d1
+ veor d10, d6, d7
+ vadd.i64 d4, d12
+ vbsl d8, d2, d3
+ vbsl d10, d5, d7
+ vadd.i64 d4, d8
+ vadd.i64 d10, d9
+ vadd.i64 d0, d4
+ vadd.i64 d4, d10
+ # Round 12
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d0, #50
+ vsri.u64 d8, d0, #14
+ vshl.u64 d9, d4, #36
+ vsri.u64 d9, d4, #28
+ vshl.u64 d10, d0, #46
+ vsri.u64 d10, d0, #18
+ vshl.u64 d11, d4, #30
+ vsri.u64 d11, d4, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d0, #23
+ vsri.u64 d10, d0, #41
+ vshl.u64 d11, d4, #25
+ vsri.u64 d11, d4, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d3, d8
+ vadd.i64 d12, d28
+ vmov d8, d0
+ veor d10, d5, d6
+ vadd.i64 d3, d12
+ vbsl d8, d1, d2
+ vbsl d10, d4, d6
+ vadd.i64 d3, d8
+ vadd.i64 d10, d9
+ vadd.i64 d7, d3
+ vadd.i64 d3, d10
+ # Round 13
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d7, #50
+ vsri.u64 d8, d7, #14
+ vshl.u64 d9, d3, #36
+ vsri.u64 d9, d3, #28
+ vshl.u64 d10, d7, #46
+ vsri.u64 d10, d7, #18
+ vshl.u64 d11, d3, #30
+ vsri.u64 d11, d3, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d7, #23
+ vsri.u64 d10, d7, #41
+ vshl.u64 d11, d3, #25
+ vsri.u64 d11, d3, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d2, d8
+ vadd.i64 d12, d29
+ vmov d8, d7
+ veor d10, d4, d5
+ vadd.i64 d2, d12
+ vbsl d8, d0, d1
+ vbsl d10, d3, d5
+ vadd.i64 d2, d8
+ vadd.i64 d10, d9
+ vadd.i64 d6, d2
+ vadd.i64 d2, d10
+ # Round 14
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d6, #50
+ vsri.u64 d8, d6, #14
+ vshl.u64 d9, d2, #36
+ vsri.u64 d9, d2, #28
+ vshl.u64 d10, d6, #46
+ vsri.u64 d10, d6, #18
+ vshl.u64 d11, d2, #30
+ vsri.u64 d11, d2, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d6, #23
+ vsri.u64 d10, d6, #41
+ vshl.u64 d11, d2, #25
+ vsri.u64 d11, d2, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d1, d8
+ vadd.i64 d12, d30
+ vmov d8, d6
+ veor d10, d3, d4
+ vadd.i64 d1, d12
+ vbsl d8, d7, d0
+ vbsl d10, d2, d4
+ vadd.i64 d1, d8
+ vadd.i64 d10, d9
+ vadd.i64 d5, d1
+ vadd.i64 d1, d10
+ # Round 15
+ vld1.64 {d12}, [r3:64]!
+ vshl.u64 d8, d5, #50
+ vsri.u64 d8, d5, #14
+ vshl.u64 d9, d1, #36
+ vsri.u64 d9, d1, #28
+ vshl.u64 d10, d5, #46
+ vsri.u64 d10, d5, #18
+ vshl.u64 d11, d1, #30
+ vsri.u64 d11, d1, #34
+ veor d8, d10
+ veor d9, d11
+ vshl.u64 d10, d5, #23
+ vsri.u64 d10, d5, #41
+ vshl.u64 d11, d1, #25
+ vsri.u64 d11, d1, #39
+ veor d8, d10
+ veor d9, d11
+ vadd.i64 d0, d8
+ vadd.i64 d12, d31
+ vmov d8, d5
+ veor d10, d2, d3
+ vadd.i64 d0, d12
+ vbsl d8, d6, d7
+ vbsl d10, d1, d3
+ vadd.i64 d0, d8
+ vadd.i64 d10, d9
+ vadd.i64 d4, d0
+ vadd.i64 d0, d10
+ # Add in digest from start
+ vldm.64 r0, {d8-d15}
+ vadd.i64 q0, q0, q4
+ vadd.i64 q1, q1, q5
+ vadd.i64 q2, q2, q6
+ vadd.i64 q3, q3, q7
+ vstm.64 r0, {d0-d7}
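+ # (q0-q3 now hold the updated digest: the previous digest in q4-q7 plus
+ # the final working state)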
+ subs r2, r2, #0x80
+ bne L_sha512_len_neon_begin
+ vpop {d8-d15}
+ bx lr
+ .size Transform_Sha512_Len,.-Transform_Sha512_Len
+#endif /* !WOLFSSL_ARMASM_NO_NEON */
+#endif /* !__aarch64__ */
+#endif /* WOLFSSL_ARMASM */
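
For readers tracing either listing, the portable C sketch below shows what each
"Round N" and "Calc new W[...]" block computes. It is illustrative only (names
such as sha512_round and sha512_next_w are not wolfSSL APIs), it assumes the
message words have already been byte-swapped to host order as in the code
above, and all additions are modulo 2^64 via uint64_t.

#include <stdint.h>

static uint64_t ror64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

/* Big and small sigma functions from FIPS 180-4; the rotate amounts match
 * the vshl/vsri (NEON) and lsrs/lsls/orr (scalar) pairs in the listings. */
static uint64_t Sigma0(uint64_t a) { return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39); }
static uint64_t Sigma1(uint64_t e) { return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41); }
static uint64_t sigma0(uint64_t w) { return ror64(w, 1) ^ ror64(w, 8) ^ (w >> 7); }
static uint64_t sigma1(uint64_t w) { return ror64(w, 19) ^ ror64(w, 61) ^ (w >> 6); }

/* One of the 80 rounds; s[0..7] are the working variables a..h. The assembly
 * unrolls the final rotation by renaming registers instead of moving data. */
static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
{
    uint64_t ch  = (s[4] & s[5]) ^ (~s[4] & s[6]);                 /* vbsl */
    uint64_t maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);  /* vbsl */
    uint64_t t1  = s[7] + Sigma1(s[4]) + ch + k + w;
    uint64_t t2  = Sigma0(s[0]) + maj;

    s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}

/* Message schedule step for rounds 16..79 ("Calc new W[...]"), with w[]
 * used as a circular buffer holding the last 16 schedule words. */
static uint64_t sha512_next_w(uint64_t w[16], int t)
{
    w[t & 15] += sigma1(w[(t - 2) & 15]) + w[(t - 7) & 15]
               + sigma0(w[(t - 15) & 15]);
    return w[t & 15];
}

The NEON path performs two such schedule steps at once by keeping W[0..15] in
q8-q15 and pairing the lanes with vext.
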
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
new file mode 100644
index 0000000..c502a39
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
@@ -0,0 +1,4783 @@
+/* armv8-32-sha512-asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c
+ */
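+
+/* The code below expresses the same SHA-512 transform as
+ * armv8-32-sha512-asm.S, wrapped in GCC extended inline assembly. */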
+
+#ifndef __aarch64__
+#include <stdint.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#ifdef WOLFSSL_ARMASM_NO_NEON
+static const uint64_t L_SHA512_transform_len_k[] = {
+ 0x428a2f98d728ae22UL,
+ 0x7137449123ef65cdUL,
+ 0xb5c0fbcfec4d3b2fUL,
+ 0xe9b5dba58189dbbcUL,
+ 0x3956c25bf348b538UL,
+ 0x59f111f1b605d019UL,
+ 0x923f82a4af194f9bUL,
+ 0xab1c5ed5da6d8118UL,
+ 0xd807aa98a3030242UL,
+ 0x12835b0145706fbeUL,
+ 0x243185be4ee4b28cUL,
+ 0x550c7dc3d5ffb4e2UL,
+ 0x72be5d74f27b896fUL,
+ 0x80deb1fe3b1696b1UL,
+ 0x9bdc06a725c71235UL,
+ 0xc19bf174cf692694UL,
+ 0xe49b69c19ef14ad2UL,
+ 0xefbe4786384f25e3UL,
+ 0xfc19dc68b8cd5b5UL,
+ 0x240ca1cc77ac9c65UL,
+ 0x2de92c6f592b0275UL,
+ 0x4a7484aa6ea6e483UL,
+ 0x5cb0a9dcbd41fbd4UL,
+ 0x76f988da831153b5UL,
+ 0x983e5152ee66dfabUL,
+ 0xa831c66d2db43210UL,
+ 0xb00327c898fb213fUL,
+ 0xbf597fc7beef0ee4UL,
+ 0xc6e00bf33da88fc2UL,
+ 0xd5a79147930aa725UL,
+ 0x6ca6351e003826fUL,
+ 0x142929670a0e6e70UL,
+ 0x27b70a8546d22ffcUL,
+ 0x2e1b21385c26c926UL,
+ 0x4d2c6dfc5ac42aedUL,
+ 0x53380d139d95b3dfUL,
+ 0x650a73548baf63deUL,
+ 0x766a0abb3c77b2a8UL,
+ 0x81c2c92e47edaee6UL,
+ 0x92722c851482353bUL,
+ 0xa2bfe8a14cf10364UL,
+ 0xa81a664bbc423001UL,
+ 0xc24b8b70d0f89791UL,
+ 0xc76c51a30654be30UL,
+ 0xd192e819d6ef5218UL,
+ 0xd69906245565a910UL,
+ 0xf40e35855771202aUL,
+ 0x106aa07032bbd1b8UL,
+ 0x19a4c116b8d2d0c8UL,
+ 0x1e376c085141ab53UL,
+ 0x2748774cdf8eeb99UL,
+ 0x34b0bcb5e19b48a8UL,
+ 0x391c0cb3c5c95a63UL,
+ 0x4ed8aa4ae3418acbUL,
+ 0x5b9cca4f7763e373UL,
+ 0x682e6ff3d6b2b8a3UL,
+ 0x748f82ee5defb2fcUL,
+ 0x78a5636f43172f60UL,
+ 0x84c87814a1f0ab72UL,
+ 0x8cc702081a6439ecUL,
+ 0x90befffa23631e28UL,
+ 0xa4506cebde82bde9UL,
+ 0xbef9a3f7b2c67915UL,
+ 0xc67178f2e372532bUL,
+ 0xca273eceea26619cUL,
+ 0xd186b8c721c0c207UL,
+ 0xeada7dd6cde0eb1eUL,
+ 0xf57d4f7fee6ed178UL,
+ 0x6f067aa72176fbaUL,
+ 0xa637dc5a2c898a6UL,
+ 0x113f9804bef90daeUL,
+ 0x1b710b35131c471bUL,
+ 0x28db77f523047d84UL,
+ 0x32caab7b40c72493UL,
+ 0x3c9ebe0a15c9bebcUL,
+ 0x431d67c49c100d4cUL,
+ 0x4cc5d4becb3e42b6UL,
+ 0x597f299cfc657e2aUL,
+ 0x5fcb6fab3ad6faecUL,
+ 0x6c44198c4a475817UL,
+};
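+/* (the same 80 SHA-512 round constants as the .word tables in the .S file,
+ * stored here as whole 64-bit values so the compiler handles word order) */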
+
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ __asm__ __volatile__ (
+ "sub sp, sp, #0xc0\n\t"
+ "mov r3, %[L_SHA512_transform_len_k]\n\t"
+ /* Copy digest to add in at end */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r8, r9, [%[sha512], #24]\n\t"
+ "strd r12, lr, [sp, #128]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "strd r6, r7, [sp, #144]\n\t"
+ "strd r8, r9, [sp, #152]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r8, r9, [%[sha512], #56]\n\t"
+ "strd r12, lr, [sp, #160]\n\t"
+ "strd r4, r5, [sp, #168]\n\t"
+ "strd r6, r7, [sp, #176]\n\t"
+ "strd r8, r9, [sp, #184]\n\t"
+ /* Start of loop processing a block */
+ "\n"
+ "L_sha512_len_neon_begin_%=: \n\t"
+ /* Load, Reverse and Store W */
+ "ldrd r12, lr, [%[data]]\n\t"
+ "ldrd r4, r5, [%[data], #8]\n\t"
+ "ldrd r6, r7, [%[data], #16]\n\t"
+ "ldrd r8, r9, [%[data], #24]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp]\n\t"
+ "str r12, [sp, #4]\n\t"
+ "str r5, [sp, #8]\n\t"
+ "str r4, [sp, #12]\n\t"
+ "str r7, [sp, #16]\n\t"
+ "str r6, [sp, #20]\n\t"
+ "str r9, [sp, #24]\n\t"
+ "str r8, [sp, #28]\n\t"
+ "ldrd r12, lr, [%[data], #32]\n\t"
+ "ldrd r4, r5, [%[data], #40]\n\t"
+ "ldrd r6, r7, [%[data], #48]\n\t"
+ "ldrd r8, r9, [%[data], #56]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #32]\n\t"
+ "str r12, [sp, #36]\n\t"
+ "str r5, [sp, #40]\n\t"
+ "str r4, [sp, #44]\n\t"
+ "str r7, [sp, #48]\n\t"
+ "str r6, [sp, #52]\n\t"
+ "str r9, [sp, #56]\n\t"
+ "str r8, [sp, #60]\n\t"
+ "ldrd r12, lr, [%[data], #64]\n\t"
+ "ldrd r4, r5, [%[data], #72]\n\t"
+ "ldrd r6, r7, [%[data], #80]\n\t"
+ "ldrd r8, r9, [%[data], #88]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #64]\n\t"
+ "str r12, [sp, #68]\n\t"
+ "str r5, [sp, #72]\n\t"
+ "str r4, [sp, #76]\n\t"
+ "str r7, [sp, #80]\n\t"
+ "str r6, [sp, #84]\n\t"
+ "str r9, [sp, #88]\n\t"
+ "str r8, [sp, #92]\n\t"
+ "ldrd r12, lr, [%[data], #96]\n\t"
+ "ldrd r4, r5, [%[data], #104]\n\t"
+ "ldrd r6, r7, [%[data], #112]\n\t"
+ "ldrd r8, r9, [%[data], #120]\n\t"
+ "rev r12, r12\n\t"
+ "rev lr, lr\n\t"
+ "rev r4, r4\n\t"
+ "rev r5, r5\n\t"
+ "rev r6, r6\n\t"
+ "rev r7, r7\n\t"
+ "rev r8, r8\n\t"
+ "rev r9, r9\n\t"
+ "str lr, [sp, #96]\n\t"
+ "str r12, [sp, #100]\n\t"
+ "str r5, [sp, #104]\n\t"
+ "str r4, [sp, #108]\n\t"
+ "str r7, [sp, #112]\n\t"
+ "str r6, [sp, #116]\n\t"
+ "str r9, [sp, #120]\n\t"
+ "str r8, [sp, #124]\n\t"
+ /* Pre-calc: b ^ c */
+ "ldrd r8, r9, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r8, r8, r12\n\t"
+ "eor r9, r9, lr\n\t"
+ "mov r10, #4\n\t"
+    /* Start of 16 rounds, run 4 times for rounds 0-63 */
+ "\n"
+ "L_sha512_len_neon_start_%=: \n\t"
+ /* Round 0 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[0] */
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp]\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp]\n\t"
+ /* Round 1 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #8]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[1] */
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #8]\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #8]\n\t"
+ /* Round 2 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #16]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[2] */
+ "ldrd r12, lr, [sp]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #16]\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #16]\n\t"
+ /* Round 3 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #24]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[3] */
+ "ldrd r12, lr, [sp, #8]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #24]\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #24]\n\t"
+ /* Round 4 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #32]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[4] */
+ "ldrd r12, lr, [sp, #16]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #32]\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #32]\n\t"
+ /* Round 5 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #40]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[5] */
+ "ldrd r12, lr, [sp, #24]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #40]\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #40]\n\t"
+ /* Round 6 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #48]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[6] */
+ "ldrd r12, lr, [sp, #32]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #48]\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #48]\n\t"
+ /* Round 7 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #56]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[7] */
+ "ldrd r12, lr, [sp, #40]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #56]\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #56]\n\t"
+ /* Round 8 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #64]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[8] */
+ "ldrd r12, lr, [sp, #48]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #64]\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #64]\n\t"
+ /* Round 9 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #72]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[9] */
+ "ldrd r12, lr, [sp, #56]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #72]\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #72]\n\t"
+ /* Round 10 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #80]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[10] */
+ "ldrd r12, lr, [sp, #64]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #80]\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #80]\n\t"
+ /* Round 11 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #88]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[11] */
+ "ldrd r12, lr, [sp, #72]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #88]\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #88]\n\t"
+ /* Round 12 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #96]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[12] */
+ "ldrd r12, lr, [sp, #80]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #96]\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #96]\n\t"
+ /* Round 13 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #104]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[13] */
+ "ldrd r12, lr, [sp, #88]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #104]\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #104]\n\t"
+ /* Round 14 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #112]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[14] */
+ "ldrd r12, lr, [sp, #96]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #112]\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #112]\n\t"
+ /* Round 15 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #120]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Calc new W[15] */
+ "ldrd r12, lr, [sp, #104]\n\t"
+ "lsrs r4, r12, #19\n\t"
+ "lsrs r5, lr, #19\n\t"
+ "orr r5, r5, r12, lsl 13\n\t"
+ "orr r4, r4, lr, lsl 13\n\t"
+ "lsls r6, r12, #3\n\t"
+ "lsls r7, lr, #3\n\t"
+ "orr r7, r7, r12, lsr 29\n\t"
+ "orr r6, r6, lr, lsr 29\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #6\n\t"
+ "lsrs r7, lr, #6\n\t"
+ "orr r6, r6, lr, lsl 26\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "strd r12, lr, [sp, #120]\n\t"
+ "ldrd r12, lr, [sp]\n\t"
+ "lsrs r4, r12, #1\n\t"
+ "lsrs r5, lr, #1\n\t"
+ "orr r5, r5, r12, lsl 31\n\t"
+ "orr r4, r4, lr, lsl 31\n\t"
+ "lsrs r6, r12, #8\n\t"
+ "lsrs r7, lr, #8\n\t"
+ "orr r7, r7, r12, lsl 24\n\t"
+ "orr r6, r6, lr, lsl 24\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "lsrs r6, r12, #7\n\t"
+ "lsrs r7, lr, #7\n\t"
+ "orr r6, r6, lr, lsl 25\n\t"
+ "eor r5, r5, r7\n\t"
+ "eor r4, r4, r6\n\t"
+ "ldrd r12, lr, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [sp, #120]\n\t"
+ "add r3, r3, #0x80\n\t"
+ "subs r10, r10, #1\n\t"
+ "bne L_sha512_len_neon_start_%=\n\t"
+ /* Round 0 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 1 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #8]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 2 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #16]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 3 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #24]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 4 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #32]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 5 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #40]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 6 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #48]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 7 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #56]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 8 */
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #64]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #64]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "strd r6, r7, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [%[sha512], #56]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 9 */
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [sp, #72]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #72]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "strd r6, r7, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #48]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 10 */
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #80]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #80]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "strd r6, r7, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [%[sha512], #40]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 11 */
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r6, r7, [sp, #88]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #88]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "strd r6, r7, [%[sha512]]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #32]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 12 */
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #96]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #96]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "strd r6, r7, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [%[sha512], #24]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 13 */
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r12, lr, [%[sha512], #56]\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r6, r7, [sp, #104]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #104]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #48]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #24]\n\t"
+ "strd r6, r7, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #24]\n\t"
+ "ldrd r4, r5, [%[sha512], #32]\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #16]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 14 */
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #112]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #112]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #40]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "strd r6, r7, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [%[sha512], #8]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Round 15 */
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "lsrs r4, r12, #14\n\t"
+ "lsrs r5, lr, #14\n\t"
+ "orr r5, r5, r12, lsl 18\n\t"
+ "orr r4, r4, lr, lsl 18\n\t"
+ "lsrs r6, r12, #18\n\t"
+ "lsrs r7, lr, #18\n\t"
+ "orr r7, r7, r12, lsl 14\n\t"
+ "orr r6, r6, lr, lsl 14\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #23\n\t"
+ "lsls r7, lr, #23\n\t"
+ "orr r7, r7, r12, lsr 9\n\t"
+ "orr r6, r6, lr, lsr 9\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "ldrd r12, lr, [%[sha512], #40]\n\t"
+ "ldrd r4, r5, [%[sha512], #48]\n\t"
+ "ldrd r6, r7, [%[sha512], #56]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "and r4, r4, r12\n\t"
+ "and r5, r5, lr\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r6, r7, [sp, #120]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r4, r5, [r3, #120]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "ldrd r6, r7, [%[sha512], #32]\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "adds r6, r6, r12\n\t"
+ "adc r7, r7, lr\n\t"
+ "ldrd r12, lr, [%[sha512], #8]\n\t"
+ "strd r6, r7, [%[sha512], #32]\n\t"
+ "lsrs r4, r12, #28\n\t"
+ "lsrs r5, lr, #28\n\t"
+ "orr r5, r5, r12, lsl 4\n\t"
+ "orr r4, r4, lr, lsl 4\n\t"
+ "lsls r6, r12, #30\n\t"
+ "lsls r7, lr, #30\n\t"
+ "orr r7, r7, r12, lsr 2\n\t"
+ "orr r6, r6, lr, lsr 2\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "lsls r6, r12, #25\n\t"
+ "lsls r7, lr, #25\n\t"
+ "orr r7, r7, r12, lsr 7\n\t"
+ "orr r6, r6, lr, lsr 7\n\t"
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "eor r4, r4, r6\n\t"
+ "eor r5, r5, r7\n\t"
+ "adds r12, r12, r4\n\t"
+ "adc lr, lr, r5\n\t"
+ "ldrd r6, r7, [%[sha512], #8]\n\t"
+ "ldrd r4, r5, [%[sha512], #16]\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "eor r6, r6, r4\n\t"
+ "eor r7, r7, r5\n\t"
+ "and r8, r8, r6\n\t"
+ "and r9, r9, r7\n\t"
+ "eor r8, r8, r4\n\t"
+ "eor r9, r9, r5\n\t"
+ "ldrd r4, r5, [%[sha512]]\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r4, r5, [%[sha512]]\n\t"
+ "mov r8, r6\n\t"
+ "mov r9, r7\n\t"
+ /* Add in digest from start */
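+ /* The digest saved on the stack at block start ([sp, #128]..[sp, #184])
+ * is added back in; both the struct and the stack copy are updated for the
+ * next block. */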
+ "ldrd r12, lr, [%[sha512]]\n\t"
+ "ldrd r4, r5, [%[sha512], #8]\n\t"
+ "ldrd r6, r7, [sp, #128]\n\t"
+ "ldrd r8, r9, [sp, #136]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512]]\n\t"
+ "strd r4, r5, [%[sha512], #8]\n\t"
+ "strd r12, lr, [sp, #128]\n\t"
+ "strd r4, r5, [sp, #136]\n\t"
+ "ldrd r12, lr, [%[sha512], #16]\n\t"
+ "ldrd r4, r5, [%[sha512], #24]\n\t"
+ "ldrd r6, r7, [sp, #144]\n\t"
+ "ldrd r8, r9, [sp, #152]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #16]\n\t"
+ "strd r4, r5, [%[sha512], #24]\n\t"
+ "strd r12, lr, [sp, #144]\n\t"
+ "strd r4, r5, [sp, #152]\n\t"
+ "ldrd r12, lr, [%[sha512], #32]\n\t"
+ "ldrd r4, r5, [%[sha512], #40]\n\t"
+ "ldrd r6, r7, [sp, #160]\n\t"
+ "ldrd r8, r9, [sp, #168]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #32]\n\t"
+ "strd r4, r5, [%[sha512], #40]\n\t"
+ "strd r12, lr, [sp, #160]\n\t"
+ "strd r4, r5, [sp, #168]\n\t"
+ "ldrd r12, lr, [%[sha512], #48]\n\t"
+ "ldrd r4, r5, [%[sha512], #56]\n\t"
+ "ldrd r6, r7, [sp, #176]\n\t"
+ "ldrd r8, r9, [sp, #184]\n\t"
+ "adds r12, r12, r6\n\t"
+ "adc lr, lr, r7\n\t"
+ "adds r4, r4, r8\n\t"
+ "adc r5, r5, r9\n\t"
+ "strd r12, lr, [%[sha512], #48]\n\t"
+ "strd r4, r5, [%[sha512], #56]\n\t"
+ "strd r12, lr, [sp, #176]\n\t"
+ "strd r4, r5, [sp, #184]\n\t"
+ "subs %[len], %[len], #0x80\n\t"
+ "sub r3, r3, #0x200\n\t"
+ "add %[data], %[data], #0x80\n\t"
+ "bne L_sha512_len_neon_begin_%=\n\t"
+ "eor r0, r0, r0\n\t"
+ "add sp, sp, #0xc0\n\t"
+ : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
+ : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k)
+ : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
+ );
+}
+
+#endif /* WOLFSSL_ARMASM_NO_NEON */
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#ifndef WOLFSSL_ARMASM_NO_NEON
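+/* SHA-512 round constants K[0..79] (FIPS 180-4). */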
+static const uint64_t L_SHA512_transform_neon_len_k[] = {
+ 0x428a2f98d728ae22UL,
+ 0x7137449123ef65cdUL,
+ 0xb5c0fbcfec4d3b2fUL,
+ 0xe9b5dba58189dbbcUL,
+ 0x3956c25bf348b538UL,
+ 0x59f111f1b605d019UL,
+ 0x923f82a4af194f9bUL,
+ 0xab1c5ed5da6d8118UL,
+ 0xd807aa98a3030242UL,
+ 0x12835b0145706fbeUL,
+ 0x243185be4ee4b28cUL,
+ 0x550c7dc3d5ffb4e2UL,
+ 0x72be5d74f27b896fUL,
+ 0x80deb1fe3b1696b1UL,
+ 0x9bdc06a725c71235UL,
+ 0xc19bf174cf692694UL,
+ 0xe49b69c19ef14ad2UL,
+ 0xefbe4786384f25e3UL,
+ 0xfc19dc68b8cd5b5UL,
+ 0x240ca1cc77ac9c65UL,
+ 0x2de92c6f592b0275UL,
+ 0x4a7484aa6ea6e483UL,
+ 0x5cb0a9dcbd41fbd4UL,
+ 0x76f988da831153b5UL,
+ 0x983e5152ee66dfabUL,
+ 0xa831c66d2db43210UL,
+ 0xb00327c898fb213fUL,
+ 0xbf597fc7beef0ee4UL,
+ 0xc6e00bf33da88fc2UL,
+ 0xd5a79147930aa725UL,
+ 0x6ca6351e003826fUL,
+ 0x142929670a0e6e70UL,
+ 0x27b70a8546d22ffcUL,
+ 0x2e1b21385c26c926UL,
+ 0x4d2c6dfc5ac42aedUL,
+ 0x53380d139d95b3dfUL,
+ 0x650a73548baf63deUL,
+ 0x766a0abb3c77b2a8UL,
+ 0x81c2c92e47edaee6UL,
+ 0x92722c851482353bUL,
+ 0xa2bfe8a14cf10364UL,
+ 0xa81a664bbc423001UL,
+ 0xc24b8b70d0f89791UL,
+ 0xc76c51a30654be30UL,
+ 0xd192e819d6ef5218UL,
+ 0xd69906245565a910UL,
+ 0xf40e35855771202aUL,
+ 0x106aa07032bbd1b8UL,
+ 0x19a4c116b8d2d0c8UL,
+ 0x1e376c085141ab53UL,
+ 0x2748774cdf8eeb99UL,
+ 0x34b0bcb5e19b48a8UL,
+ 0x391c0cb3c5c95a63UL,
+ 0x4ed8aa4ae3418acbUL,
+ 0x5b9cca4f7763e373UL,
+ 0x682e6ff3d6b2b8a3UL,
+ 0x748f82ee5defb2fcUL,
+ 0x78a5636f43172f60UL,
+ 0x84c87814a1f0ab72UL,
+ 0x8cc702081a6439ecUL,
+ 0x90befffa23631e28UL,
+ 0xa4506cebde82bde9UL,
+ 0xbef9a3f7b2c67915UL,
+ 0xc67178f2e372532bUL,
+ 0xca273eceea26619cUL,
+ 0xd186b8c721c0c207UL,
+ 0xeada7dd6cde0eb1eUL,
+ 0xf57d4f7fee6ed178UL,
+ 0x6f067aa72176fbaUL,
+ 0xa637dc5a2c898a6UL,
+ 0x113f9804bef90daeUL,
+ 0x1b710b35131c471bUL,
+ 0x28db77f523047d84UL,
+ 0x32caab7b40c72493UL,
+ 0x3c9ebe0a15c9bebcUL,
+ 0x431d67c49c100d4cUL,
+ 0x4cc5d4becb3e42b6UL,
+ 0x597f299cfc657e2aUL,
+ 0x5fcb6fab3ad6faecUL,
+ 0x6c44198c4a475817UL,
+};
+
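+/* NEON implementation of the SHA-512 block transform. len is a multiple of
+ * the 128-byte block size; the eight 64-bit state words at the start of the
+ * wc_Sha512 struct are updated in place. The working digest is kept in
+ * d0-d7 and the 16-word message schedule in q8-q15 (d16-d31). */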
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ __asm__ __volatile__ (
+ /* Load digest into working vars */
+ "vldm.64 %[sha512], {d0-d7}\n\t"
+ /* Start of loop processing a block */
+ "\n"
+ "L_sha512_len_neon_begin_%=: \n\t"
+ /* Load W */
+ "vldm.64 %[data]!, {d16-d31}\n\t"
+ "vrev64.8 q8, q8\n\t"
+ "vrev64.8 q9, q9\n\t"
+ "vrev64.8 q10, q10\n\t"
+ "vrev64.8 q11, q11\n\t"
+ "vrev64.8 q12, q12\n\t"
+ "vrev64.8 q13, q13\n\t"
+ "vrev64.8 q14, q14\n\t"
+ "vrev64.8 q15, q15\n\t"
+ "mov r3, %[L_SHA512_transform_neon_len_k]\n\t"
+ "mov r12, #4\n\t"
+ /* Start of 16 rounds */
+ "\n"
+ "L_sha512_len_neon_start_%=: \n\t"
+ /* Round 0 */
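+ /* Per round (FIPS 180-4): T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t];
+ * T2 = Sigma0(a) + Maj(a,b,c); d += T1; h = T1 + T2, with the roles of
+ * a..h rotating each round. A 64-bit rotate right by n is built from a
+ * vshl #(64-n) / vsri #n pair; Ch and Maj use vbsl bit-selects. */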
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d16\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 1 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d17\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Calc new W[0]-W[1] */
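+ /* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], computed two
+ * schedule words at a time; sigma1(x) = ror19(x) ^ ror61(x) ^ (x >> 6),
+ * sigma0(x) = ror1(x) ^ ror8(x) ^ (x >> 7). */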
+ "vext.8 q6, q8, q9, #8\n\t"
+ "vshl.u64 q4, q15, #45\n\t"
+ "vsri.u64 q4, q15, #19\n\t"
+ "vshl.u64 q5, q15, #3\n\t"
+ "vsri.u64 q5, q15, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q15, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q8, q5\n\t"
+ "vext.8 q7, q12, q13, #8\n\t"
+ "vadd.i64 q8, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q8, q5\n\t"
+ /* Round 2 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d18\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 3 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d19\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Calc new W[2]-W[3] */
+ "vext.8 q6, q9, q10, #8\n\t"
+ "vshl.u64 q4, q8, #45\n\t"
+ "vsri.u64 q4, q8, #19\n\t"
+ "vshl.u64 q5, q8, #3\n\t"
+ "vsri.u64 q5, q8, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q8, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q9, q5\n\t"
+ "vext.8 q7, q13, q14, #8\n\t"
+ "vadd.i64 q9, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q9, q5\n\t"
+ /* Round 4 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d20\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 5 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d21\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Calc new W[4]-W[5] */
+ "vext.8 q6, q10, q11, #8\n\t"
+ "vshl.u64 q4, q9, #45\n\t"
+ "vsri.u64 q4, q9, #19\n\t"
+ "vshl.u64 q5, q9, #3\n\t"
+ "vsri.u64 q5, q9, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q9, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q10, q5\n\t"
+ "vext.8 q7, q14, q15, #8\n\t"
+ "vadd.i64 q10, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q10, q5\n\t"
+ /* Round 6 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d22\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 7 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d23\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Calc new W[6]-W[7] */
+ "vext.8 q6, q11, q12, #8\n\t"
+ "vshl.u64 q4, q10, #45\n\t"
+ "vsri.u64 q4, q10, #19\n\t"
+ "vshl.u64 q5, q10, #3\n\t"
+ "vsri.u64 q5, q10, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q10, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q11, q5\n\t"
+ "vext.8 q7, q15, q8, #8\n\t"
+ "vadd.i64 q11, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q11, q5\n\t"
+ /* Round 8 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d24\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 9 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d25\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Calc new W[8]-W[9] */
+ "vext.8 q6, q12, q13, #8\n\t"
+ "vshl.u64 q4, q11, #45\n\t"
+ "vsri.u64 q4, q11, #19\n\t"
+ "vshl.u64 q5, q11, #3\n\t"
+ "vsri.u64 q5, q11, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q11, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q12, q5\n\t"
+ "vext.8 q7, q8, q9, #8\n\t"
+ "vadd.i64 q12, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q12, q5\n\t"
+ /* Round 10 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d26\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 11 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d27\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Calc new W[10]-W[11] */
+ "vext.8 q6, q13, q14, #8\n\t"
+ "vshl.u64 q4, q12, #45\n\t"
+ "vsri.u64 q4, q12, #19\n\t"
+ "vshl.u64 q5, q12, #3\n\t"
+ "vsri.u64 q5, q12, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q12, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q13, q5\n\t"
+ "vext.8 q7, q9, q10, #8\n\t"
+ "vadd.i64 q13, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q13, q5\n\t"
+ /* Round 12 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d28\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 13 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d29\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Calc new W[12]-W[13] */
+ "vext.8 q6, q14, q15, #8\n\t"
+ "vshl.u64 q4, q13, #45\n\t"
+ "vsri.u64 q4, q13, #19\n\t"
+ "vshl.u64 q5, q13, #3\n\t"
+ "vsri.u64 q5, q13, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q13, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q14, q5\n\t"
+ "vext.8 q7, q10, q11, #8\n\t"
+ "vadd.i64 q14, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q14, q5\n\t"
+ /* Round 14 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d30\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 15 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d31\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Calc new W[14]-W[15] */
+ "vext.8 q6, q15, q8, #8\n\t"
+ "vshl.u64 q4, q14, #45\n\t"
+ "vsri.u64 q4, q14, #19\n\t"
+ "vshl.u64 q5, q14, #3\n\t"
+ "vsri.u64 q5, q14, #61\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q4, q14, #6\n\t"
+ "veor q5, q4\n\t"
+ "vadd.i64 q15, q5\n\t"
+ "vext.8 q7, q11, q12, #8\n\t"
+ "vadd.i64 q15, q7\n\t"
+ "vshl.u64 q4, q6, #63\n\t"
+ "vsri.u64 q4, q6, #1\n\t"
+ "vshl.u64 q5, q6, #56\n\t"
+ "vsri.u64 q5, q6, #8\n\t"
+ "veor q5, q4\n\t"
+ "vshr.u64 q6, #7\n\t"
+ "veor q5, q6\n\t"
+ "vadd.i64 q15, q5\n\t"
+ "subs r12, r12, #1\n\t"
+ "bne L_sha512_len_neon_start_%=\n\t"
+ /* Round 0 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d16\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 1 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d17\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Round 2 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d18\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 3 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d19\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Round 4 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d20\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 5 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d21\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Round 6 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d22\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 7 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d23\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Round 8 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d4, #50\n\t"
+ "vsri.u64 d8, d4, #14\n\t"
+ "vshl.u64 d9, d0, #36\n\t"
+ "vsri.u64 d9, d0, #28\n\t"
+ "vshl.u64 d10, d4, #46\n\t"
+ "vsri.u64 d10, d4, #18\n\t"
+ "vshl.u64 d11, d0, #30\n\t"
+ "vsri.u64 d11, d0, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d4, #23\n\t"
+ "vsri.u64 d10, d4, #41\n\t"
+ "vshl.u64 d11, d0, #25\n\t"
+ "vsri.u64 d11, d0, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d12, d24\n\t"
+ "vmov d8, d4\n\t"
+ "veor d10, d1, d2\n\t"
+ "vadd.i64 d7, d12\n\t"
+ "vbsl d8, d5, d6\n\t"
+ "vbsl d10, d0, d2\n\t"
+ "vadd.i64 d7, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d3, d7\n\t"
+ "vadd.i64 d7, d10\n\t"
+ /* Round 9 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d3, #50\n\t"
+ "vsri.u64 d8, d3, #14\n\t"
+ "vshl.u64 d9, d7, #36\n\t"
+ "vsri.u64 d9, d7, #28\n\t"
+ "vshl.u64 d10, d3, #46\n\t"
+ "vsri.u64 d10, d3, #18\n\t"
+ "vshl.u64 d11, d7, #30\n\t"
+ "vsri.u64 d11, d7, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d3, #23\n\t"
+ "vsri.u64 d10, d3, #41\n\t"
+ "vshl.u64 d11, d7, #25\n\t"
+ "vsri.u64 d11, d7, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d12, d25\n\t"
+ "vmov d8, d3\n\t"
+ "veor d10, d0, d1\n\t"
+ "vadd.i64 d6, d12\n\t"
+ "vbsl d8, d4, d5\n\t"
+ "vbsl d10, d7, d1\n\t"
+ "vadd.i64 d6, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d2, d6\n\t"
+ "vadd.i64 d6, d10\n\t"
+ /* Round 10 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d2, #50\n\t"
+ "vsri.u64 d8, d2, #14\n\t"
+ "vshl.u64 d9, d6, #36\n\t"
+ "vsri.u64 d9, d6, #28\n\t"
+ "vshl.u64 d10, d2, #46\n\t"
+ "vsri.u64 d10, d2, #18\n\t"
+ "vshl.u64 d11, d6, #30\n\t"
+ "vsri.u64 d11, d6, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d2, #23\n\t"
+ "vsri.u64 d10, d2, #41\n\t"
+ "vshl.u64 d11, d6, #25\n\t"
+ "vsri.u64 d11, d6, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d12, d26\n\t"
+ "vmov d8, d2\n\t"
+ "veor d10, d7, d0\n\t"
+ "vadd.i64 d5, d12\n\t"
+ "vbsl d8, d3, d4\n\t"
+ "vbsl d10, d6, d0\n\t"
+ "vadd.i64 d5, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d1, d5\n\t"
+ "vadd.i64 d5, d10\n\t"
+ /* Round 11 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d1, #50\n\t"
+ "vsri.u64 d8, d1, #14\n\t"
+ "vshl.u64 d9, d5, #36\n\t"
+ "vsri.u64 d9, d5, #28\n\t"
+ "vshl.u64 d10, d1, #46\n\t"
+ "vsri.u64 d10, d1, #18\n\t"
+ "vshl.u64 d11, d5, #30\n\t"
+ "vsri.u64 d11, d5, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d1, #23\n\t"
+ "vsri.u64 d10, d1, #41\n\t"
+ "vshl.u64 d11, d5, #25\n\t"
+ "vsri.u64 d11, d5, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d12, d27\n\t"
+ "vmov d8, d1\n\t"
+ "veor d10, d6, d7\n\t"
+ "vadd.i64 d4, d12\n\t"
+ "vbsl d8, d2, d3\n\t"
+ "vbsl d10, d5, d7\n\t"
+ "vadd.i64 d4, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d0, d4\n\t"
+ "vadd.i64 d4, d10\n\t"
+ /* Round 12 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d0, #50\n\t"
+ "vsri.u64 d8, d0, #14\n\t"
+ "vshl.u64 d9, d4, #36\n\t"
+ "vsri.u64 d9, d4, #28\n\t"
+ "vshl.u64 d10, d0, #46\n\t"
+ "vsri.u64 d10, d0, #18\n\t"
+ "vshl.u64 d11, d4, #30\n\t"
+ "vsri.u64 d11, d4, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d0, #23\n\t"
+ "vsri.u64 d10, d0, #41\n\t"
+ "vshl.u64 d11, d4, #25\n\t"
+ "vsri.u64 d11, d4, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d12, d28\n\t"
+ "vmov d8, d0\n\t"
+ "veor d10, d5, d6\n\t"
+ "vadd.i64 d3, d12\n\t"
+ "vbsl d8, d1, d2\n\t"
+ "vbsl d10, d4, d6\n\t"
+ "vadd.i64 d3, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d7, d3\n\t"
+ "vadd.i64 d3, d10\n\t"
+ /* Round 13 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d7, #50\n\t"
+ "vsri.u64 d8, d7, #14\n\t"
+ "vshl.u64 d9, d3, #36\n\t"
+ "vsri.u64 d9, d3, #28\n\t"
+ "vshl.u64 d10, d7, #46\n\t"
+ "vsri.u64 d10, d7, #18\n\t"
+ "vshl.u64 d11, d3, #30\n\t"
+ "vsri.u64 d11, d3, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d7, #23\n\t"
+ "vsri.u64 d10, d7, #41\n\t"
+ "vshl.u64 d11, d3, #25\n\t"
+ "vsri.u64 d11, d3, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d12, d29\n\t"
+ "vmov d8, d7\n\t"
+ "veor d10, d4, d5\n\t"
+ "vadd.i64 d2, d12\n\t"
+ "vbsl d8, d0, d1\n\t"
+ "vbsl d10, d3, d5\n\t"
+ "vadd.i64 d2, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d6, d2\n\t"
+ "vadd.i64 d2, d10\n\t"
+ /* Round 14 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d6, #50\n\t"
+ "vsri.u64 d8, d6, #14\n\t"
+ "vshl.u64 d9, d2, #36\n\t"
+ "vsri.u64 d9, d2, #28\n\t"
+ "vshl.u64 d10, d6, #46\n\t"
+ "vsri.u64 d10, d6, #18\n\t"
+ "vshl.u64 d11, d2, #30\n\t"
+ "vsri.u64 d11, d2, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d6, #23\n\t"
+ "vsri.u64 d10, d6, #41\n\t"
+ "vshl.u64 d11, d2, #25\n\t"
+ "vsri.u64 d11, d2, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d12, d30\n\t"
+ "vmov d8, d6\n\t"
+ "veor d10, d3, d4\n\t"
+ "vadd.i64 d1, d12\n\t"
+ "vbsl d8, d7, d0\n\t"
+ "vbsl d10, d2, d4\n\t"
+ "vadd.i64 d1, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d5, d1\n\t"
+ "vadd.i64 d1, d10\n\t"
+ /* Round 15 */
+ "vld1.64 {d12}, [r3]!\n\t"
+ "vshl.u64 d8, d5, #50\n\t"
+ "vsri.u64 d8, d5, #14\n\t"
+ "vshl.u64 d9, d1, #36\n\t"
+ "vsri.u64 d9, d1, #28\n\t"
+ "vshl.u64 d10, d5, #46\n\t"
+ "vsri.u64 d10, d5, #18\n\t"
+ "vshl.u64 d11, d1, #30\n\t"
+ "vsri.u64 d11, d1, #34\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vshl.u64 d10, d5, #23\n\t"
+ "vsri.u64 d10, d5, #41\n\t"
+ "vshl.u64 d11, d1, #25\n\t"
+ "vsri.u64 d11, d1, #39\n\t"
+ "veor d8, d10\n\t"
+ "veor d9, d11\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d12, d31\n\t"
+ "vmov d8, d5\n\t"
+ "veor d10, d2, d3\n\t"
+ "vadd.i64 d0, d12\n\t"
+ "vbsl d8, d6, d7\n\t"
+ "vbsl d10, d1, d3\n\t"
+ "vadd.i64 d0, d8\n\t"
+ "vadd.i64 d10, d9\n\t"
+ "vadd.i64 d4, d0\n\t"
+ "vadd.i64 d0, d10\n\t"
+ /* Add in digest from start */
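+ /* Reload the previous digest from memory into q4-q7, add the working
+ * variables back in (Davies-Meyer feed-forward) and store the result. */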
+ "vldm.64 %[sha512], {d8-d15}\n\t"
+ "vadd.i64 q0, q0, q4\n\t"
+ "vadd.i64 q1, q1, q5\n\t"
+ "vadd.i64 q2, q2, q6\n\t"
+ "vadd.i64 q3, q3, q7\n\t"
+ "vstm.64 %[sha512], {d0-d7}\n\t"
+ "subs %[len], %[len], #0x80\n\t"
+ "bne L_sha512_len_neon_begin_%=\n\t"
+ : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
+        : [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k)
+ : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#endif /* !WOLFSSL_ARMASM_NO_NEON */
+#endif /* WOLFSSL_ARMASM */
+#endif /* !__aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c
new file mode 100644
index 0000000..d0f8a9c
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-aes.c
@@ -0,0 +1,4653 @@
+/* armv8-aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+/*
+ * There are two versions: one for 64-bit (Aarch64) and one for 32-bit (Aarch32).
+ * If changing one, check the other.
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if !defined(NO_AES) && defined(WOLFSSL_ARMASM)
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef _MSC_VER
+ /* 4127 warning constant while(1) */
+ #pragma warning(disable: 4127)
+#endif
+
+
+static const byte rcon[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+ /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/* get table value from hardware */
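+/* AESE computes SubBytes(ShiftRows(state ^ key)). With a zero state and the
+ * input word replicated into every lane, ShiftRows only moves bytes between
+ * identical columns, so lane 0 holds the S-box of each input byte. */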
+#ifdef __aarch64__
+ #define SBOX(x) \
+ do { \
+ __asm__ volatile ( \
+ "DUP v1.4s, %w[in] \n" \
+ "MOVI v0.16b, #0 \n" \
+ "AESE v0.16b, v1.16b \n" \
+ "UMOV %w[out], v0.s[0] \n" \
+ : [out] "=r"((x)) \
+ : [in] "r" ((x)) \
+ : "cc", "memory", "v0", "v1"\
+ ); \
+ } while(0)
+
+ #define IMIX(x) \
+ do { \
+ __asm__ volatile ( \
+ "LD1 {v0.16b}, [%[in]] \n" \
+ "AESIMC v0.16b, v0.16b \n" \
+ "ST1 {v0.16b}, [%[out]]\n" \
+ : [out] "=r" ((x)) \
+ : [in] "0" ((x)) \
+ : "cc", "memory", "v0" \
+ ); \
+ } while(0)
+#else /* if not defined __aarch64__ then use 32 bit version */
+ #define SBOX(x) \
+ do { \
+ __asm__ volatile ( \
+ "VDUP.32 q1, %[in] \n" \
+ "VMOV.i32 q0, #0 \n" \
+ "AESE.8 q0, q1 \n" \
+ "VMOV.32 %[out], d0[0] \n" \
+ : [out] "=r"((x)) \
+ : [in] "r" ((x)) \
+ : "cc", "memory", "q0", "q1"\
+ ); \
+ } while(0)
+
+ #define IMIX(x) \
+ do { \
+ __asm__ volatile ( \
+ "VLD1.32 {q0}, [%[in]] \n" \
+ "AESIMC.8 q0, q0 \n" \
+ "VST1.32 {q0}, [%[out]] \n" \
+ : [out] "=r" ((x)) \
+ : [in] "0" ((x)) \
+ : "cc", "memory", "q0" \
+ ); \
+ } while(0)
+#endif /* aarch64 */
+
+
+#ifdef HAVE_AESGCM
+
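+/* Increment the CTR_SZ-byte big-endian counter at the end of the 16-byte
+ * GCM counter block, propagating the carry. */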
+static WC_INLINE void IncrementGcmCounter(byte* inOutCtr)
+{
+ int i;
+
+ /* in network byte order so start at end and work back */
+ for (i = AES_BLOCK_SIZE - 1; i >= AES_BLOCK_SIZE - CTR_SZ; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+}
+
+
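+/* Write the byte count sz as a 64-bit big-endian bit count, as used in the
+ * GHASH length block. */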
+static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz)
+{
+ /* Multiply the sz by 8 */
+ word32 szHi = (sz >> (8*sizeof(sz) - 3));
+ sz <<= 3;
+
+ /* copy over the words of the sz into the destination buffer */
+ buf[0] = (szHi >> 24) & 0xff;
+ buf[1] = (szHi >> 16) & 0xff;
+ buf[2] = (szHi >> 8) & 0xff;
+ buf[3] = szHi & 0xff;
+ buf[4] = (sz >> 24) & 0xff;
+ buf[5] = (sz >> 16) & 0xff;
+ buf[6] = (sz >> 8) & 0xff;
+ buf[7] = sz & 0xff;
+}
+
+#endif /* HAVE_AESGCM */
+
+/* Similar to the wolfSSL software implementation of expanding the AES key.
+ * The table look-ups have been replaced with the hardware AES instruction.
+ * The decryption key schedule is altered to match. */
+int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+{
+ word32 temp;
+ word32 *rk;
+ unsigned int i = 0;
+
+#if defined(AES_MAX_KEY_SIZE)
+ const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
+#endif
+
+ if (!((keylen == 16) || (keylen == 24) || (keylen == 32)) ||
+ aes == NULL || userKey == NULL)
+ return BAD_FUNC_ARG;
+
+ rk = aes->key;
+#if defined(AES_MAX_KEY_SIZE)
+ /* Check key length */
+ if (keylen > max_key_len) {
+ return BAD_FUNC_ARG;
+ }
+#endif
+
+ #ifdef WOLFSSL_AES_COUNTER
+ aes->left = 0;
+ #endif /* WOLFSSL_AES_COUNTER */
+
+ aes->rounds = keylen/4 + 6;
+ XMEMCPY(rk, userKey, keylen);
+
+ switch(keylen)
+ {
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \
+ defined(WOLFSSL_AES_128)
+ case 16:
+ while (1)
+ {
+ temp = rk[3];
+ SBOX(temp);
+ temp = rotrFixed(temp, 8);
+ rk[4] = rk[0] ^ temp ^ rcon[i];
+ rk[5] = rk[4] ^ rk[1];
+ rk[6] = rk[5] ^ rk[2];
+ rk[7] = rk[6] ^ rk[3];
+ if (++i == 10)
+ break;
+ rk += 4;
+ }
+ break;
+#endif /* 128 */
+
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192 && \
+ defined(WOLFSSL_AES_192)
+ case 24:
+ /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
+ while (1)
+ {
+ temp = rk[5];
+ SBOX(temp);
+ temp = rotrFixed(temp, 8);
+ rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8)
+ break;
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ break;
+#endif /* 192 */
+
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \
+ defined(WOLFSSL_AES_256)
+ case 32:
+ while (1)
+ {
+ temp = rk[7];
+ SBOX(temp);
+ temp = rotrFixed(temp, 8);
+ rk[8] = rk[0] ^ temp ^ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7)
+ break;
+ temp = rk[11];
+ SBOX(temp);
+ rk[12] = rk[ 4] ^ temp;
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ break;
+#endif /* 256 */
+
+ default:
+ return BAD_FUNC_ARG;
+ }
+
+ if (dir == AES_DECRYPTION)
+ {
+#ifdef HAVE_AES_DECRYPT
+ unsigned int j;
+ rk = aes->key;
+
+ /* invert the order of the round keys: */
+    for (i = 0, j = 4 * aes->rounds; i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the
+ first and the last: */
+ for (i = 1; i < aes->rounds; i++) {
+ rk += 4;
+ IMIX(rk);
+ }
+#else
+ WOLFSSL_MSG("AES Decryption not compiled in");
+ return BAD_FUNC_ARG;
+#endif /* HAVE_AES_DECRYPT */
+ }
+
+ return wc_AesSetIV(aes, iv);
+}
+
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+ int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+ {
+ return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+ }
+#endif
+
+/* wc_AesSetIV is shared between software and hardware */
+int wc_AesSetIV(Aes* aes, const byte* iv)
+{
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ if (iv)
+ XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE);
+ else
+ XMEMSET(aes->reg, 0, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+
+#ifdef __aarch64__
+/* AES CCM/GCM use encrypt direct but not decrypt */
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
+ defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+ static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+ {
+ word32* keyPt = aes->key;
+
+    /*
+        AESE XORs the input with the round key,
+        shifts the rows of the XORed result,
+        then substitutes bytes in the shifted rows.
+    */
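+    /* The first 10 rounds are fully unrolled; w12 = rounds - 10 selects two
+     * extra AESE/AESMC pairs for AES-192 and two more for AES-256. */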
+
+ __asm__ __volatile__ (
+ "LD1 {v0.16b}, [%[CtrIn]] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+
+ "#subtract rounds done so far and see if should continue\n"
+ "MOV w12, %w[R] \n"
+ "SUB w12, w12, #10 \n"
+ "CBZ w12, 1f \n"
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+
+ "SUB w12, w12, #2 \n"
+ "CBZ w12, 1f \n"
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+
+ "#Final AddRoundKey then store result \n"
+ "1: \n"
+ "LD1 {v1.2d}, [%[Key]], #16 \n"
+ "EOR v0.16b, v0.16b, v1.16b \n"
+ "ST1 {v0.16b}, [%[CtrOut]] \n"
+
+ :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+ "=r" (inBlock)
+ :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+ [CtrIn] "3" (inBlock)
+ : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
+ );
+
+ return 0;
+ }
+#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+ #ifdef HAVE_AES_DECRYPT
+ static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+ {
+ word32* keyPt = aes->key;
+
+    /*
+        AESD XORs the input with the round key,
+        inverse-shifts the rows of the XORed result,
+        then inverse-substitutes bytes in the shifted rows.
+    */
+
+ __asm__ __volatile__ (
+ "LD1 {v0.16b}, [%[CtrIn]] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v3.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v4.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v3.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v4.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+
+ "#subtract rounds done so far and see if should continue\n"
+ "MOV w12, %w[R] \n"
+ "SUB w12, w12, #10 \n"
+ "CBZ w12, 1f \n"
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+
+ "SUB w12, w12, #2 \n"
+ "CBZ w12, 1f \n"
+ "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+
+ "#Final AddRoundKey then store result \n"
+ "1: \n"
+ "LD1 {v1.2d}, [%[Key]], #16 \n"
+ "EOR v0.16b, v0.16b, v1.16b \n"
+ "ST1 {v0.4s}, [%[CtrOut]] \n"
+
+ :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+ "=r" (inBlock)
+ :[Key] "1" (aes->key), "0" (outBlock), [R] "2" (aes->rounds),
+ [CtrIn] "3" (inBlock)
+ : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
+ );
+
+ return 0;
+}
+ #endif /* HAVE_AES_DECRYPT */
+#endif /* DIRECT or COUNTER */
+
+/* AES-CBC */
+#ifdef HAVE_AES_CBC
+ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+ if (aes == NULL || out == NULL || (in == NULL && sz > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* do as many block size ops as possible */
+ if (numBlocks > 0) {
+ word32* key = aes->key;
+ word32* reg = aes->reg;
+            /*
+            AESE XORs the input with the round key,
+            shifts the rows of the XORed result,
+            then substitutes bytes in the shifted rows.
+
+            note: grouping AESE & AESMC together as pairs reduces latency
+            */
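+            /* CBC encryption is inherently serial (each block is chained
+             * with the previous ciphertext), so blocks are processed one at
+             * a time. */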
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n"
+ "LD1 {v0.2d}, [%[reg]] \n"
+
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "1:\n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "SUB w11, w11, #1 \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter value at the end \n"
+ "ST1 {v0.2d}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in)
+ :"0" (out), [Key] "r" (key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, %[Key], #64 \n"
+ "LD1 {v5.2d-v8.2d}, %[Key], #64 \n"
+ "LD1 {v9.2d-v12.2d},%[Key], #64 \n"
+ "LD1 {v13.2d}, %[Key], #16 \n"
+ "LD1 {v0.2d}, %[reg] \n"
+
+ "LD1 {v14.2d}, [%[input]], #16 \n"
+ "1:\n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "EOR v0.16b, v0.16b, v14.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "EOR v0.16b, v0.16b, v13.16b \n"
+ "SUB w11, w11, #1 \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v14.2d}, [%[input]], #16\n"
+ "B 1b \n"
+
+ "2:\n"
+                "#store last ciphertext block as the new chaining value \n"
+ "ST1 {v0.2d}, %[regOut] \n"
+
+
+ :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in)
+ :"0" (out), [Key] "m" (aes->key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "m" (aes->reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+ );
+ break;
+#endif /* WOLFSSL_AES_192*/
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, %[Key], #64 \n"
+
+ "LD1 {v5.2d-v8.2d}, %[Key], #64 \n"
+ "LD1 {v9.2d-v12.2d}, %[Key], #64 \n"
+ "LD1 {v13.2d-v15.2d}, %[Key], #48 \n"
+ "LD1 {v0.2d}, %[reg] \n"
+
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "1: \n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "EOR v0.16b, v0.16b, v16.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v13.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v14.16b \n"
+ "EOR v0.16b, v0.16b, v15.16b \n"
+ "SUB w11, w11, #1 \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2: \n"
+                "#store last ciphertext block as the new chaining value \n"
+ "ST1 {v0.2d}, %[regOut] \n"
+
+
+ :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in)
+ :"0" (out), [Key] "m" (aes->key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "m" (aes->reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15",
+ "v16"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CBC round value");
+ return BAD_FUNC_ARG;
+ }
+ }
+
+ return 0;
+ }
+
+ #ifdef HAVE_AES_DECRYPT
+ int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+ if (aes == NULL || out == NULL || (in == NULL && sz > 0)
+ || sz % AES_BLOCK_SIZE != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* do as many block size ops as possible */
+ if (numBlocks > 0) {
+ word32* key = aes->key;
+ word32* reg = aes->reg;
+
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n"
+ "LD1 {v13.2d}, [%[reg]] \n"
+
+ "1:\n"
+ "LD1 {v0.2d}, [%[input]], #16 \n"
+ "MOV v12.16b, v0.16b \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v3.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v4.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v5.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v6.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v7.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v8.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v9.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v10.16b \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+
+ "EOR v0.16b, v0.16b, v13.16b \n"
+ "SUB w11, w11, #1 \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v13.16b, v12.16b \n"
+
+ "CBZ w11, 2f \n"
+ "B 1b \n"
+
+ "2: \n"
+                "#store last ciphertext block as the new chaining value \n"
+ "ST1 {v13.2d}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in)
+ :"0" (out), [Key] "r" (key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "LD1 {v9.2d-v12.2d},[%[Key]], #64 \n"
+ "LD1 {v13.16b}, [%[Key]], #16 \n"
+ "LD1 {v15.2d}, [%[reg]] \n"
+
+ "LD1 {v0.2d}, [%[input]], #16 \n"
+ "1: \n"
+ "MOV v14.16b, v0.16b \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v3.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v4.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v5.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v6.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v7.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v8.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v9.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v10.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v11.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v12.16b \n"
+ "EOR v0.16b, v0.16b, v13.16b \n"
+
+ "EOR v0.16b, v0.16b, v15.16b \n"
+ "SUB w11, w11, #1 \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v15.16b, v14.16b \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v0.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2:\n"
+                "#store last ciphertext block as the new chaining value \n"
+ "ST1 {v15.2d}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in)
+ :"0" (out), [Key] "r" (key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n"
+ "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+ "LD1 {v17.2d}, [%[reg]] \n"
+
+ "LD1 {v0.2d}, [%[input]], #16 \n"
+ "1: \n"
+ "MOV v16.16b, v0.16b \n"
+ "AESD v0.16b, v1.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v2.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v3.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v4.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v5.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v6.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v7.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v8.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v9.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v10.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v11.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v12.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v13.16b \n"
+ "AESIMC v0.16b, v0.16b \n"
+ "AESD v0.16b, v14.16b \n"
+ "EOR v0.16b, v0.16b, v15.16b \n"
+
+ "EOR v0.16b, v0.16b, v17.16b \n"
+ "SUB w11, w11, #1 \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v17.16b, v16.16b \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v0.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2:\n"
+                "#store last ciphertext block as the new chaining value \n"
+ "ST1 {v17.2d}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in)
+ :"0" (out), [Key] "r" (key), [input] "2" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15",
+ "v16", "v17"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CBC round value");
+ return BAD_FUNC_ARG;
+ }
+ }
+
+ return 0;
+ }
+ #endif
+
+#endif /* HAVE_AES_CBC */
+
+/* AES-CTR */
+#ifdef WOLFSSL_AES_COUNTER
+
+ /* Increment AES counter */
+ static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+ {
+ int i;
+
+ /* in network byte order so start at end and work back */
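+        /* e.g. a counter ending in ...FF FF becomes ...00 00 with the
+         * carry moving into the next higher byte (big-endian increment) */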
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+ }
+
+ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ byte* tmp;
+ word32 numBlocks;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+
+ /* consume any unused bytes left in aes->tmp */
+ while (aes->left && sz) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ sz--;
+ }
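+        /* Example: a previous call that left aes->left == 12 has 12
+         * unused keystream bytes cached in aes->tmp; they are XORed with
+         * input here before any new counter block is generated. */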
+
+ /* do as many block size ops as possible */
+ numBlocks = sz/AES_BLOCK_SIZE;
+ if (numBlocks > 0) {
+        /* use a local pointer because the assembly advances it while
+         * reading round keys; advancing aes->key itself would break the
+         * leftover encrypt below */
+ byte* keyPt = (byte*)aes->key;
+ sz -= numBlocks * AES_BLOCK_SIZE;
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v15.16b, #1 \n"
+ "USHR v15.2d, v15.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v14.16b, v14.16b, v14.16b \n"
+ "EXT v14.16b, v15.16b, v14.16b, #8\n"
+
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "LD1 {v13.2d}, %[reg] \n"
+
+ /* double block */
+ "1: \n"
+ "CMP w11, #1 \n"
+ "BEQ 2f \n"
+ "CMP w11, #0 \n"
+ "BEQ 3f \n"
+
+ "MOV v0.16b, v13.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "SUB w11, w11, #2 \n"
+ "ADD v15.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "ADD v13.2d, v15.2d, v14.2d \n" /* add 1 to counter */
+
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v15.16b, v15.16b \n" /* revert from network order */
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v15.16b, v1.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v15.16b, v2.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v15.16b, v3.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v15.16b, v4.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v15.16b, v5.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "AESE v0.16b, v10.16b \n"
+ "AESE v15.16b, v6.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "EOR v0.16b, v0.16b, v11.16b \n"
+ "AESE v15.16b, v7.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "AESE v15.16b, v8.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "AESE v15.16b, v9.16b \n"
+ "AESMC v15.16b, v15.16b \n"
+
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "AESE v15.16b, v10.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v15.16b, v15.16b, v11.16b \n"
+ "EOR v15.16b, v15.16b, v12.16b \n"
+ "ST1 {v15.2d}, [%[out]], #16 \n"
+
+ "B 1b \n"
+
+ /* single block */
+ "2: \n"
+ "MOV v0.16b, v13.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+ "#CTR operations, increment counter and xorbuf \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "3: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v13.2d}, %[regOut] \n"
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "m" (aes->reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v16.16b, #1 \n"
+ "USHR v16.2d, v16.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v14.16b, v14.16b, v14.16b \n"
+ "EXT v16.16b, v16.16b, v14.16b, #8\n"
+
+ "LD1 {v9.2d-v12.2d}, [%[Key]], #64\n"
+ "LD1 {v15.2d}, %[reg] \n"
+ "LD1 {v13.16b}, [%[Key]], #16 \n"
+
+ /* double block */
+ "1: \n"
+ "CMP w11, #1 \n"
+ "BEQ 2f \n"
+ "CMP w11, #0 \n"
+ "BEQ 3f \n"
+
+ "MOV v0.16b, v15.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v15.16b, v15.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+ "SUB w11, w11, #2 \n"
+ "ADD v17.2d, v15.2d, v16.2d \n" /* add 1 to counter */
+ "ADD v15.2d, v17.2d, v16.2d \n" /* add 1 to counter */
+
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v17.16b, v17.16b \n" /* revert from network order */
+ "REV64 v15.16b, v15.16b \n" /* revert from network order */
+
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v1.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v2.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v3.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v4.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v5.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v6.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v17.16b, v7.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "AESE v0.16b, v12.16b \n"
+ "AESE v17.16b, v8.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "EOR v0.16b, v0.16b, v13.16b \n"
+ "AESE v17.16b, v9.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "LD1 {v14.2d}, [%[input]], #16 \n"
+ "AESE v17.16b, v10.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "EOR v0.16b, v0.16b, v14.16b \n"
+ "AESE v17.16b, v11.16b \n"
+ "AESMC v17.16b, v17.16b \n"
+
+ "LD1 {v14.2d}, [%[input]], #16 \n"
+ "AESE v17.16b, v12.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v17.16b, v17.16b, v13.16b \n"
+ "EOR v17.16b, v17.16b, v14.16b \n"
+ "ST1 {v17.2d}, [%[out]], #16 \n"
+
+ "B 1b \n"
+
+ "2: \n"
+ "LD1 {v14.2d}, [%[input]], #16 \n"
+ "MOV v0.16b, v15.16b \n"
+
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v15.16b, v15.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "ADD v15.2d, v15.2d, v16.2d \n" /* add 1 to counter */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v15.16b, v15.16b \n" /* revert from network order */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "EOR v0.16b, v0.16b, v13.16b \n"
+ "#CTR operations, increment counter and xorbuf \n"
+ "EOR v0.16b, v0.16b, v14.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "3: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v15.2d}, %[regOut] \n"
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "m" (aes->reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
+ "v16", "v17"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v18.16b, #1 \n"
+ "USHR v18.2d, v18.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v19.16b, v19.16b, v19.16b \n"
+ "EXT v18.16b, v18.16b, v19.16b, #8\n"
+
+ "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n"
+ "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+ "LD1 {v17.2d}, %[reg] \n"
+
+ /* double block */
+ "1: \n"
+ "CMP w11, #1 \n"
+ "BEQ 2f \n"
+ "CMP w11, #0 \n"
+ "BEQ 3f \n"
+
+ "MOV v0.16b, v17.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v17.16b, v17.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "SUB w11, w11, #2 \n"
+ "ADD v19.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+ "ADD v17.2d, v19.2d, v18.2d \n" /* add 1 to counter */
+
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v19.16b, v19.16b, v19.16b, #8 \n"
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v19.16b, v19.16b \n" /* revert from network order */
+ "REV64 v17.16b, v17.16b \n" /* revert from network order */
+
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v1.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v2.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v3.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v4.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v5.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v6.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v7.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v12.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v8.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v13.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v19.16b, v9.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "AESE v0.16b, v14.16b \n"
+ "AESE v19.16b, v10.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "EOR v0.16b, v0.16b, v15.16b \n"
+ "AESE v19.16b, v11.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "AESE v19.16b, v12.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "EOR v0.16b, v0.16b, v16.16b \n"
+ "AESE v19.16b, v13.16b \n"
+ "AESMC v19.16b, v19.16b \n"
+
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "AESE v19.16b, v14.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v19.16b, v19.16b, v15.16b \n"
+ "EOR v19.16b, v19.16b, v16.16b \n"
+ "ST1 {v19.2d}, [%[out]], #16 \n"
+
+ "B 1b \n"
+
+ "2: \n"
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "MOV v0.16b, v17.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v17.16b, v17.16b \n" /* network order */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "REV64 v17.16b, v17.16b \n" /* revert from network order */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v13.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v14.16b \n"
+ "EOR v0.16b, v0.16b, v15.16b \n"
+ "#CTR operations, increment counter and xorbuf \n"
+ "EOR v0.16b, v0.16b, v16.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "3: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v17.2d}, %[regOut] \n"
+
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "m" (aes->reg)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
+ "v16", "v17", "v18", "v19"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CTR round value");
+ return BAD_FUNC_ARG;
+ }
+
+ aes->left = 0;
+ }
+
+    /* handle any remaining bytes smaller than a block */
+ if (sz) {
+ wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp);
+ IncrementAesCounter((byte*)aes->reg);
+
+ aes->left = AES_BLOCK_SIZE;
+ tmp = (byte*)aes->tmp;
+
+ while (sz--) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ }
+ }
+ return 0;
+ }
+
+#endif /* WOLFSSL_AES_COUNTER */
+
+#ifdef HAVE_AESGCM
+
+/*
+ * Based on the GCM implementation in wolfcrypt/src/aes.c
+ */
+
+/* PMULL and RBIT only with AArch64 */
+/* Use ARM hardware for polynomial multiply */
+static void GMULT(byte* X, byte* Y)
+{
+ __asm__ volatile (
+ "LD1 {v0.16b}, [%[inX]] \n"
+ "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
+ "RBIT v0.16b, v0.16b \n"
+
+
+ /* Algorithm 1 from Intel GCM white paper.
+ "Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
+ */
+ "PMULL v3.1q, v0.1d, v1.1d \n" /* a0 * b0 = C */
+ "PMULL2 v4.1q, v0.2d, v1.2d \n" /* a1 * b1 = D */
+ "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
+ "PMULL v6.1q, v0.1d, v5.1d \n" /* a0 * b1 = E */
+ "PMULL2 v5.1q, v0.2d, v5.2d \n" /* a1 * b0 = F */
+
+ "#Set a register to all 0s using EOR \n"
+ "EOR v7.16b, v7.16b, v7.16b \n"
+ "EOR v5.16b, v5.16b, v6.16b \n" /* F ^ E */
+ "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
+ "EOR v3.16b, v3.16b, v6.16b \n" /* low 128 bits in v3 */
+ "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
+ "EOR v4.16b, v4.16b, v6.16b \n" /* high 128 bits in v4 */
+
+
+    /* Based on the white paper "Implementing GCM on ARMv8"
+       by Conrado P.L. Gouvea and Julio Lopez:
+       reduction of the 256-bit product using Algorithm 5 */
+ "MOVI v8.16b, #0x87 \n"
+ "USHR v8.2d, v8.2d, #56 \n"
+    /* v8 is now 0x00000000000000870000000000000087; 0x87 is the
+       bit-reflected form of the usual 0xe1... reduction constant */
+ "PMULL2 v5.1q, v4.2d, v8.2d \n"
+ "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
+ "EOR v4.16b, v4.16b, v6.16b \n"
+ "EXT v6.16b, v7.16b, v5.16b, #8 \n"
+ "EOR v3.16b, v3.16b, v6.16b \n"
+ "PMULL v5.1q, v4.1d, v8.1d \n"
+ "EOR v4.16b, v3.16b, v5.16b \n"
+
+ "RBIT v4.16b, v4.16b \n"
+ "STR q4, [%[out]] \n"
+ : [out] "=r" (X), "=r" (Y)
+ : [inX] "0" (X), [inY] "1" (Y)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"
+ );
+}
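+
+/* Bit-serial reference model of the multiply above, a sketch for
+ * illustration only (modeled on the small, table-free GHASH path in
+ * wolfcrypt/src/aes.c; it is not compiled into this port). GHASH
+ * multiplies in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; in GCM's
+ * bit-reflected representation the reduction folds in as the byte 0xE1,
+ * which is why the assembly brackets the PMULL sequence with RBIT and
+ * builds the 0x87 constant. */
+#if 0
+static void GmultRef(byte* X, const byte* Y)
+{
+    byte Z[AES_BLOCK_SIZE];
+    byte V[AES_BLOCK_SIZE];
+    byte carry, next, y;
+    int  i, j, k;
+
+    XMEMSET(Z, 0, AES_BLOCK_SIZE);
+    XMEMCPY(V, X, AES_BLOCK_SIZE);
+
+    for (i = 0; i < AES_BLOCK_SIZE; i++) {
+        y = Y[i];
+        for (j = 0; j < 8; j++) {
+            if (y & 0x80) {
+                xorbuf(Z, V, AES_BLOCK_SIZE); /* Z ^= V when the bit is set */
+            }
+            /* V = V * x: shift right one bit (reflected form), folding the
+             * polynomial back in when a bit falls off the low end */
+            carry = 0;
+            for (k = 0; k < AES_BLOCK_SIZE; k++) {
+                next = (byte)(V[k] & 0x01);
+                V[k] = (byte)((V[k] >> 1) | (carry << 7));
+                carry = next;
+            }
+            if (carry)
+                V[0] ^= 0xE1;
+            y = (byte)(y << 1);
+        }
+    }
+    XMEMCPY(X, Z, AES_BLOCK_SIZE);
+}
+#endif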
+
+
+void GHASH(Aes* aes, const byte* a, word32 aSz,
+ const byte* c, word32 cSz, byte* s, word32 sSz)
+{
+ byte x[AES_BLOCK_SIZE];
+ byte scratch[AES_BLOCK_SIZE];
+ word32 blocks, partial;
+ byte* h = aes->H;
+
+ XMEMSET(x, 0, AES_BLOCK_SIZE);
+
+ /* Hash in A, the Additional Authentication Data */
+ if (aSz != 0 && a != NULL) {
+ blocks = aSz / AES_BLOCK_SIZE;
+ partial = aSz % AES_BLOCK_SIZE;
+ /* do as many blocks as possible */
+ while (blocks--) {
+ xorbuf(x, a, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ a += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, a, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ }
+ }
+
+ /* Hash in C, the Ciphertext */
+ if (cSz != 0 && c != NULL) {
+ blocks = cSz / AES_BLOCK_SIZE;
+ partial = cSz % AES_BLOCK_SIZE;
+ while (blocks--) {
+ xorbuf(x, c, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ c += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, c, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ }
+ }
+
+ /* Hash in the lengths of A and C in bits */
+ FlattenSzInBits(&scratch[0], aSz);
+ FlattenSzInBits(&scratch[8], cSz);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+
+    /* Copy the result into s; the final GMULT is left to the caller. */
+ XMEMCPY(s, x, sSz);
+}
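+
+/* For reference, the tag composition implemented by the encrypt routines
+ * below (a sketch): S = GHASH_H(A, C) including the bit-length block, and
+ * T = E(K, J0) XOR S, where J0 is the initial counter block. Each
+ * Aes*GcmEncrypt folds the final GMULT and the XOR with E(K, J0) into its
+ * closing assembly sequence. */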
+
+
+#ifdef WOLFSSL_AES_128
+/* internal function : see wc_AesGcmEncrypt */
+static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks;
+ word32 partial;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte x[AES_BLOCK_SIZE];
+ byte scratch[AES_BLOCK_SIZE];
+
+    /* Different optimization levels treat the head of a local array
+    differently: sometimes it is a stack-pointer-plus-offset expression,
+    other times a register holding the address. To pass a uniform operand
+    into the inline assembly, use explicit pointers to the head of each
+    local array.
+    */
+ byte* ctr = counter;
+ byte* iCtr = initialCounter;
+ byte* xPt = x;
+ byte* sPt = scratch;
+    byte* keyPt; /* pointer to handle pointer advancement */
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ GMULT(initialCounter, aes->H);
+ }
+ XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+
+ /* Hash in the Additional Authentication Data */
+ XMEMSET(x, 0, AES_BLOCK_SIZE);
+ if (authInSz != 0 && authIn != NULL) {
+ blocks = authInSz / AES_BLOCK_SIZE;
+ partial = authInSz % AES_BLOCK_SIZE;
+ /* do as many blocks as possible */
+ while (blocks--) {
+ xorbuf(x, authIn, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ authIn += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, authIn, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+ }
+
+ /* do as many blocks as possible */
+ blocks = sz / AES_BLOCK_SIZE;
+ partial = sz % AES_BLOCK_SIZE;
+ if (blocks > 0) {
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v13.2d}, [%[ctr]] \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v14.16b, #1 \n"
+ "USHR v14.2d, v14.2d, #56 \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+
+ /***************************************************
+ Get first out block for GHASH using AES encrypt
+ ***************************************************/
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "MOV v0.16b, v13.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v16.2d}, %[inY] \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block, jump to final GHASH */
+
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+
+ /***************************************************
+             Interweave GHASH and encrypt if more than 1 block
+ ***************************************************/
+ "2: \n"
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "MOV v0.16b, v13.16b \n"
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "MOV v15.16b, v0.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+
+ "CBZ w11, 1f \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "B 2b \n"
+
+ /***************************************************
+ GHASH on last block
+ ***************************************************/
+ "1: \n"
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+ "#store current AES counter value \n"
+ "ST1 {v13.2d}, [%[ctrOut]] \n"
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+
+ "#Reduce product from multiplication \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+ "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+ :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+ ,[xOut] "=r" (xPt),"=m" (aes->H)
+ :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (in)
+ ,[inX] "4" (xPt), [inY] "m" (aes->H)
+ : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+ ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
+ );
+ }
+
+    /* take care of any leftover partial block */
+ if (partial != 0) {
+ IncrementGcmCounter(counter);
+ wc_AesEncrypt(aes, counter, scratch);
+ xorbuf(scratch, in, partial);
+ XMEMCPY(out, scratch, partial);
+
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, out, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+
+ /* Hash in the lengths of A and C in bits */
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ FlattenSzInBits(&scratch[0], authInSz);
+ FlattenSzInBits(&scratch[8], sz);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+
+ "LD1 {v16.16b}, [%[tag]] \n"
+ "LD1 {v17.16b}, %[h] \n"
+ "RBIT v16.16b, v16.16b \n"
+
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */
+ "LD1 {v0.2d}, [%[ctr]] \n"
+
+ "#Set a register to all 0s using EOR \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "RBIT v19.16b, v19.16b \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+ "EOR v19.16b, v19.16b, v0.16b \n"
+ "STR q19, [%[out]] \n"
+
+ :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+ :[tag] "0" (sPt), [Key] "1" (keyPt),
+ [ctr] "2" (iCtr) , [h] "m" (aes->H)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14",
+ "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24"
+ );
+
+
+ if (authTagSz > AES_BLOCK_SIZE) {
+ XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
+ }
+ else {
+ /* authTagSz can be smaller than AES_BLOCK_SIZE */
+ XMEMCPY(authTag, scratch, authTagSz);
+ }
+ return 0;
+}
+#endif /* WOLFSSL_AES_128 */
+
+#ifdef WOLFSSL_AES_192
+/* internal function : see wc_AesGcmEncrypt */
+static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks;
+ word32 partial;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte x[AES_BLOCK_SIZE];
+ byte scratch[AES_BLOCK_SIZE];
+
+    /* Different optimization levels treat the head of a local array
+    differently: sometimes it is a stack-pointer-plus-offset expression,
+    other times a register holding the address. To pass a uniform operand
+    into the inline assembly, use explicit pointers to the head of each
+    local array.
+    */
+ byte* ctr = counter;
+ byte* iCtr = initialCounter;
+ byte* xPt = x;
+ byte* sPt = scratch;
+    byte* keyPt; /* pointer to handle pointer advancement */
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ GMULT(initialCounter, aes->H);
+ }
+ XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+
+ /* Hash in the Additional Authentication Data */
+ XMEMSET(x, 0, AES_BLOCK_SIZE);
+ if (authInSz != 0 && authIn != NULL) {
+ blocks = authInSz / AES_BLOCK_SIZE;
+ partial = authInSz % AES_BLOCK_SIZE;
+ /* do as many blocks as possible */
+ while (blocks--) {
+ xorbuf(x, authIn, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ authIn += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, authIn, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+ }
+
+ /* do as many blocks as possible */
+ blocks = sz / AES_BLOCK_SIZE;
+ partial = sz % AES_BLOCK_SIZE;
+ if (blocks > 0) {
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v13.2d}, [%[ctr]] \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v14.16b, #1 \n"
+ "USHR v14.2d, v14.2d, #56 \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+
+ /***************************************************
+ Get first out block for GHASH using AES encrypt
+ ***************************************************/
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "MOV v0.16b, v13.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v16.2d}, %[inY] \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block, jump to final GHASH */
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+
+ /***************************************************
+             Interweave GHASH and encrypt if more than 1 block
+ ***************************************************/
+ "2: \n"
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "MOV v0.16b, v13.16b \n"
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "MOV v15.16b, v0.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+
+ "CBZ w11, 1f \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "B 2b \n"
+
+ /***************************************************
+ GHASH on last block
+ ***************************************************/
+ "1: \n"
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+ "#store current AES counter value \n"
+ "ST1 {v13.2d}, [%[ctrOut]] \n"
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+
+ "#Reduce product from multiplication \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+ "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+ :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+ ,[xOut] "=r" (xPt),"=m" (aes->H)
+ :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (in)
+ ,[inX] "4" (xPt), [inY] "m" (aes->H)
+ : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+ ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v24","v25","v26","v27","v28","v29","v30","v31"
+ );
+ }
+
+    /* take care of any leftover partial block */
+ if (partial != 0) {
+ IncrementGcmCounter(counter);
+ wc_AesEncrypt(aes, counter, scratch);
+ xorbuf(scratch, in, partial);
+ XMEMCPY(out, scratch, partial);
+
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, out, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+
+ /* Hash in the lengths of A and C in bits */
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ FlattenSzInBits(&scratch[0], authInSz);
+ FlattenSzInBits(&scratch[8], sz);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+
+ "LD1 {v16.16b}, [%[tag]] \n"
+ "LD1 {v17.16b}, %[h] \n"
+ "RBIT v16.16b, v16.16b \n"
+
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n"
+ "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */
+ "LD1 {v0.2d}, [%[ctr]] \n"
+
+ "#Set a register to all 0s using EOR \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "RBIT v19.16b, v19.16b \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+ "EOR v19.16b, v19.16b, v0.16b \n"
+ "STR q19, [%[out]] \n"
+
+ :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+ :[tag] "0" (sPt), [Key] "1" (keyPt),
+ [ctr] "2" (iCtr) , [h] "m" (aes->H)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14",
+ "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24"
+ );
+
+
+ if (authTagSz > AES_BLOCK_SIZE) {
+ XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
+ }
+ else {
+ /* authTagSz can be smaller than AES_BLOCK_SIZE */
+ XMEMCPY(authTag, scratch, authTagSz);
+ }
+
+ return 0;
+}
+#endif /* WOLFSSL_AES_192 */
+
+#ifdef WOLFSSL_AES_256
+/* internal function : see wc_AesGcmEncrypt */
+static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks;
+ word32 partial;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte x[AES_BLOCK_SIZE];
+ byte scratch[AES_BLOCK_SIZE];
+
+    /* Different optimization levels treat the head of a local array
+    differently: sometimes it is a stack-pointer-plus-offset expression,
+    other times a register holding the address. To pass a uniform operand
+    into the inline assembly, use explicit pointers to the head of each
+    local array.
+    */
+ byte* ctr = counter;
+ byte* iCtr = initialCounter;
+ byte* xPt = x;
+ byte* sPt = scratch;
+    byte* keyPt; /* pointer to handle pointer advancement */
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ GMULT(initialCounter, aes->H);
+ }
+ XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+
+ /* Hash in the Additional Authentication Data */
+ XMEMSET(x, 0, AES_BLOCK_SIZE);
+ if (authInSz != 0 && authIn != NULL) {
+ blocks = authInSz / AES_BLOCK_SIZE;
+ partial = authInSz % AES_BLOCK_SIZE;
+ /* do as many blocks as possible */
+ while (blocks--) {
+ xorbuf(x, authIn, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ authIn += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, authIn, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+ }
+
+ /* do as many blocks as possible */
+ blocks = sz / AES_BLOCK_SIZE;
+ partial = sz % AES_BLOCK_SIZE;
+ if (blocks > 0) {
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v13.2d}, [%[ctr]] \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v14.16b, #1 \n"
+ "USHR v14.2d, v14.2d, #56 \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+
+ /***************************************************
+ Get first out block for GHASH using AES encrypt
+ ***************************************************/
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "MOV v0.16b, v13.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v16.2d}, %[inY] \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v28.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v29.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block, jump to final GHASH */
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+
+ /***************************************************
+             Interweave GHASH and encrypt if more than 1 block
+ ***************************************************/
+ "2: \n"
+ "REV64 v13.16b, v13.16b \n" /* network order */
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+ "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+ "REV64 v13.16b, v13.16b \n" /* revert from network order */
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "MOV v0.16b, v13.16b \n"
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v28.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v29.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+
+ "EOR v0.16b, v0.16b, v12.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "MOV v15.16b, v0.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+
+ "CBZ w11, 1f \n"
+ "LD1 {v12.2d}, [%[input]], #16 \n"
+ "B 2b \n"
+
+ /***************************************************
+ GHASH on last block
+ ***************************************************/
+ "1: \n"
+ "EOR v15.16b, v17.16b, v15.16b \n"
+ "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+ "#store current AES counter value \n"
+ "ST1 {v13.2d}, [%[ctrOut]] \n"
+ "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */
+ "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+ "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+
+ "#Reduce product from multiplication \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "RBIT v17.16b, v19.16b \n"
+ "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+ :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+ ,[xOut] "=r" (xPt),"=m" (aes->H)
+ :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (in)
+ ,[inX] "4" (xPt), [inY] "m" (aes->H)
+ : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+ ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
+ );
+ }
+
+    /* take care of any leftover partial block */
+ if (partial != 0) {
+ IncrementGcmCounter(counter);
+ wc_AesEncrypt(aes, counter, scratch);
+ xorbuf(scratch, in, partial);
+ XMEMCPY(out, scratch, partial);
+
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, out, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, aes->H);
+ }
+
+ /* Hash in the lengths of A and C in bits */
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ FlattenSzInBits(&scratch[0], authInSz);
+ FlattenSzInBits(&scratch[8], sz);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+ keyPt = (byte*)aes->key;
+ __asm__ __volatile__ (
+
+ "LD1 {v16.16b}, [%[tag]] \n"
+ "LD1 {v17.16b}, %[h] \n"
+ "RBIT v16.16b, v16.16b \n"
+
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+ "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */
+ "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+ "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n"
+ "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */
+ "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */
+ "LD1 {v0.2d}, [%[ctr]] \n"
+
+ "#Set a register to all 0s using EOR \n"
+ "EOR v22.16b, v22.16b, v22.16b \n"
+ "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */
+ "MOVI v23.16b, #0x87 \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "USHR v23.2d, v23.2d, #56 \n"
+ "PMULL2 v20.1q, v19.2d, v23.2d \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v19.16b, v19.16b, v21.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "EOR v18.16b, v18.16b, v21.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v28.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v29.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "PMULL v20.1q, v19.1d, v23.1d \n"
+ "EOR v19.16b, v18.16b, v20.16b \n"
+ "AESE v0.16b, v30.16b \n"
+ "RBIT v19.16b, v19.16b \n"
+ "EOR v0.16b, v0.16b, v31.16b \n"
+ "EOR v19.16b, v19.16b, v0.16b \n"
+ "STR q19, [%[out]] \n"
+
+ :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+ :[tag] "0" (sPt), [Key] "1" (keyPt),
+ [ctr] "2" (iCtr) , [h] "m" (aes->H)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14",
+ "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23",
+ "v24","v25","v26","v27","v28","v29","v30","v31"
+ );
+
+
+ if (authTagSz > AES_BLOCK_SIZE) {
+ XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
+ }
+ else {
+ /* authTagSz can be smaller than AES_BLOCK_SIZE */
+ XMEMCPY(authTag, scratch, authTagSz);
+ }
+
+ return 0;
+}
+#endif /* WOLFSSL_AES_256 */
+
+
+/* aarch64 with PMULL and PMULL2
+ * Encrypt and tag data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: encrypted data output buffer
+ * in: plain text input buffer
+ * sz: size of plain text and out buffer
+ * iv: initialization vector
+ * ivSz: size of iv buffer
+ * authTag: buffer to hold tag
+ * authTagSz: size of tag buffer
+ * authIn: additional data buffer
+ * authInSz: size of additional data buffer
+ *
+ * Notes:
+ * GHASH multiplication is based on Algorithm 1 from the Intel white paper
+ * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode".
+ *
+ * GHASH reduction is based on Algorithm 5 from the white paper
+ * "Implementing GCM on ARMv8" by Conrado P.L. Gouvea and Julio Lopez,
+ * which reduces the 256-bit product.
+ */
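+/* Example usage (an illustrative sketch only; the key, iv, plain and aad
+ * buffers and their sizes are hypothetical, error handling is omitted):
+ *
+ *     Aes aes;
+ *     byte tag[AES_BLOCK_SIZE];
+ *     byte cipher[64];
+ *     if (wc_AesGcmSetKey(&aes, key, 32) == 0) {
+ *         wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(cipher),
+ *                          iv, GCM_NONCE_MID_SZ,
+ *                          tag, sizeof(tag), aad, aadSz);
+ *     }
+ */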
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ /* sanity checks */
+ if (aes == NULL || (iv == NULL && ivSz > 0) ||
+ (authTag == NULL) ||
+ (authIn == NULL && authInSz > 0) ||
+ (in == NULL && sz > 0) ||
+ (out == NULL && sz > 0)) {
+ WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+ return BAD_FUNC_ARG;
+ }
+
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) {
+ WOLFSSL_MSG("GcmEncrypt authTagSz error");
+ return BAD_FUNC_ARG;
+ }
+
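+    /* aes->rounds is 10, 12 or 14 for 128, 192 and 256-bit keys
+     * respectively, so it selects the key-size specific implementation */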
+ switch (aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10:
+ return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz,
+ authTag, authTagSz, authIn, authInSz);
+#endif
+#ifdef WOLFSSL_AES_192
+ case 12:
+ return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz,
+ authTag, authTagSz, authIn, authInSz);
+#endif
+#ifdef WOLFSSL_AES_256
+ case 14:
+ return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz,
+ authTag, authTagSz, authIn, authInSz);
+#endif
+ default:
+ WOLFSSL_MSG("AES-GCM invalid round number");
+ return BAD_FUNC_ARG;
+ }
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/*
+ * Check tag and decrypt data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: decrypted data output buffer
+ * in: cipher text buffer
+ * sz: size of cipher text and out buffer
+ * iv: initialization vector
+ * ivSz: size of iv buffer
+ * authTag: buffer holding tag
+ * authTagSz: size of tag buffer
+ * authIn: additional data buffer
+ * authInSz: size of additional data buffer
+ */
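+/* Example verification flow (an illustrative sketch; buffer names are
+ * hypothetical). A failed tag check returns AES_GCM_AUTH_E and the output
+ * buffer must then be treated as invalid:
+ *
+ *     int ret = wc_AesGcmDecrypt(&aes, plain, cipher, cipherSz,
+ *                                iv, GCM_NONCE_MID_SZ,
+ *                                tag, sizeof(tag), aad, aadSz);
+ *     if (ret == AES_GCM_AUTH_E) {
+ *         // authentication failed; do not use plain
+ *     }
+ */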
+int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks = sz / AES_BLOCK_SIZE;
+ word32 partial = sz % AES_BLOCK_SIZE;
+ const byte* c = in;
+ byte* p = out;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte *ctr ;
+ byte scratch[AES_BLOCK_SIZE];
+
+ ctr = counter ;
+
+ /* sanity checks */
+ if (aes == NULL || (iv == NULL && ivSz > 0) ||
+ (authTag == NULL) ||
+ (authIn == NULL && authInSz > 0) ||
+ (in == NULL && sz > 0) ||
+ (out == NULL && sz > 0)) {
+ WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ GMULT(initialCounter, aes->H);
+ }
+ XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
+
+ /* Calculate the authTag again using the received auth data and the
+ * cipher text. */
+ {
+ byte Tprime[AES_BLOCK_SIZE];
+ byte EKY0[AES_BLOCK_SIZE];
+
+ GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime));
+ GMULT(Tprime, aes->H);
+ wc_AesEncrypt(aes, ctr, EKY0);
+ xorbuf(Tprime, EKY0, sizeof(Tprime));
+
+ if (ConstantCompare(authTag, Tprime, authTagSz) != 0) {
+ return AES_GCM_AUTH_E;
+ }
+ }
+
+ /* do as many blocks as possible */
+ if (blocks > 0) {
+        /* a local pointer is needed because the assembly below increments
+         * it, which would otherwise break the later call that handles
+         * encrypt/decrypt leftovers */
+ byte* keyPt = (byte*)aes->key;
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v14.16b, #1 \n"
+ "USHR v14.2d, v14.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v13.16b, v13.16b, v13.16b \n"
+ "EXT v14.16b, v14.16b, v13.16b, #8 \n"
+
+ "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n"
+ "LD1 {v12.2d}, [%[ctr]] \n"
+ "LD1 {v13.2d}, [%[input]], #16 \n"
+
+ "1: \n"
+ "REV64 v12.16b, v12.16b \n" /* network order */
+ "EXT v12.16b, v12.16b, v12.16b, #8 \n"
+ "ADD v12.2d, v12.2d, v14.2d \n" /* add 1 to counter */
+ "EXT v12.16b, v12.16b, v12.16b, #8 \n"
+ "REV64 v12.16b, v12.16b \n" /* revert from network order */
+ "MOV v0.16b, v12.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "EOR v0.16b, v0.16b, v11.16b \n"
+
+ "EOR v0.16b, v0.16b, v13.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v13.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v12.16b}, [%[ctrOut]] \n"
+
+ :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+ :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (c)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+ );
+ break;
+#endif
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v16.16b, #1 \n"
+ "USHR v16.2d, v16.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v14.16b, v14.16b, v14.16b \n"
+ "EXT v16.16b, v16.16b, v14.16b, #8 \n"
+
+ "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n"
+ "LD1 {v13.2d}, [%[Key]], #16 \n"
+ "LD1 {v14.2d}, [%[ctr]] \n"
+ "LD1 {v15.2d}, [%[input]], #16 \n"
+
+ "1: \n"
+ "REV64 v14.16b, v14.16b \n" /* network order */
+ "EXT v14.16b, v14.16b, v14.16b, #8 \n"
+ "ADD v14.2d, v14.2d, v16.2d \n" /* add 1 to counter */
+ "EXT v14.16b, v14.16b, v14.16b, #8 \n"
+ "REV64 v14.16b, v14.16b \n" /* revert from network order */
+ "MOV v0.16b, v14.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "EOR v0.16b, v0.16b, v13.16b \n"
+
+ "EOR v0.16b, v0.16b, v15.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v15.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v14.2d}, [%[ctrOut]] \n"
+
+ :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+ :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (c)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV w11, %w[blocks] \n"
+ "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+ "#Create vector with the value 1 \n"
+ "MOVI v18.16b, #1 \n"
+ "USHR v18.2d, v18.2d, #56 \n"
+ "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+ "EOR v19.16b, v19.16b, v19.16b \n"
+ "EXT v18.16b, v18.16b, v19.16b, #8 \n"
+
+ "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n"
+ "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+ "LD1 {v17.2d}, [%[ctr]] \n"
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+
+ "1: \n"
+ "REV64 v17.16b, v17.16b \n" /* network order */
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+ "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+ "REV64 v17.16b, v17.16b \n" /* revert from network order */
+ "MOV v0.16b, v17.16b \n"
+ "AESE v0.16b, v1.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v2.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v3.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v4.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "SUB w11, w11, #1 \n"
+ "AESE v0.16b, v5.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v6.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v7.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v8.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v9.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v10.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v11.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v12.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v13.16b \n"
+ "AESMC v0.16b, v0.16b \n"
+ "AESE v0.16b, v14.16b \n"
+ "EOR v0.16b, v0.16b, v15.16b \n"
+
+ "EOR v0.16b, v0.16b, v16.16b \n"
+ "ST1 {v0.2d}, [%[out]], #16 \n"
+
+ "CBZ w11, 2f \n"
+ "LD1 {v16.2d}, [%[input]], #16 \n"
+ "B 1b \n"
+
+ "2: \n"
+ "#store current counter value at the end \n"
+ "ST1 {v17.2d}, [%[ctrOut]] \n"
+
+ :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+ :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+ [input] "3" (c)
+ : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-GCM round value");
+ return BAD_FUNC_ARG;
+ }
+ }
+ if (partial != 0) {
+ IncrementGcmCounter(ctr);
+ wc_AesEncrypt(aes, ctr, scratch);
+
+ /* check if pointer is null after main AES-GCM blocks
+ * helps static analysis */
+ if (p == NULL || c == NULL) {
+ return BAD_STATE_E;
+ }
+ xorbuf(scratch, c, partial);
+ XMEMCPY(p, scratch, partial);
+ }
+ return 0;
+}
+
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+
+/***************************************
+ * Not 64-bit, so use 32-bit mode
+ ***************************************/
+#else
+
+/* AES CCM/GCM use encrypt direct but not decrypt */
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
+ defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+ static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+ {
+ /*
+        AESE XORs the input with the round key, then
+        shifts rows of the XORed result and
+        substitutes bytes on the shifted rows
+ */
+
+ word32* keyPt = aes->key;
+ __asm__ __volatile__ (
+ "VLD1.32 {q0}, [%[CtrIn]] \n"
+ "VLDM %[Key]!, {q1-q4} \n"
+
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2\n"
+
+ "MOV r12, %[R] \n"
+ "CMP r12, #10 \n"
+ "BEQ 1f \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2\n"
+
+ "CMP r12, #12 \n"
+ "BEQ 1f \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2\n"
+
+ "#Final AddRoundKey then store result \n"
+ "1: \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VEOR.32 q0, q0, q1\n"
+ "VST1.32 {q0}, [%[CtrOut]] \n"
+
+ :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+ "=r" (inBlock)
+ :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+ [CtrIn] "3" (inBlock)
+ : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4"
+ );
+
+ return 0;
+ }
+#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+ #ifdef HAVE_AES_DECRYPT
+ static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+ {
+ /*
+        AESD XORs the input with the round key, then applies
+        inverse shift rows to the result and
+        inverse sub bytes to the shifted rows
+ */
+
+ word32* keyPt = aes->key;
+ __asm__ __volatile__ (
+ "VLD1.32 {q0}, [%[CtrIn]] \n"
+ "VLDM %[Key]!, {q1-q4} \n"
+
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESD.8 q0, q3\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESD.8 q0, q4\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "AESD.8 q0, q2\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q3\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESD.8 q0, q4\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+
+ "MOV r12, %[R] \n"
+ "CMP r12, #10 \n"
+ "BEQ 1f \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+
+ "CMP r12, #12 \n"
+ "BEQ 1f \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+
+ "#Final AddRoundKey then store result \n"
+ "1: \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VEOR.32 q0, q0, q1\n"
+ "VST1.32 {q0}, [%[CtrOut]] \n"
+
+ :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+ "=r" (inBlock)
+ :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+ [CtrIn] "3" (inBlock)
+ : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4"
+ );
+
+ return 0;
+}
+ #endif /* HAVE_AES_DECRYPT */
+#endif /* DIRECT or COUNTER */
+
+/* AES-CBC */
+#ifdef HAVE_AES_CBC
+ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+ if (aes == NULL || out == NULL || (in == NULL && sz > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* do as many block size ops as possible */
+ if (numBlocks > 0) {
+ word32* keyPt = aes->key;
+ word32* regPt = aes->reg;
+ /*
+            AESE XORs the input with the round key, then
+            shifts rows of the XORed result and
+            substitutes bytes on the shifted rows
+
+ note: grouping AESE & AESMC together as pairs reduces latency
+ */
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q0}, [%[reg]] \n"
+ "VLD1.32 {q12}, [%[input]]!\n"
+
+ "1:\n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "VEOR.32 q0, q0, q12 \n"
+ "AESE.8 q0, q1 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q3 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q4 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q5 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q6 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q7 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10\n"
+ "VEOR.32 q0, q0, q11 \n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter value at the end \n"
+ "VST1.32 {q0}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt)
+ :"0" (out), [Key] "r" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q0}, [%[reg]] \n"
+ "VLD1.32 {q12}, [%[input]]!\n"
+ "VLD1.32 {q13}, [%[Key]]! \n"
+ "VLD1.32 {q14}, [%[Key]]! \n"
+
+ "1:\n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "VEOR.32 q0, q0, q12 \n"
+ "AESE.8 q0, q1 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q3 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q4 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q5 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q6 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q7 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q11 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q13\n"
+ "VEOR.32 q0, q0, q14 \n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q0}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt)
+ :"0" (out), [Key] "r" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q0}, [%[reg]] \n"
+ "VLD1.32 {q12}, [%[input]]!\n"
+ "VLD1.32 {q13}, [%[Key]]! \n"
+ "VLD1.32 {q14}, [%[Key]]! \n"
+
+ "1:\n"
+ "#CBC operations, xorbuf in with current aes->reg \n"
+ "VEOR.32 q0, q0, q12 \n"
+ "AESE.8 q0, q1 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q2 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q3 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q4 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q5 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q6 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q7 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q11 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q13 \n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q15}, [%[Key]]! \n"
+ "AESE.8 q0, q14 \n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q15\n"
+ "VLD1.32 {q15}, [%[Key]] \n"
+ "VEOR.32 q0, q0, q15 \n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "SUB %[Key], %[Key], #16 \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q0}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt)
+ :"0" (out), [Key] "2" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CBC round value");
+ return BAD_FUNC_ARG;
+ }
+ }
+
+ return 0;
+ }
+
+ #ifdef HAVE_AES_DECRYPT
+ int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+ if (aes == NULL || out == NULL || (in == NULL && sz > 0)
+ || sz % AES_BLOCK_SIZE != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* do as many block size ops as possible */
+ if (numBlocks > 0) {
+ word32* keyPt = aes->key;
+ word32* regPt = aes->reg;
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q13}, [%[reg]] \n"
+ "VLD1.32 {q0}, [%[input]]!\n"
+
+ "1:\n"
+ "VMOV.32 q12, q0 \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q3\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q4\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q5\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q6\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q7\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q8\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q9\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q10\n"
+ "VEOR.32 q0, q0, q11\n"
+
+ "VEOR.32 q0, q0, q13\n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VMOV.32 q13, q12 \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q0}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2: \n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q13}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt)
+ :"0" (out), [Key] "r" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "VLD1.32 {q13}, [%[Key]]! \n"
+ "VLD1.32 {q14}, [%[reg]] \n"
+ "VLD1.32 {q0}, [%[input]]!\n"
+
+ "1: \n"
+ "VMOV.32 q15, q0 \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q3\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q4\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q5\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q6\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q7\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q8\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q9\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q10\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q11\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q12\n"
+ "VEOR.32 q0, q0, q13\n"
+
+ "VEOR.32 q0, q0, q14\n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VMOV.32 q14, q15 \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q0}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter value at the end \n"
+ "VST1.32 {q15}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt)
+ :"0" (out), [Key] "r" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLD1.32 {q1}, [%[Key]]! \n"
+ "VLD1.32 {q2}, [%[Key]]! \n"
+ "VLD1.32 {q3}, [%[Key]]! \n"
+ "VLD1.32 {q4}, [%[Key]]! \n"
+ "VLD1.32 {q5}, [%[Key]]! \n"
+ "VLD1.32 {q6}, [%[Key]]! \n"
+ "VLD1.32 {q7}, [%[Key]]! \n"
+ "VLD1.32 {q8}, [%[Key]]! \n"
+ "VLD1.32 {q9}, [%[Key]]! \n"
+ "VLD1.32 {q10}, [%[Key]]! \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "VLD1.32 {q14}, [%[reg]] \n"
+ "VLD1.32 {q0}, [%[input]]!\n"
+
+ "1:\n"
+ "VMOV.32 q15, q0 \n"
+ "AESD.8 q0, q1\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q2\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q3\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q4\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q5\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q6\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q7\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q8\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q9\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q10\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q11\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q13}, [%[Key]]! \n"
+ "AESD.8 q0, q12\n"
+ "AESIMC.8 q0, q0\n"
+ "AESD.8 q0, q13\n"
+ "AESIMC.8 q0, q0\n"
+ "VLD1.32 {q13}, [%[Key]]! \n"
+ "AESD.8 q0, q13\n"
+ "VLD1.32 {q13}, [%[Key]] \n"
+ "VEOR.32 q0, q0, q13\n"
+ "SUB %[Key], %[Key], #32 \n"
+
+ "VEOR.32 q0, q0, q14\n"
+ "SUB r11, r11, #1 \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VMOV.32 q14, q15 \n"
+
+ "CMP r11, #0 \n"
+ "BEQ 2f \n"
+ "VLD1.32 {q0}, [%[input]]! \n"
+ "B 1b \n"
+
+ "2:\n"
+ "#store current counter value at the end \n"
+ "VST1.32 {q15}, [%[regOut]] \n"
+
+ :[out] "=r" (out), [regOut] "=r" (regPt)
+ :"0" (out), [Key] "r" (keyPt), [input] "r" (in),
+ [blocks] "r" (numBlocks), [reg] "1" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CBC round value");
+ return BAD_FUNC_ARG;
+ }
+ }
+
+ return 0;
+ }
+ #endif
+
+#endif /* HAVE_AES_CBC */
+
+/* AES-CTR */
+#ifdef WOLFSSL_AES_COUNTER
+
+ /* Increment AES counter */
+ static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+ {
+ int i;
+
+ /* in network byte order so start at end and work back */
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+ }
+
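+    /* CTR mode acts as a stream cipher, so calls do not need to be block
+     * aligned: leftover keystream bytes are kept in aes->tmp and consumed
+     * first on the next call. Illustrative sketch (buffer names are
+     * hypothetical):
+     *
+     *     wc_AesSetKey(&aes, key, 16, iv, AES_ENCRYPTION);
+     *     wc_AesCtrEncrypt(&aes, out, in, 5);
+     *     wc_AesCtrEncrypt(&aes, out + 5, in + 5, inSz - 5);
+     */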
+ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+ {
+ byte* tmp;
+ word32 numBlocks;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+
+ /* consume any unused bytes left in aes->tmp */
+ while (aes->left && sz) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ sz--;
+ }
+
+ /* do as many block size ops as possible */
+ numBlocks = sz/AES_BLOCK_SIZE;
+ if (numBlocks > 0) {
+            /* local pointers are needed because the assembly below
+             * increments them, which would otherwise break the later
+             * handling of the leftover bytes */
+ word32* keyPt = aes->key;
+ word32* regPt = aes->reg;
+ sz -= numBlocks * AES_BLOCK_SIZE;
+ switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+ case 10: /* AES 128 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLDM %[Key]!, {q1-q4} \n"
+
+ "#Create vector with the value 1 \n"
+ "VMOV.u32 q15, #1 \n"
+ "VSHR.u64 q15, q15, #32 \n"
+ "VLDM %[Key]!, {q5-q8} \n"
+ "VEOR.32 q14, q14, q14 \n"
+ "VLDM %[Key]!, {q9-q11} \n"
+ "VEXT.8 q14, q15, q14, #8\n"
+
+ "VLD1.32 {q13}, [%[reg]]\n"
+
+ /* double block */
+ "1: \n"
+ "CMP r11, #1 \n"
+ "BEQ 2f \n"
+ "CMP r11, #0 \n"
+ "BEQ 3f \n"
+
+ "VMOV.32 q0, q13 \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "SUB r11, r11, #2 \n"
+ "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+ "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q15, q15, q15, #8 \n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q15, q15\n" /* revert from network order */
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q1\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q2\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q3\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q4\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q5\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q10\n"
+ "AESE.8 q15, q6\n"
+ "AESMC.8 q15, q15\n"
+ "VEOR.32 q0, q0, q11\n"
+
+ "AESE.8 q15, q7\n"
+ "AESMC.8 q15, q15\n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "AESE.8 q15, q8\n"
+ "AESMC.8 q15, q15\n"
+
+ "VEOR.32 q0, q0, q12\n"
+ "AESE.8 q15, q9\n"
+ "AESMC.8 q15, q15\n"
+
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "AESE.8 q15, q10\n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VEOR.32 q15, q15, q11\n"
+ "VEOR.32 q15, q15, q12\n"
+ "VST1.32 {q15}, [%[out]]! \n"
+
+ "B 1b \n"
+
+ /* single block */
+ "2: \n"
+ "VMOV.32 q0, q13 \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "SUB r11, r11, #1 \n"
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10\n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VEOR.32 q0, q0, q11\n"
+ "#CTR operations, increment counter and xorbuf \n"
+ "VEOR.32 q0, q0, q12\n"
+ "VST1.32 {q0}, [%[out]]! \n"
+
+ "3: \n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q13}, [%[regOut]] \n"
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "2" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15"
+ );
+ break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+ case 12: /* AES 192 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLDM %[Key]!, {q1-q4} \n"
+
+ "#Create vector with the value 1 \n"
+ "VMOV.u32 q15, #1 \n"
+ "VSHR.u64 q15, q15, #32 \n"
+ "VLDM %[Key]!, {q5-q8} \n"
+ "VEOR.32 q14, q14, q14 \n"
+ "VEXT.8 q14, q15, q14, #8\n"
+
+ "VLDM %[Key]!, {q9-q10} \n"
+ "VLD1.32 {q13}, [%[reg]]\n"
+
+ /* double block */
+ "1: \n"
+ "CMP r11, #1 \n"
+ "BEQ 2f \n"
+ "CMP r11, #0 \n"
+ "BEQ 3f \n"
+
+ "VMOV.32 q0, q13\n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "SUB r11, r11, #2 \n"
+ "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+ "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q15, q15, q15, #8 \n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q15, q15\n" /* revert from network order */
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q1\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q2\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q3\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q4\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q5\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q10\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "AESE.8 q15, q6\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q11\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q7\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q15, q8\n"
+ "AESMC.8 q15, q15\n"
+
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q15, q9\n"
+ "AESMC.8 q15, q15\n"
+ "AESE.8 q15, q10\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q15, q11\n"
+ "AESMC.8 q15, q15\n"
+ "VLD1.32 {q11}, [%[Key]] \n"
+ "AESE.8 q0, q12\n"
+ "AESE.8 q15, q12\n"
+
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VEOR.32 q0, q0, q11\n"
+ "VEOR.32 q15, q15, q11\n"
+ "VEOR.32 q0, q0, q12\n"
+
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VEOR.32 q15, q15, q12\n"
+ "VST1.32 {q15}, [%[out]]! \n"
+ "SUB %[Key], %[Key], #32 \n"
+
+ "B 1b \n"
+
+
+ /* single block */
+ "2: \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VMOV.32 q0, q13 \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "SUB r11, r11, #1 \n"
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q0, q11\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q11}, [%[Key]] \n"
+ "AESE.8 q0, q12\n"
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VEOR.32 q0, q0, q11\n"
+ "#CTR operations, increment counter and xorbuf \n"
+ "VEOR.32 q0, q0, q12\n"
+ "VST1.32 {q0}, [%[out]]! \n"
+
+ "3: \n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q13}, [%[regOut]] \n"
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "2" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14"
+ );
+ break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+ case 14: /* AES 256 BLOCK */
+ __asm__ __volatile__ (
+ "MOV r11, %[blocks] \n"
+ "VLDM %[Key]!, {q1-q4} \n"
+
+ "#Create vector with the value 1 \n"
+ "VMOV.u32 q15, #1 \n"
+ "VSHR.u64 q15, q15, #32 \n"
+ "VLDM %[Key]!, {q5-q8} \n"
+ "VEOR.32 q14, q14, q14 \n"
+ "VEXT.8 q14, q15, q14, #8\n"
+
+ "VLDM %[Key]!, {q9-q10} \n"
+ "VLD1.32 {q13}, [%[reg]]\n"
+
+ /* double block */
+ "1: \n"
+ "CMP r11, #1 \n"
+ "BEQ 2f \n"
+ "CMP r11, #0 \n"
+ "BEQ 3f \n"
+
+ "VMOV.32 q0, q13 \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "SUB r11, r11, #2 \n"
+ "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+ "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q15, q15, q15, #8 \n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q15, q15\n" /* revert from network order */
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q15, q1\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q2\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q3\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q4\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q5\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q10\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "AESE.8 q15, q6\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q0, q11\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q7\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q15, q8\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q15, q9\n"
+ "AESMC.8 q15, q15\n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q15, q10\n"
+ "AESMC.8 q15, q15\n"
+
+ "AESE.8 q15, q11\n"
+ "AESMC.8 q15, q15\n"
+
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "AESE.8 q0, q12\n" /* rnd 12*/
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q12\n" /* rnd 12 */
+ "AESMC.8 q15, q15\n"
+
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q0, q11\n" /* rnd 13 */
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q15, q11\n" /* rnd 13 */
+ "AESMC.8 q15, q15\n"
+
+ "VLD1.32 {q11}, [%[Key]] \n"
+ "AESE.8 q0, q12\n" /* rnd 14 */
+ "AESE.8 q15, q12\n" /* rnd 14 */
+
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VEOR.32 q0, q0, q11\n" /* rnd 15 */
+ "VEOR.32 q15, q15, q11\n" /* rnd 15 */
+ "VEOR.32 q0, q0, q12\n"
+
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VST1.32 {q0}, [%[out]]! \n"
+ "VEOR.32 q15, q15, q12\n"
+ "VST1.32 {q15}, [%[out]]! \n"
+ "SUB %[Key], %[Key], #64 \n"
+
+ /* single block */
+ "B 1b \n"
+
+ "2: \n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "VMOV.32 q0, q13 \n"
+ "AESE.8 q0, q1\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13 \n" /* network order */
+ "AESE.8 q0, q2\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q3\n"
+ "AESMC.8 q0, q0\n"
+ "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+ "AESE.8 q0, q4\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q5\n"
+ "AESMC.8 q0, q0\n"
+ "VEXT.8 q13, q13, q13, #8 \n"
+ "AESE.8 q0, q6\n"
+ "AESMC.8 q0, q0\n"
+ "VREV64.8 q13, q13\n" /* revert from network order */
+ "AESE.8 q0, q7\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q8\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q9\n"
+ "AESMC.8 q0, q0\n"
+ "AESE.8 q0, q10\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q0, q11\n"
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q11}, [%[Key]]! \n"
+ "AESE.8 q0, q12\n" /* rnd 12 */
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q12}, [%[Key]]! \n"
+ "AESE.8 q0, q11\n" /* rnd 13 */
+ "AESMC.8 q0, q0\n"
+ "VLD1.32 {q11}, [%[Key]] \n"
+ "AESE.8 q0, q12\n" /* rnd 14 */
+ "VLD1.32 {q12}, [%[input]]! \n"
+ "VEOR.32 q0, q0, q11\n" /* rnd 15 */
+ "#CTR operations, increment counter and xorbuf \n"
+ "VEOR.32 q0, q0, q12\n"
+ "VST1.32 {q0}, [%[out]]! \n"
+
+ "3: \n"
+ "#store current counter qalue at the end \n"
+ "VST1.32 {q13}, [%[regOut]] \n"
+
+ :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+ "=r" (in)
+ :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+ [blocks] "r" (numBlocks), [reg] "2" (regPt)
+ : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14"
+ );
+ break;
+#endif /* WOLFSSL_AES_256 */
+ default:
+ WOLFSSL_MSG("Bad AES-CTR round qalue");
+ return BAD_FUNC_ARG;
+ }
+
+ aes->left = 0;
+ }
+
+ /* handle non block size remaining */
+ if (sz) {
+ wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp);
+ IncrementAesCounter((byte*)aes->reg);
+
+ aes->left = AES_BLOCK_SIZE;
+ tmp = (byte*)aes->tmp;
+
+ while (sz--) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ }
+ }
+
+ return 0;
+ }
+
+#endif /* WOLFSSL_AES_COUNTER */
+
+#ifdef HAVE_AESGCM
+/*
+ * Uses Karatsuba algorithm. Reduction algorithm is based on "Implementing GCM
+ * on ARMv8". Shifting left to account for bit reflection is based on
+ * "Carry-Less Multiplication and Its Usage for Computing the GCM mode"
+ */
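+/* Karatsuba sketch for the carry-less multiply below: writing
+ * X = x1*2^64 + x0 and Y = y1*2^64 + y0, with XOR as addition in GF(2)[x],
+ *
+ *     X*Y = x1*y1*2^128
+ *         ^ ((x0^x1)*(y0^y1) ^ x1*y1 ^ x0*y0)*2^64
+ *         ^ x0*y0
+ *
+ * so three VMULL.p64 products replace four. */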
+static void GMULT(byte* X, byte* Y)
+{
+ __asm__ __volatile__ (
+ "VLD1.32 {q0}, [%[x]] \n"
+
+    /* In GCM format the bits are big endian; swap the byte locations so
+     * that the logical shifts and carries below behave as expected.
+ */
+ "VREV64.8 q0, q0 \n"
+ "VLD1.32 {q1}, [%[y]] \n" /* converted on set key */
+ "VSWP.8 d0, d1 \n"
+
+ "VMULL.p64 q5, d0, d2 \n"
+ "VMULL.p64 q6, d1, d3 \n"
+ "VEOR d15, d2, d3 \n"
+ "VEOR d14, d0, d1 \n"
+ "VMULL.p64 q7, d15, d14 \n"
+ "VEOR q7, q5 \n"
+ "VEOR q7, q6 \n"
+ "VEOR d11, d14 \n"
+ "VEOR d12, d15\n"
+
+ /* shift to left by 1 to account for reflection */
+ "VMOV q7, q6 \n"
+ "VSHL.u64 q6, q6, #1 \n"
+ "VSHR.u64 q7, q7, #63 \n"
+ "VEOR d13, d14 \n"
+ "VMOV q8, q5 \n"
+ "VSHL.u64 q5, q5, #1 \n"
+ "VSHR.u64 q8, q8, #63 \n"
+ "VEOR d12, d17 \n"
+ "VEOR d11, d16 \n"
+
+ /* create constant 0xc200000000000000 */
+ "VMOV.i32 d16, 0xc2000000 \n"
+ "VSHL.u64 d16, d16, #32 \n"
+
+ /* reduce product of multiplication */
+ "VMULL.p64 q9, d10, d16 \n"
+ "VEOR d11, d18 \n"
+ "VEOR d12, d19 \n"
+ "VMULL.p64 q9, d11, d16 \n"
+ "VEOR q6, q9 \n"
+ "VEOR q10, q5, q6 \n"
+
+ /* convert to GCM format */
+ "VREV64.8 q10, q10 \n"
+ "VSWP.8 d20, d21 \n"
+
+ "VST1.32 {q10}, [%[xOut]] \n"
+
+ : [xOut] "=r" (X), [yOut] "=r" (Y)
+ : [x] "0" (X), [y] "1" (Y)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6" ,"q7", "q8",
+ "q9", "q10", "q11" ,"q12", "q13", "q14", "q15"
+ );
+}
+
+
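+/* s = GHASH(A, C): each 16-byte block of the additional data A and the
+ * ciphertext C (zero padded when partial) is XORed into a running value x,
+ * which is then multiplied by H with GMULT; a final block holding the bit
+ * lengths of A and C is hashed in last and the result copied to s. */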
+void GHASH(Aes* aes, const byte* a, word32 aSz,
+ const byte* c, word32 cSz, byte* s, word32 sSz)
+{
+ byte x[AES_BLOCK_SIZE];
+ byte scratch[AES_BLOCK_SIZE];
+ word32 blocks, partial;
+ byte* h = aes->H;
+
+ XMEMSET(x, 0, AES_BLOCK_SIZE);
+
+ /* Hash in A, the Additional Authentication Data */
+ if (aSz != 0 && a != NULL) {
+ blocks = aSz / AES_BLOCK_SIZE;
+ partial = aSz % AES_BLOCK_SIZE;
+ while (blocks--) {
+ xorbuf(x, a, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ a += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, a, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ }
+ }
+
+ /* Hash in C, the Ciphertext */
+ if (cSz != 0 && c != NULL) {
+ blocks = cSz / AES_BLOCK_SIZE;
+ partial = cSz % AES_BLOCK_SIZE;
+ while (blocks--) {
+ xorbuf(x, c, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ c += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ XMEMCPY(scratch, c, partial);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, h);
+ }
+ }
+
+ /* Hash in the lengths of A and C in bits */
+ FlattenSzInBits(&scratch[0], aSz);
+ FlattenSzInBits(&scratch[8], cSz);
+ xorbuf(x, scratch, AES_BLOCK_SIZE);
+ GMULT(x, h);
+
+ /* Copy the result into s. */
+ XMEMCPY(s, x, sSz);
+}
+
+
+/* Aarch32
+ * Encrypt and tag data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: encrypted data output buffer
+ * in: plain text input buffer
+ * sz: size of plain text and out buffer
+ * iv: initialization vector
+ * ivSz: size of iv buffer
+ * authTag: buffer to hold tag
+ * authTagSz: size of tag buffer
+ * authIn: additional data buffer
+ * authInSz: size of additional data buffer
+ */
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks = sz / AES_BLOCK_SIZE;
+ word32 partial = sz % AES_BLOCK_SIZE;
+ const byte* p = in;
+ byte* c = out;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte *ctr ;
+ byte scratch[AES_BLOCK_SIZE];
+ ctr = counter ;
+
+ /* sanity checks */
+ if (aes == NULL || (iv == NULL && ivSz > 0) ||
+ (authTag == NULL) ||
+ (authIn == NULL && authInSz > 0) ||
+ (in == NULL && sz > 0) ||
+ (out == NULL && sz > 0)) {
+ WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+ return BAD_FUNC_ARG;
+ }
+
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) {
+ WOLFSSL_MSG("GcmEncrypt authTagSz error");
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ }
+ XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
+
+ while (blocks--) {
+ IncrementGcmCounter(ctr);
+ wc_AesEncrypt(aes, ctr, scratch);
+ xorbuf(scratch, p, AES_BLOCK_SIZE);
+ XMEMCPY(c, scratch, AES_BLOCK_SIZE);
+ p += AES_BLOCK_SIZE;
+ c += AES_BLOCK_SIZE;
+ }
+
+ if (partial != 0) {
+ IncrementGcmCounter(ctr);
+ wc_AesEncrypt(aes, ctr, scratch);
+ xorbuf(scratch, p, partial);
+ XMEMCPY(c, scratch, partial);
+
+ }
+
+ GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+ wc_AesEncrypt(aes, initialCounter, scratch);
+ if (authTagSz > AES_BLOCK_SIZE) {
+ xorbuf(authTag, scratch, AES_BLOCK_SIZE);
+ }
+ else {
+ xorbuf(authTag, scratch, authTagSz);
+ }
+
+ return 0;
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/*
+ * Check tag and decrypt data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: decrypted data output buffer
+ * in: cipher text buffer
+ * sz: size of cipher text and out buffer
+ * iv: initialization vector
+ * ivSz: size of iv buffer
+ * authTag: buffer holding tag
+ * authTagSz: size of tag buffer
+ * authIn: additional data buffer
+ * authInSz: size of additional data buffer
+ */
+int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ word32 blocks = sz / AES_BLOCK_SIZE;
+ word32 partial = sz % AES_BLOCK_SIZE;
+ const byte* c = in;
+ byte* p = out;
+ byte counter[AES_BLOCK_SIZE];
+ byte initialCounter[AES_BLOCK_SIZE];
+ byte *ctr ;
+ byte scratch[AES_BLOCK_SIZE];
+ ctr = counter ;
+
+ /* sanity checks */
+ if (aes == NULL || (iv == NULL && ivSz > 0) ||
+ (authTag == NULL) ||
+ (authIn == NULL && authInSz > 0) ||
+ (in == NULL && sz > 0) ||
+ (out == NULL && sz > 0)) {
+ WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMCPY(initialCounter, iv, ivSz);
+ initialCounter[AES_BLOCK_SIZE - 1] = 1;
+ }
+ else {
+ GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+ }
+ XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
+
+ /* Calculate the authTag again using the received auth data and the
+ * cipher text. */
+ {
+ byte Tprime[AES_BLOCK_SIZE];
+ byte EKY0[AES_BLOCK_SIZE];
+
+ GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime));
+ wc_AesEncrypt(aes, ctr, EKY0);
+ xorbuf(Tprime, EKY0, sizeof(Tprime));
+
+ if (ConstantCompare(authTag, Tprime, authTagSz) != 0) {
+ return AES_GCM_AUTH_E;
+ }
+ }
+
+ while (blocks--) {
+ IncrementGcmCounter(ctr);
+ wc_AesEncrypt(aes, ctr, scratch);
+ xorbuf(scratch, c, AES_BLOCK_SIZE);
+ XMEMCPY(p, scratch, AES_BLOCK_SIZE);
+ p += AES_BLOCK_SIZE;
+ c += AES_BLOCK_SIZE;
+ }
+ if (partial != 0) {
+ IncrementGcmCounter(ctr);
+ wc_AesEncrypt(aes, ctr, scratch);
+
+ /* check if pointer is null after main AES-GCM blocks
+ * helps static analysis */
+ if (p == NULL || c == NULL) {
+ return BAD_STATE_E;
+ }
+ xorbuf(scratch, c, partial);
+ XMEMCPY(p, scratch, partial);
+ }
+ return 0;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+#endif /* aarch64 */
+
+
+#ifdef HAVE_AESCCM
+/* Software version of AES-CCM from wolfcrypt/src/aes.c.
+ * Gets some speedup from the hardware-accelerated wc_AesEncrypt. */
+
+static void roll_x(Aes* aes, const byte* in, word32 inSz, byte* out)
+{
+ /* process the bulk of the data */
+ while (inSz >= AES_BLOCK_SIZE) {
+ xorbuf(out, in, AES_BLOCK_SIZE);
+ in += AES_BLOCK_SIZE;
+ inSz -= AES_BLOCK_SIZE;
+
+ wc_AesEncrypt(aes, out, out);
+ }
+
+ /* process remainder of the data */
+ if (inSz > 0) {
+ xorbuf(out, in, inSz);
+ wc_AesEncrypt(aes, out, out);
+ }
+}
+
+
+static void roll_auth(Aes* aes, const byte* in, word32 inSz, byte* out)
+{
+ word32 authLenSz;
+ word32 remainder;
+
+ /* encode the length in */
+ if (inSz <= 0xFEFF) {
+ authLenSz = 2;
+ out[0] ^= ((inSz & 0xFF00) >> 8);
+ out[1] ^= (inSz & 0x00FF);
+ }
+ else if (inSz <= 0xFFFFFFFF) {
+ authLenSz = 6;
+ out[0] ^= 0xFF; out[1] ^= 0xFE;
+ out[2] ^= ((inSz & 0xFF000000) >> 24);
+ out[3] ^= ((inSz & 0x00FF0000) >> 16);
+ out[4] ^= ((inSz & 0x0000FF00) >> 8);
+ out[5] ^= (inSz & 0x000000FF);
+ }
+ /* Note, the protocol handles auth data up to 2^64, but we are
+ * using 32-bit sizes right now, so the bigger data isn't handled
+ * else if (inSz <= 0xFFFFFFFFFFFFFFFF) {} */
+ else
+ return;
+
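+    /* e.g. inSz = 0x1234 (<= 0xFEFF) XORs the two bytes 0x12, 0x34 into
+     * out[0..1]; larger sizes use the six-byte 0xFF 0xFE prefixed form,
+     * matching the CCM length encoding of the additional data in RFC 3610 */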
+    /* start filling out the rest of the first block */
+ remainder = AES_BLOCK_SIZE - authLenSz;
+ if (inSz >= remainder) {
+ /* plenty of bulk data to fill the remainder of this block */
+ xorbuf(out + authLenSz, in, remainder);
+ inSz -= remainder;
+ in += remainder;
+ }
+ else {
+ /* not enough bulk data, copy what is available, and pad zero */
+ xorbuf(out + authLenSz, in, inSz);
+ inSz = 0;
+ }
+ wc_AesEncrypt(aes, out, out);
+
+ if (inSz > 0)
+ roll_x(aes, in, inSz, out);
+}
+
+
+static WC_INLINE void AesCcmCtrInc(byte* B, word32 lenSz)
+{
+ word32 i;
+
+ for (i = 0; i < lenSz; i++) {
+ if (++B[AES_BLOCK_SIZE - 1 - i] != 0) return;
+ }
+}
+
+
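+/* Example usage (an illustrative sketch; buffer names and sizes are
+ * hypothetical, error handling omitted). The nonce must be 7 to 13 bytes:
+ *
+ *     byte tag[AES_BLOCK_SIZE];
+ *     wc_AesSetKey(&aes, key, 16, NULL, AES_ENCRYPTION);
+ *     wc_AesCcmEncrypt(&aes, cipher, plain, plainSz,
+ *                      nonce, 13, tag, sizeof(tag), aad, aadSz);
+ */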
+/* return 0 on success */
+int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ byte A[AES_BLOCK_SIZE];
+ byte B[AES_BLOCK_SIZE];
+ byte lenSz;
+ word32 i;
+ byte mask = 0xFF;
+ word32 wordSz = (word32)sizeof(word32);
+
+ /* sanity check on arguments */
+ if (aes == NULL || out == NULL || in == NULL || nonce == NULL
+ || authTag == NULL || nonceSz < 7 || nonceSz > 13)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(B+1, nonce, nonceSz);
+ lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+ B[0] = (authInSz > 0 ? 64 : 0)
+ + (8 * (((byte)authTagSz - 2) / 2))
+ + (lenSz - 1);
+ for (i = 0; i < lenSz; i++) {
+ if (mask && i >= wordSz)
+ mask = 0x00;
+ B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+ }
+
+ wc_AesEncrypt(aes, B, A);
+
+ if (authInSz > 0)
+ roll_auth(aes, authIn, authInSz, A);
+ if (inSz > 0)
+ roll_x(aes, in, inSz, A);
+ XMEMCPY(authTag, A, authTagSz);
+
+ B[0] = lenSz - 1;
+ for (i = 0; i < lenSz; i++)
+ B[AES_BLOCK_SIZE - 1 - i] = 0;
+ wc_AesEncrypt(aes, B, A);
+ xorbuf(authTag, A, authTagSz);
+
+ B[15] = 1;
+ while (inSz >= AES_BLOCK_SIZE) {
+ wc_AesEncrypt(aes, B, A);
+ xorbuf(A, in, AES_BLOCK_SIZE);
+ XMEMCPY(out, A, AES_BLOCK_SIZE);
+
+ AesCcmCtrInc(B, lenSz);
+ inSz -= AES_BLOCK_SIZE;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ if (inSz > 0) {
+ wc_AesEncrypt(aes, B, A);
+ xorbuf(A, in, inSz);
+ XMEMCPY(out, A, inSz);
+ }
+
+ ForceZero(A, AES_BLOCK_SIZE);
+ ForceZero(B, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+#ifdef HAVE_AES_DECRYPT
+int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ byte A[AES_BLOCK_SIZE];
+ byte B[AES_BLOCK_SIZE];
+ byte* o;
+ byte lenSz;
+ word32 i, oSz;
+ int result = 0;
+ byte mask = 0xFF;
+ word32 wordSz = (word32)sizeof(word32);
+
+ /* sanity check on arguments */
+ if (aes == NULL || out == NULL || in == NULL || nonce == NULL
+ || authTag == NULL || nonceSz < 7 || nonceSz > 13)
+ return BAD_FUNC_ARG;
+
+ o = out;
+ oSz = inSz;
+ XMEMCPY(B+1, nonce, nonceSz);
+ lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+
+ B[0] = lenSz - 1;
+ for (i = 0; i < lenSz; i++)
+ B[AES_BLOCK_SIZE - 1 - i] = 0;
+ B[15] = 1;
+
+ while (oSz >= AES_BLOCK_SIZE) {
+ wc_AesEncrypt(aes, B, A);
+ xorbuf(A, in, AES_BLOCK_SIZE);
+ XMEMCPY(o, A, AES_BLOCK_SIZE);
+
+ AesCcmCtrInc(B, lenSz);
+ oSz -= AES_BLOCK_SIZE;
+ in += AES_BLOCK_SIZE;
+ o += AES_BLOCK_SIZE;
+ }
+ if (inSz > 0) {
+ wc_AesEncrypt(aes, B, A);
+ xorbuf(A, in, oSz);
+ XMEMCPY(o, A, oSz);
+ }
+
+ for (i = 0; i < lenSz; i++)
+ B[AES_BLOCK_SIZE - 1 - i] = 0;
+ wc_AesEncrypt(aes, B, A);
+
+ o = out;
+ oSz = inSz;
+
+ B[0] = (authInSz > 0 ? 64 : 0)
+ + (8 * (((byte)authTagSz - 2) / 2))
+ + (lenSz - 1);
+ for (i = 0; i < lenSz; i++) {
+ if (mask && i >= wordSz)
+ mask = 0x00;
+ B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+ }
+
+ wc_AesEncrypt(aes, B, A);
+
+ if (authInSz > 0)
+ roll_auth(aes, authIn, authInSz, A);
+ if (inSz > 0)
+ roll_x(aes, o, oSz, A);
+
+ B[0] = lenSz - 1;
+ for (i = 0; i < lenSz; i++)
+ B[AES_BLOCK_SIZE - 1 - i] = 0;
+ wc_AesEncrypt(aes, B, B);
+ xorbuf(A, B, authTagSz);
+
+ if (ConstantCompare(A, authTag, authTagSz) != 0) {
+ /* If the authTag check fails, don't keep the decrypted data.
+ * Unfortunately, you need the decrypted data to calculate the
+ * check value. */
+ XMEMSET(out, 0, inSz);
+ result = AES_CCM_AUTH_E;
+ }
+
+ ForceZero(A, AES_BLOCK_SIZE);
+ ForceZero(B, AES_BLOCK_SIZE);
+ o = NULL;
+
+ return result;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESCCM */
+
+
+
+#ifdef HAVE_AESGCM /* common GCM functions 32 and 64 bit */
+int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
+{
+ int ret;
+ byte iv[AES_BLOCK_SIZE];
+
+ if (!((len == 16) || (len == 24) || (len == 32)))
+ return BAD_FUNC_ARG;
+
+ XMEMSET(iv, 0, AES_BLOCK_SIZE);
+ ret = wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION);
+
+ if (ret == 0) {
+ wc_AesEncrypt(aes, iv, aes->H);
+ #if defined(__aarch64__)
+ {
+ word32* pt = (word32*)aes->H;
+ __asm__ volatile (
+ "LD1 {v0.16b}, [%[h]] \n"
+ "RBIT v0.16b, v0.16b \n"
+ "ST1 {v0.16b}, [%[out]] \n"
+ : [out] "=r" (pt)
+ : [h] "0" (pt)
+ : "cc", "memory", "v0"
+ );
+ }
+ #else
+ {
+ word32* pt = (word32*)aes->H;
+ __asm__ volatile (
+ "VLD1.32 {q0}, [%[h]] \n"
+ "VREV64.8 q0, q0 \n"
+ "VSWP.8 d0, d1 \n"
+ "VST1.32 {q0}, [%[out]] \n"
+ : [out] "=r" (pt)
+ : [h] "0" (pt)
+ : "cc", "memory", "q0"
+ );
+ }
+ #endif
+ }
+
+ return ret;
+}
+
+#endif /* HAVE_AESGCM */
+
+/* AES-DIRECT */
+#if defined(WOLFSSL_AES_DIRECT)
+ /* Allow direct access to one block encrypt */
+ void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+ {
+ if (aes == NULL || out == NULL || in == NULL) {
+ WOLFSSL_MSG("Invalid input to wc_AesEncryptDirect");
+ return;
+ }
+ wc_AesEncrypt(aes, in, out);
+ }
+ #ifdef HAVE_AES_DECRYPT
+ /* Allow direct access to one block decrypt */
+ void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+ {
+ if (aes == NULL || out == NULL || in == NULL) {
+ WOLFSSL_MSG("Invalid input to wc_AesDecryptDirect");
+ return;
+ }
+ wc_AesDecrypt(aes, in, out);
+ }
+ #endif /* HAVE_AES_DECRYPT */
+#endif /* WOLFSSL_AES_DIRECT */
+#endif /* !NO_AES && WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c
new file mode 100644
index 0000000..df76bec
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-chacha.c
@@ -0,0 +1,2857 @@
+/* armv8-chacha.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ */
+
+/* The optimizations in this file follow the paper "NEON crypto" by
+ * Daniel J. Bernstein and Peter Schwabe:
+ * https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#ifdef HAVE_CHACHA
+
+#include <wolfssl/wolfcrypt/chacha.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/cpuid.h>
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef CHACHA_AEAD_TEST
+ #include <stdio.h>
+#endif
+
+#ifdef CHACHA_TEST
+ #include <stdio.h>
+#endif
+
+#ifdef BIG_ENDIAN_ORDER
+ #define LITTLE32(x) ByteReverseWord32(x)
+#else
+ #define LITTLE32(x) (x)
+#endif
+
+/* Number of rounds */
+#define ROUNDS 20
+
+#define U32C(v) (v##U)
+#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF))
+#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0])
+
+#define PLUS(v,w) (U32V((v) + (w)))
+#define PLUSONE(v) (PLUS((v),1))
+
+#define ARM_SIMD_LEN_BYTES 16
+
+/**
+ * Set up the IV (nonce). Earlier versions used a 64-bit IV; this version
+ * uses the typical AEAD 96-bit nonce and supports record sizes of up to
+ * 256 GB.
+ */
+int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
+{
+ word32 temp[CHACHA_IV_WORDS]; /* ensures word-aligned access to the IV */
+
+#ifdef CHACHA_AEAD_TEST
+ word32 i;
+ printf("NONCE : ");
+ for (i = 0; i < CHACHA_IV_BYTES; i++) {
+ printf("%02x", inIv[i]);
+ }
+ printf("\n\n");
+#endif
+
+ if (ctx == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(temp, inIv, CHACHA_IV_BYTES);
+
+ ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */
+ ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* nonce word 0 */
+ ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* nonce word 1 */
+ ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* nonce word 2 */
+
+ return 0;
+}
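+
+/* Worked check of the 256 GB figure above: the 32-bit block counter allows
+ * 2^32 blocks of CHACHA_CHUNK_BYTES (64) bytes each, i.e.
+ * 2^32 * 2^6 = 2^38 bytes = 256 GB per (key, nonce) pair. */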
+
+/* "expand 32-byte k" as unsigned 32 byte */
+static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+/* "expand 16-byte k" as unsigned 16 byte */
+static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
+
+/**
+ * Key setup. Accepts a 16- or 32-byte key.
+ */
+int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
+{
+ const word32* constants;
+ const byte* k;
+
+#ifdef XSTREAM_ALIGN
+ word32 alignKey[8];
+#endif
+
+ if (ctx == NULL)
+ return BAD_FUNC_ARG;
+
+ if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ)
+ return BAD_FUNC_ARG;
+
+#ifdef XSTREAM_ALIGN
+ if ((wolfssl_word)key % 4) {
+ WOLFSSL_MSG("wc_ChachaSetKey unaligned key");
+ XMEMCPY(alignKey, key, keySz);
+ k = (byte*)alignKey;
+ }
+ else {
+ k = key;
+ }
+#else
+ k = key;
+#endif /* XSTREAM_ALIGN */
+
+#ifdef CHACHA_AEAD_TEST
+ word32 i;
+ printf("ChaCha key used :\n");
+ for (i = 0; i < keySz; i++) {
+ printf("%02x", key[i]);
+ if ((i + 1) % 8 == 0)
+ printf("\n");
+ }
+ printf("\n\n");
+#endif
+
+ ctx->X[4] = U8TO32_LITTLE(k + 0);
+ ctx->X[5] = U8TO32_LITTLE(k + 4);
+ ctx->X[6] = U8TO32_LITTLE(k + 8);
+ ctx->X[7] = U8TO32_LITTLE(k + 12);
+ if (keySz == CHACHA_MAX_KEY_SZ) {
+ k += 16;
+ constants = sigma;
+ }
+ else {
+ constants = tau;
+ }
+ ctx->X[ 8] = U8TO32_LITTLE(k + 0);
+ ctx->X[ 9] = U8TO32_LITTLE(k + 4);
+ ctx->X[10] = U8TO32_LITTLE(k + 8);
+ ctx->X[11] = U8TO32_LITTLE(k + 12);
+ ctx->X[ 0] = constants[0];
+ ctx->X[ 1] = constants[1];
+ ctx->X[ 2] = constants[2];
+ ctx->X[ 3] = constants[3];
+
+ return 0;
+}
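+
+/* Usage sketch combining the two setup routines above with
+ * wc_Chacha_Process() (declared in chacha.h); the key and nonce values are
+ * illustrative only. Kept under #if 0. */
+#if 0
+static int chacha_stream_example(byte* out, const byte* in, word32 sz)
+{
+ ChaCha ctx;
+ byte key[CHACHA_MAX_KEY_SZ] = { 0 }; /* 32-byte example key */
+ byte nonce[CHACHA_IV_BYTES] = { 0 }; /* 12-byte example nonce */
+ int ret = wc_Chacha_SetKey(&ctx, key, sizeof(key));
+
+ if (ret == 0)
+ ret = wc_Chacha_SetIV(&ctx, nonce, 0); /* start at block counter 0 */
+ if (ret == 0)
+ ret = wc_Chacha_Process(&ctx, out, in, sz);
+ return ret;
+}
+#endif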
+
+static const word32 L_chacha20_neon_inc_first_word[] = {
+ 0x1,
+ 0x0,
+ 0x0,
+ 0x0,
+};
+
+#ifdef __aarch64__
+
+static const word32 L_chacha20_neon_add_all_counters[] = {
+ 0x0,
+ 0x1,
+ 0x2,
+ 0x3,
+};
+
+static const word32 L_chacha20_neon_rol8[] = {
+ 0x2010003,
+ 0x6050407,
+ 0xa09080b,
+ 0xe0d0c0f,
+};
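+
+/* What the TBL look-ups below compute with this table: in each 32-bit
+ * little-endian lane, destination bytes (0,1,2,3) are taken from source
+ * bytes (3,0,1,2), which is a rotate left by 8 bits. Scalar sketch, kept
+ * under #if 0: */
+#if 0
+static word32 rotl8_sketch(word32 w)
+{
+ return (w << 8) | (w >> 24);
+}
+#endif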
+
+static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m, byte* c, word32 bytes)
+{
+#ifdef CHACHA_TEST
+ printf("Entering wc_Chacha_encrypt_320 with %d bytes\n", bytes);
+#endif /* CHACHA_TEST */
+ word64 bytes64 = (word64) bytes;
+ __asm__ __volatile__ (
+ /*
+ * The layout of the registers used is:
+ * ARM
+ * w4-w19: hold the fifth ChaCha block, computed in the general-purpose
+ * registers
+ * w20: loop counter for how many even-odd rounds remain
+ * w21: the counter offset for the block held in the ARM registers
+ * NEON
+ * v0-v15: register vi holds the i'th word of four blocks during the
+ * quarter rounds; these registers are later transposed to make
+ * adding the input and XORing the message easier
+ * v16-v19: helper registers used as temporary storage
+ * v20-v23: load the next message block
+ * v24-v27: the 64-byte initial ChaCha state
+ * v28: vector to increment the counter word of each block
+ * v29: vector of fives to advance the counters between iterations of
+ * L_chacha20_arm64_outer_%=
+ * v30: table look-up indices for rotating values left by 8 bits
+ */
+
+ /* Load counter-add values for each block */
+ "LD1 {v28.4s}, [%[L_chacha20_neon_add_all_counters]] \n\t"
+ /* Load index look-up for rotating left 8 bits */
+ "LD1 {v30.16b}, [%[L_chacha20_neon_rol8]] \n\t"
+ /* For adding 5 to each counter-add for next 320-byte chunk */
+ "MOVI v29.4s, #5 \n\t"
+ /* Counter for 5th block in regular registers */
+ "MOV w21, #4 \n\t"
+ /* Load state to encrypt */
+ "LD1 {v24.4s-v27.4s}, [%[input]] \n\t"
+ "\n"
+ "L_chacha20_arm64_outer_%=: \n\t"
+ /* Move state into regular registers */
+ "MOV x4, v24.d[0] \n\t"
+ "MOV x6, v24.d[1] \n\t"
+ "MOV x8, v25.d[0] \n\t"
+ "MOV x10, v25.d[1] \n\t"
+ "MOV x12, v26.d[0] \n\t"
+ "MOV x14, v26.d[1] \n\t"
+ "MOV x16, v27.d[0] \n\t"
+ "MOV x22, v27.d[1] \n\t"
+ /* Move state into vector registers (x4) */
+ "DUP v0.4s, v24.s[0] \n\t"
+ "DUP v1.4s, v24.s[1] \n\t"
+ "LSR x5, x4, #32 \n\t"
+ "DUP v2.4s, v24.s[2] \n\t"
+ "DUP v3.4s, v24.s[3] \n\t"
+ "LSR x7, x6, #32 \n\t"
+ "DUP v4.4s, v25.s[0] \n\t"
+ "DUP v5.4s, v25.s[1] \n\t"
+ "LSR x9, x8, #32 \n\t"
+ "DUP v6.4s, v25.s[2] \n\t"
+ "DUP v7.4s, v25.s[3] \n\t"
+ "LSR x11, x10, #32 \n\t"
+ "DUP v8.4s, v26.s[0] \n\t"
+ "DUP v9.4s, v26.s[1] \n\t"
+ "LSR x13, x12, #32 \n\t"
+ "DUP v10.4s, v26.s[2] \n\t"
+ "DUP v11.4s, v26.s[3] \n\t"
+ "LSR x15, x14, #32 \n\t"
+ "DUP v12.4s, v27.s[0] \n\t"
+ "DUP v13.4s, v27.s[1] \n\t"
+ "LSR x17, x16, #32 \n\t"
+ "DUP v14.4s, v27.s[2] \n\t"
+ "DUP v15.4s, v27.s[3] \n\t"
+ "LSR x19, x22, #32 \n\t"
+ /* Add to counter word */
+ "ADD v12.4s, v12.4s, v28.4s \n\t"
+ "ADD w16, w16, w21 \n\t"
+ /* Set number of odd+even rounds to perform */
+ "MOV w20, #10 \n\t"
+ "\n"
+ "L_chacha20_arm64_inner_%=: \n\t"
+ "SUBS w20, w20, #1 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4s, v0.4s, v4.4s \n\t"
+ "ADD w4, w4, w8 \n\t"
+ "ADD v1.4s, v1.4s, v5.4s \n\t"
+ "ADD w5, w5, w9 \n\t"
+ "ADD v2.4s, v2.4s, v6.4s \n\t"
+ "ADD w6, w6, w10 \n\t"
+ "ADD v3.4s, v3.4s, v7.4s \n\t"
+ "ADD w7, w7, w11 \n\t"
+ "EOR v12.16b, v12.16b, v0.16b \n\t"
+ "EOR w16, w16, w4 \n\t"
+ "EOR v13.16b, v13.16b, v1.16b \n\t"
+ "EOR w17, w17, w5 \n\t"
+ "EOR v14.16b, v14.16b, v2.16b \n\t"
+ "EOR w22, w22, w6 \n\t"
+ "EOR v15.16b, v15.16b, v3.16b \n\t"
+ "EOR w19, w19, w7 \n\t"
+ "REV32 v12.8h, v12.8h \n\t"
+ "ROR w16, w16, #16 \n\t"
+ "REV32 v13.8h, v13.8h \n\t"
+ "ROR w17, w17, #16 \n\t"
+ "REV32 v14.8h, v14.8h \n\t"
+ "ROR w22, w22, #16 \n\t"
+ "REV32 v15.8h, v15.8h \n\t"
+ "ROR w19, w19, #16 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v8.4s, v8.4s, v12.4s \n\t"
+ "ADD w12, w12, w16 \n\t"
+ "ADD v9.4s, v9.4s, v13.4s \n\t"
+ "ADD w13, w13, w17 \n\t"
+ "ADD v10.4s, v10.4s, v14.4s \n\t"
+ "ADD w14, w14, w22 \n\t"
+ "ADD v11.4s, v11.4s, v15.4s \n\t"
+ "ADD w15, w15, w19 \n\t"
+ "EOR v16.16b, v4.16b, v8.16b \n\t"
+ "EOR w8, w8, w12 \n\t"
+ "EOR v17.16b, v5.16b, v9.16b \n\t"
+ "EOR w9, w9, w13 \n\t"
+ "EOR v18.16b, v6.16b, v10.16b \n\t"
+ "EOR w10, w10, w14 \n\t"
+ "EOR v19.16b, v7.16b, v11.16b \n\t"
+ "EOR w11, w11, w15 \n\t"
+ "SHL v4.4s, v16.4s, #12 \n\t"
+ "ROR w8, w8, #20 \n\t"
+ "SHL v5.4s, v17.4s, #12 \n\t"
+ "ROR w9, w9, #20 \n\t"
+ "SHL v6.4s, v18.4s, #12 \n\t"
+ "ROR w10, w10, #20 \n\t"
+ "SHL v7.4s, v19.4s, #12 \n\t"
+ "ROR w11, w11, #20 \n\t"
+ "SRI v4.4s, v16.4s, #20 \n\t"
+ "SRI v5.4s, v17.4s, #20 \n\t"
+ "SRI v6.4s, v18.4s, #20 \n\t"
+ "SRI v7.4s, v19.4s, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4s, v0.4s, v4.4s \n\t"
+ "ADD w4, w4, w8 \n\t"
+ "ADD v1.4s, v1.4s, v5.4s \n\t"
+ "ADD w5, w5, w9 \n\t"
+ "ADD v2.4s, v2.4s, v6.4s \n\t"
+ "ADD w6, w6, w10 \n\t"
+ "ADD v3.4s, v3.4s, v7.4s \n\t"
+ "ADD w7, w7, w11 \n\t"
+ "EOR v12.16b, v12.16b, v0.16b \n\t"
+ "EOR w16, w16, w4 \n\t"
+ "EOR v13.16b, v13.16b, v1.16b \n\t"
+ "EOR w17, w17, w5 \n\t"
+ "EOR v14.16b, v14.16b, v2.16b \n\t"
+ "EOR w22, w22, w6 \n\t"
+ "EOR v15.16b, v15.16b, v3.16b \n\t"
+ "EOR w19, w19, w7 \n\t"
+ "TBL v12.16b, { v12.16b }, v30.16b \n\t"
+ "ROR w16, w16, #24 \n\t"
+ "TBL v13.16b, { v13.16b }, v30.16b \n\t"
+ "ROR w17, w17, #24 \n\t"
+ "TBL v14.16b, { v14.16b }, v30.16b \n\t"
+ "ROR w22, w22, #24 \n\t"
+ "TBL v15.16b, { v15.16b }, v30.16b \n\t"
+ "ROR w19, w19, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v8.4s, v8.4s, v12.4s \n\t"
+ "ADD w12, w12, w16 \n\t"
+ "ADD v9.4s, v9.4s, v13.4s \n\t"
+ "ADD w13, w13, w17 \n\t"
+ "ADD v10.4s, v10.4s, v14.4s \n\t"
+ "ADD w14, w14, w22 \n\t"
+ "ADD v11.4s, v11.4s, v15.4s \n\t"
+ "ADD w15, w15, w19 \n\t"
+ "EOR v16.16b, v4.16b, v8.16b \n\t"
+ "EOR w8, w8, w12 \n\t"
+ "EOR v17.16b, v5.16b, v9.16b \n\t"
+ "EOR w9, w9, w13 \n\t"
+ "EOR v18.16b, v6.16b, v10.16b \n\t"
+ "EOR w10, w10, w14 \n\t"
+ "EOR v19.16b, v7.16b, v11.16b \n\t"
+ "EOR w11, w11, w15 \n\t"
+ "SHL v4.4s, v16.4s, #7 \n\t"
+ "ROR w8, w8, #25 \n\t"
+ "SHL v5.4s, v17.4s, #7 \n\t"
+ "ROR w9, w9, #25 \n\t"
+ "SHL v6.4s, v18.4s, #7 \n\t"
+ "ROR w10, w10, #25 \n\t"
+ "SHL v7.4s, v19.4s, #7 \n\t"
+ "ROR w11, w11, #25 \n\t"
+ "SRI v4.4s, v16.4s, #25 \n\t"
+ "SRI v5.4s, v17.4s, #25 \n\t"
+ "SRI v6.4s, v18.4s, #25 \n\t"
+ "SRI v7.4s, v19.4s, #25 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4s, v0.4s, v5.4s \n\t"
+ "ADD w4, w4, w9 \n\t"
+ "ADD v1.4s, v1.4s, v6.4s \n\t"
+ "ADD w5, w5, w10 \n\t"
+ "ADD v2.4s, v2.4s, v7.4s \n\t"
+ "ADD w6, w6, w11 \n\t"
+ "ADD v3.4s, v3.4s, v4.4s \n\t"
+ "ADD w7, w7, w8 \n\t"
+ "EOR v15.16b, v15.16b, v0.16b \n\t"
+ "EOR w19, w19, w4 \n\t"
+ "EOR v12.16b, v12.16b, v1.16b \n\t"
+ "EOR w16, w16, w5 \n\t"
+ "EOR v13.16b, v13.16b, v2.16b \n\t"
+ "EOR w17, w17, w6 \n\t"
+ "EOR v14.16b, v14.16b, v3.16b \n\t"
+ "EOR w22, w22, w7 \n\t"
+ "REV32 v15.8h, v15.8h \n\t"
+ "ROR w19, w19, #16 \n\t"
+ "REV32 v12.8h, v12.8h \n\t"
+ "ROR w16, w16, #16 \n\t"
+ "REV32 v13.8h, v13.8h \n\t"
+ "ROR w17, w17, #16 \n\t"
+ "REV32 v14.8h, v14.8h \n\t"
+ "ROR w22, w22, #16 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v10.4s, v10.4s, v15.4s \n\t"
+ "ADD w14, w14, w19 \n\t"
+ "ADD v11.4s, v11.4s, v12.4s \n\t"
+ "ADD w15, w15, w16 \n\t"
+ "ADD v8.4s, v8.4s, v13.4s \n\t"
+ "ADD w12, w12, w17 \n\t"
+ "ADD v9.4s, v9.4s, v14.4s \n\t"
+ "ADD w13, w13, w22 \n\t"
+ "EOR v16.16b, v5.16b, v10.16b \n\t"
+ "EOR w9, w9, w14 \n\t"
+ "EOR v17.16b, v6.16b, v11.16b \n\t"
+ "EOR w10, w10, w15 \n\t"
+ "EOR v18.16b, v7.16b, v8.16b \n\t"
+ "EOR w11, w11, w12 \n\t"
+ "EOR v19.16b, v4.16b, v9.16b \n\t"
+ "EOR w8, w8, w13 \n\t"
+ "SHL v5.4s, v16.4s, #12 \n\t"
+ "ROR w9, w9, #20 \n\t"
+ "SHL v6.4s, v17.4s, #12 \n\t"
+ "ROR w10, w10, #20 \n\t"
+ "SHL v7.4s, v18.4s, #12 \n\t"
+ "ROR w11, w11, #20 \n\t"
+ "SHL v4.4s, v19.4s, #12 \n\t"
+ "ROR w8, w8, #20 \n\t"
+ "SRI v5.4s, v16.4s, #20 \n\t"
+ "SRI v6.4s, v17.4s, #20 \n\t"
+ "SRI v7.4s, v18.4s, #20 \n\t"
+ "SRI v4.4s, v19.4s, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4s, v0.4s, v5.4s \n\t"
+ "ADD w4, w4, w9 \n\t"
+ "ADD v1.4s, v1.4s, v6.4s \n\t"
+ "ADD w5, w5, w10 \n\t"
+ "ADD v2.4s, v2.4s, v7.4s \n\t"
+ "ADD w6, w6, w11 \n\t"
+ "ADD v3.4s, v3.4s, v4.4s \n\t"
+ "ADD w7, w7, w8 \n\t"
+ "EOR v15.16b, v15.16b, v0.16b \n\t"
+ "EOR w19, w19, w4 \n\t"
+ "EOR v12.16b, v12.16b, v1.16b \n\t"
+ "EOR w16, w16, w5 \n\t"
+ "EOR v13.16b, v13.16b, v2.16b \n\t"
+ "EOR w17, w17, w6 \n\t"
+ "EOR v14.16b, v14.16b, v3.16b \n\t"
+ "EOR w22, w22, w7 \n\t"
+ "TBL v15.16b, { v15.16b }, v30.16b \n\t"
+ "ROR w19, w19, #24 \n\t"
+ "TBL v12.16b, { v12.16b }, v30.16b \n\t"
+ "ROR w16, w16, #24 \n\t"
+ "TBL v13.16b, { v13.16b }, v30.16b \n\t"
+ "ROR w17, w17, #24 \n\t"
+ "TBL v14.16b, { v14.16b }, v30.16b \n\t"
+ "ROR w22, w22, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v10.4s, v10.4s, v15.4s \n\t"
+ "ADD w14, w14, w19 \n\t"
+ "ADD v11.4s, v11.4s, v12.4s \n\t"
+ "ADD w15, w15, w16 \n\t"
+ "ADD v8.4s, v8.4s, v13.4s \n\t"
+ "ADD w12, w12, w17 \n\t"
+ "ADD v9.4s, v9.4s, v14.4s \n\t"
+ "ADD w13, w13, w22 \n\t"
+ "EOR v16.16b, v5.16b, v10.16b \n\t"
+ "EOR w9, w9, w14 \n\t"
+ "EOR v17.16b, v6.16b, v11.16b \n\t"
+ "EOR w10, w10, w15 \n\t"
+ "EOR v18.16b, v7.16b, v8.16b \n\t"
+ "EOR w11, w11, w12 \n\t"
+ "EOR v19.16b, v4.16b, v9.16b \n\t"
+ "EOR w8, w8, w13 \n\t"
+ "SHL v5.4s, v16.4s, #7 \n\t"
+ "ROR w9, w9, #25 \n\t"
+ "SHL v6.4s, v17.4s, #7 \n\t"
+ "ROR w10, w10, #25 \n\t"
+ "SHL v7.4s, v18.4s, #7 \n\t"
+ "ROR w11, w11, #25 \n\t"
+ "SHL v4.4s, v19.4s, #7 \n\t"
+ "ROR w8, w8, #25 \n\t"
+ "SRI v5.4s, v16.4s, #25 \n\t"
+ "SRI v6.4s, v17.4s, #25 \n\t"
+ "SRI v7.4s, v18.4s, #25 \n\t"
+ "SRI v4.4s, v19.4s, #25 \n\t"
+ "BNE L_chacha20_arm64_inner_%= \n\t"
+ /* Add counter now rather than after transposed */
+ "ADD v12.4s, v12.4s, v28.4s \n\t"
+ "ADD w16, w16, w21 \n\t"
+ /* Load message */
+ "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+ /* Transpose vectors */
+ "TRN1 v16.4s, v0.4s, v1.4s \n\t"
+ "TRN1 v18.4s, v2.4s, v3.4s \n\t"
+ "TRN2 v17.4s, v0.4s, v1.4s \n\t"
+ "TRN2 v19.4s, v2.4s, v3.4s \n\t"
+ "TRN1 v0.2d, v16.2d, v18.2d \n\t"
+ "TRN1 v1.2d, v17.2d, v19.2d \n\t"
+ "TRN2 v2.2d, v16.2d, v18.2d \n\t"
+ "TRN2 v3.2d, v17.2d, v19.2d \n\t"
+ "TRN1 v16.4s, v4.4s, v5.4s \n\t"
+ "TRN1 v18.4s, v6.4s, v7.4s \n\t"
+ "TRN2 v17.4s, v4.4s, v5.4s \n\t"
+ "TRN2 v19.4s, v6.4s, v7.4s \n\t"
+ "TRN1 v4.2d, v16.2d, v18.2d \n\t"
+ "TRN1 v5.2d, v17.2d, v19.2d \n\t"
+ "TRN2 v6.2d, v16.2d, v18.2d \n\t"
+ "TRN2 v7.2d, v17.2d, v19.2d \n\t"
+ "TRN1 v16.4s, v8.4s, v9.4s \n\t"
+ "TRN1 v18.4s, v10.4s, v11.4s \n\t"
+ "TRN2 v17.4s, v8.4s, v9.4s \n\t"
+ "TRN2 v19.4s, v10.4s, v11.4s \n\t"
+ "TRN1 v8.2d, v16.2d, v18.2d \n\t"
+ "TRN1 v9.2d, v17.2d, v19.2d \n\t"
+ "TRN2 v10.2d, v16.2d, v18.2d \n\t"
+ "TRN2 v11.2d, v17.2d, v19.2d \n\t"
+ "TRN1 v16.4s, v12.4s, v13.4s \n\t"
+ "TRN1 v18.4s, v14.4s, v15.4s \n\t"
+ "TRN2 v17.4s, v12.4s, v13.4s \n\t"
+ "TRN2 v19.4s, v14.4s, v15.4s \n\t"
+ "TRN1 v12.2d, v16.2d, v18.2d \n\t"
+ "TRN1 v13.2d, v17.2d, v19.2d \n\t"
+ "TRN2 v14.2d, v16.2d, v18.2d \n\t"
+ "TRN2 v15.2d, v17.2d, v19.2d \n\t"
+ /* Add back state, XOR in message and store (load next block) */
+ "ADD v16.4s, v0.4s, v24.4s \n\t"
+ "ADD v17.4s, v4.4s, v25.4s \n\t"
+ "ADD v18.4s, v8.4s, v26.4s \n\t"
+ "ADD v19.4s, v12.4s, v27.4s \n\t"
+ "EOR v16.16b, v16.16b, v20.16b \n\t"
+ "EOR v17.16b, v17.16b, v21.16b \n\t"
+ "EOR v18.16b, v18.16b, v22.16b \n\t"
+ "EOR v19.16b, v19.16b, v23.16b \n\t"
+ "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+ "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+ "ADD v16.4s, v1.4s, v24.4s \n\t"
+ "ADD v17.4s, v5.4s, v25.4s \n\t"
+ "ADD v18.4s, v9.4s, v26.4s \n\t"
+ "ADD v19.4s, v13.4s, v27.4s \n\t"
+ "EOR v16.16b, v16.16b, v20.16b \n\t"
+ "EOR v17.16b, v17.16b, v21.16b \n\t"
+ "EOR v18.16b, v18.16b, v22.16b \n\t"
+ "EOR v19.16b, v19.16b, v23.16b \n\t"
+ "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+ "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+ "ADD v16.4s, v2.4s, v24.4s \n\t"
+ "ADD v17.4s, v6.4s, v25.4s \n\t"
+ "ADD v18.4s, v10.4s, v26.4s \n\t"
+ "ADD v19.4s, v14.4s, v27.4s \n\t"
+ "EOR v16.16b, v16.16b, v20.16b \n\t"
+ "EOR v17.16b, v17.16b, v21.16b \n\t"
+ "EOR v18.16b, v18.16b, v22.16b \n\t"
+ "EOR v19.16b, v19.16b, v23.16b \n\t"
+ "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+ "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+ "ADD v16.4s, v3.4s, v24.4s \n\t"
+ "ADD v17.4s, v7.4s, v25.4s \n\t"
+ "ADD v18.4s, v11.4s, v26.4s \n\t"
+ "ADD v19.4s, v15.4s, v27.4s \n\t"
+ "EOR v16.16b, v16.16b, v20.16b \n\t"
+ "EOR v17.16b, v17.16b, v21.16b \n\t"
+ "EOR v18.16b, v18.16b, v22.16b \n\t"
+ "EOR v19.16b, v19.16b, v23.16b \n\t"
+ "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+ "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+ /* Move regular registers into vector registers for adding and xor */
+ "ORR x4, x4, x5, LSL #32 \n\t"
+ "ORR x6, x6, x7, LSL #32 \n\t"
+ "ORR x8, x8, x9, LSL #32 \n\t"
+ "MOV v16.d[0], x4 \n\t"
+ "ORR x10, x10, x11, LSL #32 \n\t"
+ "MOV v16.d[1], x6 \n\t"
+ "ORR x12, x12, x13, LSL #32 \n\t"
+ "MOV v17.d[0], x8 \n\t"
+ "ORR x14, x14, x15, LSL #32 \n\t"
+ "MOV v17.d[1], x10 \n\t"
+ "ORR x16, x16, x17, LSL #32 \n\t"
+ "MOV v18.d[0], x12 \n\t"
+ "ORR x22, x22, x19, LSL #32 \n\t"
+ "MOV v18.d[1], x14 \n\t"
+ "MOV v19.d[0], x16 \n\t"
+ "MOV v19.d[1], x22 \n\t"
+ /* Add back state, XOR in message and store */
+ "ADD v16.4s, v16.4s, v24.4s \n\t"
+ "ADD v17.4s, v17.4s, v25.4s \n\t"
+ "ADD v18.4s, v18.4s, v26.4s \n\t"
+ "ADD v19.4s, v19.4s, v27.4s \n\t"
+ "EOR v16.16b, v16.16b, v20.16b \n\t"
+ "EOR v17.16b, v17.16b, v21.16b \n\t"
+ "EOR v18.16b, v18.16b, v22.16b \n\t"
+ "EOR v19.16b, v19.16b, v23.16b \n\t"
+ "ADD w21, w21, #5 \n\t"
+ "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+ "SUBS %[bytes], %[bytes], #320 \n\t"
+ "ADD v28.4s, v28.4s, v29.4s \n\t"
+ "BNE L_chacha20_arm64_outer_%= \n\t"
+ : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c),
+ [bytes] "+r" (bytes64)
+ : [L_chacha20_neon_add_all_counters] "r" (L_chacha20_neon_add_all_counters),
+ [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
+ : "memory", "cc",
+ "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
+ "x13", "x14", "x15", "x16", "x17", "x22", "x19", "x20", "x21",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+ "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"
+ );
+}
+#endif /* __aarch64__ */
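+
+/* Reference ChaCha quarter round that the interleaved assembly in this file
+ * implements; a scalar sketch with illustrative names, kept under #if 0. */
+#if 0
+#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+#define QUARTERROUND_SKETCH(a, b, c, d) do { \
+ (a) += (b); (d) ^= (a); (d) = ROTL32((d), 16); \
+ (c) += (d); (b) ^= (c); (b) = ROTL32((b), 12); \
+ (a) += (b); (d) ^= (a); (d) = ROTL32((d), 8); \
+ (c) += (d); (b) ^= (c); (b) = ROTL32((b), 7); \
+} while (0)
+#endif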
+
+/**
+ * Encrypts four 64-byte ChaCha blocks (256 bytes): three blocks in NEON
+ * registers and a fourth in the general-purpose registers.
+ */
+static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c)
+{
+#ifdef CHACHA_TEST
+ printf("Entering wc_Chacha_encrypt_256\n");
+#endif /* CHACHA_TEST */
+
+#ifdef __aarch64__
+ __asm__ __volatile__ (
+ // v0-v3 - first block
+ // v12 first block helper
+ // v4-v7 - second block
+ // v13 second block helper
+ // v8-v11 - third block
+ // v14 third block helper
+ // w4-w19 - fourth block
+
+ // v0 0 1 2 3
+ // v1 4 5 6 7
+ // v2 8 9 10 11
+ // v3 12 13 14 15
+ // load the ChaCha state with word indices placed as shown above
+ /* Load state to encrypt */
+ "LD1 {v20.4S-v23.4S}, [%[input]] \n\t"
+ /* Load index look-up for rotating left 8 bits */
+ "LD1 {v24.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+ /* Move state into regular registers */
+ "MOV x4, v20.D[0] \n\t"
+ "MOV x6, v20.D[1] \n\t"
+ "MOV x8, v21.D[0] \n\t"
+ "MOV x10, v21.D[1] \n\t"
+ "MOV x12, v22.D[0] \n\t"
+ "MOV x14, v22.D[1] \n\t"
+ "MOV x16, v23.D[0] \n\t"
+ "MOV x22, v23.D[1] \n\t"
+ /* Move state into vector registers (x3) */
+ "MOV v0.16B, v20.16B \n\t"
+ "MOV v1.16B, v21.16B \n\t"
+ "LSR x19, x22, #32 \n\t"
+ "MOV v2.16B, v22.16B \n\t"
+ "ADD w20, w16, #1 \n\t"
+ "MOV v3.16B, v23.16B \n\t"
+ "LSR x17, x16, #32 \n\t"
+ "MOV v4.16B, v20.16B \n\t"
+ "MOV v5.16B, v21.16B \n\t"
+ "LSR x15, x14, #32 \n\t"
+ "MOV v6.16B, v22.16B \n\t"
+ "ADD w21, w16, #2 \n\t"
+ "MOV v7.16B, v23.16B \n\t"
+ "LSR x13, x12, #32 \n\t"
+ "MOV v8.16B, v20.16B \n\t"
+ "MOV v9.16B, v21.16B \n\t"
+ "LSR x11, x10, #32 \n\t"
+ "MOV v10.16B, v22.16B \n\t"
+ "ADD w16, w16, #3 \n\t"
+ "MOV v11.16B, v23.16B \n\t"
+ "LSR x9, x8, #32 \n\t"
+ /* Set counter word */
+ "MOV v7.S[0], w20 \n\t"
+ "LSR x7, x6, #32 \n\t"
+ "MOV v11.S[0], w21 \n\t"
+ "LSR x5, x4, #32 \n\t"
+ /* Set number of odd+even rounds to perform */
+ "MOV w3, #10 \n\t"
+ "\n"
+ "L_chacha20_arm64_256_loop_%=: \n\t"
+ "SUBS w3, w3, #1 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD w4, w4, w8 \n\t"
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD w5, w5, w9 \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "ADD w6, w6, w10 \n\t"
+ "ADD v8.4S, v8.4S, v9.4S \n\t"
+ "ADD w7, w7, w11 \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR w16, w16, w4 \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "EOR w17, w17, w5 \n\t"
+ "EOR v11.16B, v11.16B, v8.16B \n\t"
+ "EOR w22, w22, w6 \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ "EOR w19, w19, w7 \n\t"
+ "REV32 v7.8H, v7.8H \n\t"
+ "ROR w16, w16, #16 \n\t"
+ "REV32 v11.8H, v11.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ROR w17, w17, #16 \n\t"
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ROR w22, w22, #16 \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "ROR w19, w19, #16 \n\t"
+ "ADD v10.4S, v10.4S, v11.4S \n\t"
+ "ADD w12, w12, w16 \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "ADD w13, w13, w17 \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "ADD w14, w14, w22 \n\t"
+ "EOR v14.16B, v9.16B, v10.16B \n\t"
+ "ADD w15, w15, w19 \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "EOR w8, w8, w12 \n\t"
+ "SHL v5.4S, v13.4S, #12 \n\t"
+ "EOR w9, w9, w13 \n\t"
+ "SHL v9.4S, v14.4S, #12 \n\t"
+ "EOR w10, w10, w14 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ "EOR w11, w11, w15 \n\t"
+ "SRI v5.4S, v13.4S, #20 \n\t"
+ "ROR w8, w8, #20 \n\t"
+ "SRI v9.4S, v14.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ROR w9, w9, #20 \n\t"
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ROR w10, w10, #20 \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "ROR w11, w11, #20 \n\t"
+ "ADD v8.4S, v8.4S, v9.4S \n\t"
+ "ADD w4, w4, w8 \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "ADD w5, w5, w9 \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "ADD w6, w6, w10 \n\t"
+ "EOR v11.16B, v11.16B, v8.16B \n\t"
+ "ADD w7, w7, w11 \n\t"
+ "TBL v3.16B, { v3.16B }, v24.16B \n\t"
+ "EOR w16, w16, w4 \n\t"
+ "TBL v7.16B, { v7.16B }, v24.16B \n\t"
+ "EOR w17, w17, w5 \n\t"
+ "TBL v11.16B, { v11.16B }, v24.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "EOR w22, w22, w6 \n\t"
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR w19, w19, w7 \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "ROR w16, w16, #24 \n\t"
+ "ADD v10.4S, v10.4S, v11.4S \n\t"
+ "ROR w17, w17, #24 \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "ROR w22, w22, #24 \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "ROR w19, w19, #24 \n\t"
+ "EOR v14.16B, v9.16B, v10.16B \n\t"
+ "ADD w12, w12, w16 \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "ADD w13, w13, w17 \n\t"
+ "SHL v5.4S, v13.4S, #7 \n\t"
+ "ADD w14, w14, w22 \n\t"
+ "SHL v9.4S, v14.4S, #7 \n\t"
+ "ADD w15, w15, w19 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EOR w8, w8, w12 \n\t"
+ "SRI v5.4S, v13.4S, #25 \n\t"
+ "EOR w9, w9, w13 \n\t"
+ "SRI v9.4S, v14.4S, #25 \n\t"
+ "EOR w10, w10, w14 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EOR w11, w11, w15 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ "ROR w8, w8, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "ROR w9, w9, #25 \n\t"
+ "EXT v5.16B, v5.16B, v5.16B, #4 \n\t"
+ "ROR w10, w10, #25 \n\t"
+ "EXT v6.16B, v6.16B, v6.16B, #8 \n\t"
+ "ROR w11, w11, #25 \n\t"
+ "EXT v7.16B, v7.16B, v7.16B, #12 \n\t"
+ "EXT v9.16B, v9.16B, v9.16B, #4 \n\t"
+ "EXT v10.16B, v10.16B, v10.16B, #8 \n\t"
+ "EXT v11.16B, v11.16B, v11.16B, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD w4, w4, w9 \n\t"
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD w5, w5, w10 \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "ADD w6, w6, w11 \n\t"
+ "ADD v8.4S, v8.4S, v9.4S \n\t"
+ "ADD w7, w7, w8 \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR w19, w19, w4 \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "EOR w16, w16, w5 \n\t"
+ "EOR v11.16B, v11.16B, v8.16B \n\t"
+ "EOR w17, w17, w6 \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ "EOR w22, w22, w7 \n\t"
+ "REV32 v7.8H, v7.8H \n\t"
+ "ROR w19, w19, #16 \n\t"
+ "REV32 v11.8H, v11.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ROR w16, w16, #16 \n\t"
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ROR w17, w17, #16 \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "ROR w22, w22, #16 \n\t"
+ "ADD v10.4S, v10.4S, v11.4S \n\t"
+ "ADD w14, w14, w19 \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "ADD w15, w15, w16 \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "ADD w12, w12, w17 \n\t"
+ "EOR v14.16B, v9.16B, v10.16B \n\t"
+ "ADD w13, w13, w22 \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "EOR w9, w9, w14 \n\t"
+ "SHL v5.4S, v13.4S, #12 \n\t"
+ "EOR w10, w10, w15 \n\t"
+ "SHL v9.4S, v14.4S, #12 \n\t"
+ "EOR w11, w11, w12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ "EOR w8, w8, w13 \n\t"
+ "SRI v5.4S, v13.4S, #20 \n\t"
+ "ROR w9, w9, #20 \n\t"
+ "SRI v9.4S, v14.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ROR w10, w10, #20 \n\t"
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ROR w11, w11, #20 \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "ROR w8, w8, #20 \n\t"
+ "ADD v8.4S, v8.4S, v9.4S \n\t"
+ "ADD w4, w4, w9 \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "ADD w5, w5, w10 \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "ADD w6, w6, w11 \n\t"
+ "EOR v11.16B, v11.16B, v8.16B \n\t"
+ "ADD w7, w7, w8 \n\t"
+ "TBL v3.16B, { v3.16B }, v24.16B \n\t"
+ "EOR w19, w19, w4 \n\t"
+ "TBL v7.16B, { v7.16B }, v24.16B \n\t"
+ "EOR w16, w16, w5 \n\t"
+ "TBL v11.16B, { v11.16B }, v24.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "EOR w17, w17, w6 \n\t"
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR w22, w22, w7 \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "ROR w19, w19, #24 \n\t"
+ "ADD v10.4S, v10.4S, v11.4S \n\t"
+ "ROR w16, w16, #24 \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "ROR w17, w17, #24 \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "ROR w22, w22, #24 \n\t"
+ "EOR v14.16B, v9.16B, v10.16B \n\t"
+ "ADD w14, w14, w19 \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "ADD w15, w15, w16 \n\t"
+ "SHL v5.4S, v13.4S, #7 \n\t"
+ "ADD w12, w12, w17 \n\t"
+ "SHL v9.4S, v14.4S, #7 \n\t"
+ "ADD w13, w13, w22 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EOR w9, w9, w14 \n\t"
+ "SRI v5.4S, v13.4S, #25 \n\t"
+ "EOR w10, w10, w15 \n\t"
+ "SRI v9.4S, v14.4S, #25 \n\t"
+ "EOR w11, w11, w12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EOR w8, w8, w13 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ "ROR w9, w9, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "ROR w10, w10, #25 \n\t"
+ "EXT v5.16B, v5.16B, v5.16B, #12 \n\t"
+ "ROR w11, w11, #25 \n\t"
+ "EXT v6.16B, v6.16B, v6.16B, #8 \n\t"
+ "ROR w8, w8, #25 \n\t"
+ "EXT v7.16B, v7.16B, v7.16B, #4 \n\t"
+ "EXT v9.16B, v9.16B, v9.16B, #12 \n\t"
+ "EXT v10.16B, v10.16B, v10.16B, #8 \n\t"
+ "EXT v11.16B, v11.16B, v11.16B, #4 \n\t"
+ "BNE L_chacha20_arm64_256_loop_%= \n\t"
+ /* Load message */
+ "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+ /* Add one here; two more are added via v23 while storing the vector blocks */
+ "ADD w16, w16, #1 \n\t"
+ /* Add back state, XOR in message and store (load next block) */
+ "ADD v0.4S, v0.4S, v20.4S \n\t"
+ "ADD v1.4S, v1.4S, v21.4S \n\t"
+ "ADD v2.4S, v2.4S, v22.4S \n\t"
+ "ADD v3.4S, v3.4S, v23.4S \n\t"
+ "EOR v0.16B, v0.16B, v16.16B \n\t"
+ "EOR v1.16B, v1.16B, v17.16B \n\t"
+ "EOR v2.16B, v2.16B, v18.16B \n\t"
+ "EOR v3.16B, v3.16B, v19.16B \n\t"
+ "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+ "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t"
+ "MOV v23.S[0], w20 \n\t"
+ "ADD v4.4S, v4.4S, v20.4S \n\t"
+ "ADD v5.4S, v5.4S, v21.4S \n\t"
+ "ADD v6.4S, v6.4S, v22.4S \n\t"
+ "ADD v7.4S, v7.4S, v23.4S \n\t"
+ "EOR v4.16B, v4.16B, v16.16B \n\t"
+ "EOR v5.16B, v5.16B, v17.16B \n\t"
+ "EOR v6.16B, v6.16B, v18.16B \n\t"
+ "EOR v7.16B, v7.16B, v19.16B \n\t"
+ "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+ "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+ "MOV v23.S[0], w21 \n\t"
+ "ADD v8.4S, v8.4S, v20.4S \n\t"
+ "ADD v9.4S, v9.4S, v21.4S \n\t"
+ "ADD v10.4S, v10.4S, v22.4S \n\t"
+ "ADD v11.4S, v11.4S, v23.4S \n\t"
+ "EOR v8.16B, v8.16B, v16.16B \n\t"
+ "EOR v9.16B, v9.16B, v17.16B \n\t"
+ "EOR v10.16B, v10.16B, v18.16B \n\t"
+ "EOR v11.16B, v11.16B, v19.16B \n\t"
+ "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+ "ST1 {v8.4S-v11.4S}, [%[c]], #64 \n\t"
+ /* Move regular registers into vector registers for adding and xor */
+ "ORR x4, x4, x5, lsl #32 \n\t"
+ "ORR x6, x6, x7, lsl #32 \n\t"
+ "ORR x8, x8, x9, lsl #32 \n\t"
+ "MOV v12.D[0], x4 \n\t"
+ "ORR x10, x10, x11, lsl #32 \n\t"
+ "MOV v12.D[1], x6 \n\t"
+ "ORR x12, x12, x13, lsl #32 \n\t"
+ "MOV v13.D[0], x8 \n\t"
+ "ORR x14, x14, x15, lsl #32 \n\t"
+ "MOV v13.D[1], x10 \n\t"
+ "ORR x16, x16, x17, lsl #32 \n\t"
+ "MOV v14.D[0], x12 \n\t"
+ "ORR x22, x22, x19, lsl #32 \n\t"
+ "MOV v14.D[1], x14 \n\t"
+ "MOV v15.D[0], x16 \n\t"
+ "MOV v15.D[1], x22 \n\t"
+ /* Add back state, XOR in message and store */
+ "ADD v12.4S, v12.4S, v20.4S \n\t"
+ "ADD v13.4S, v13.4S, v21.4S \n\t"
+ "ADD v14.4S, v14.4S, v22.4S \n\t"
+ "ADD v15.4S, v15.4S, v23.4S \n\t"
+ "EOR v12.16B, v12.16B, v16.16B \n\t"
+ "EOR v13.16B, v13.16B, v17.16B \n\t"
+ "EOR v14.16B, v14.16B, v18.16B \n\t"
+ "EOR v15.16B, v15.16B, v19.16B \n\t"
+ "ST1 {v12.4S-v15.4S}, [%[c]], #64 \n\t"
+ : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c)
+ : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
+ "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+ "x17", "x22", "x19", "x20", "x21", "v0", "v1",
+ "v2", "v3", "v4", "v5", "v6", "v7", "v8",
+ "v9", "v10", "v11", "v12", "v13", "v14",
+ "v15", "v16", "v17", "v18", "v19", "v20",
+ "v21", "v22", "v23"
+ );
+#else
+ word32 x[CHACHA_CHUNK_WORDS];
+ word32* x_addr = x;
+ __asm__ __volatile__ (
+ // The optimizations follow the paper "NEON crypto" by Daniel J. Bernstein
+ // and Peter Schwabe: https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+
+ ".align 2 \n\t"
+ "LDR r14, %[input] \n\t" // load input address
+
+ "LDM r14, { r0-r12 } \n\t"
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12
+ "VMOV d0, r0, r1 \n\t"
+ "VMOV d1, r2, r3 \n\t"
+ "VMOV d2, r4, r5 \n\t"
+ "VMOV d3, r6, r7 \n\t"
+ "VMOV d4, r8, r9 \n\t"
+ "STRD r10, r11, %[x_10] \n\t"
+ "VMOV d5, r10, r11 \n\t"
+ "LDRD r11, r10, [r14, #4*14] \n\t"
+ "VMOV q4, q0 \n\t"
+ "VMOV q5, q1 \n\t"
+ "VMOV q6, q2 \n\t"
+ "VMOV q8, q0 \n\t"
+ "VMOV q9, q1 \n\t"
+ "VMOV q10, q2 \n\t"
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 8 9 15 14 12
+ "VMOV d7, r11, r10 \n\t"
+ "STR r10, %[x_15] \n\t"
+ "VMOV d15, r11, r10 \n\t"
+ "VMOV d23, r11, r10 \n\t"
+ "MOV r10, r12 \n\t"
+ "MOV r12, r11 \n\t"
+ "LDR r11, [r14, #4*13] \n\t"
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 8 9 12 13 14
+
+ "MOV r14, %[rounds] \n\t"
+
+ "VMOV d6, r10, r11 \n\t"
+ "ADD r10, r10, #1 \n\t"
+ "VMOV d14, r10, r11 \n\t"
+ "ADD r10, r10, #1 \n\t"
+ "VMOV d22, r10, r11 \n\t"
+ "ADD r10, r10, #1 \n\t" // ARM calculates the fourth block (two was already added earlier)
+ "\n"
+ "L_chacha20_arm32_256_loop_%=: \n\t"
+ "SUBS r14, r14, #1 \n\t"
+
+ // 0, 4, 8, 12
+ // 1, 5, 9, 13
+
+ // ODD ROUND
+ "ADD r0, r0, r4 \n\t" // 0 0 4
+ "VADD.I32 q0, q0, q1 \n\t"
+ "ADD r1, r1, r5 \n\t" // 1 1 5
+ "VADD.I32 q4, q4, q5 \n\t"
+ "EOR r10, r10, r0 \n\t" // 12 12 0
+ "VADD.I32 q8, q8, q9 \n\t"
+ "EOR r11, r11, r1 \n\t" // 13 13 1
+ "VEOR q12, q3, q0 \n\t"
+ "ROR r10, r10, #16 \n\t" // 12 12
+ "VEOR q13, q7, q4 \n\t"
+ "ROR r11, r11, #16 \n\t" // 13 13
+ "VEOR q14, q11, q8 \n\t"
+ "ADD r8, r8, r10 \n\t" // 8 8 12
+ // a rotation by 16 bits can be done by reversing the 16-bit halves of each 32-bit word
+ "VREV32.16 q3, q12 \n\t"
+ "ADD r9, r9, r11 \n\t" // 9 9 13
+ "VREV32.16 q7, q13 \n\t"
+ "EOR r4, r4, r8 \n\t" // 4 4 8
+ "VREV32.16 q11, q14 \n\t"
+
+ "EOR r5, r5, r9 \n\t" // 5 5 9
+ "VADD.I32 q2, q2, q3 \n\t"
+ "ROR r4, r4, #20 \n\t" // 4 4
+ "VADD.I32 q6, q6, q7 \n\t"
+ "ROR r5, r5, #20 \n\t" // 5 5
+ "VADD.I32 q10, q10, q11 \n\t"
+ "ADD r0, r0, r4 \n\t" // 0 0 4
+ "VEOR q12, q1, q2 \n\t"
+ "ADD r1, r1, r5 \n\t" // 1 1 5
+ "VEOR q13, q5, q6 \n\t"
+ "EOR r10, r10, r0 \n\t" // 12 12 0
+ "VEOR q14, q9, q10 \n\t"
+ "EOR r11, r11, r1 \n\t" // 13 13 1
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q12, #12 \n\t"
+ "ROR r10, r10, #24 \n\t" // 12 12
+ "VSHL.I32 q5, q13, #12 \n\t"
+ "ROR r11, r11, #24 \n\t" // 13 13
+ "VSHL.I32 q9, q14, #12 \n\t"
+ "ADD r8, r8, r10 \n\t" // 8 8 12
+ "VSRI.I32 q1, q12, #20 \n\t"
+ "ADD r9, r9, r11 \n\t" // 9 9 13
+ "VSRI.I32 q5, q13, #20 \n\t"
+ "STR r11, %[x_13] \n\t"
+ "VSRI.I32 q9, q14, #20 \n\t"
+
+ "LDR r11, %[x_15] \n\t"
+ "VADD.I32 q0, q0, q1 \n\t"
+ "EOR r4, r4, r8 \n\t" // 4 4 8
+ "VADD.I32 q4, q4, q5 \n\t"
+ "STR r8, %[x_8] \n\t"
+ "VADD.I32 q8, q8, q9 \n\t"
+ "LDR r8, %[x_10] \n\t"
+ "VEOR q12, q3, q0 \n\t"
+ "EOR r5, r5, r9 \n\t" // 5 5 9
+ "VEOR q13, q7, q4 \n\t"
+ "STR r9, %[x_9] \n\t"
+ "VEOR q14, q11, q8 \n\t"
+ "LDR r9, %[x_11] \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q3, q12, #8 \n\t"
+ "ROR r4, r4, #25 \n\t" // 4 4
+ "VSHL.I32 q7, q13, #8 \n\t"
+ "ROR r5, r5, #25 \n\t" // 5 5
+ "VSHL.I32 q11, q14, #8 \n\t"
+
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 10 11 12 15 14
+
+ // 2, 6, 10, 14
+ // 3, 7, 11, 15
+
+ "ADD r2, r2, r6 \n\t" // 2 2 6
+ "VSRI.I32 q3, q12, #24 \n\t"
+ "ADD r3, r3, r7 \n\t" // 3 3 7
+ "VSRI.I32 q7, q13, #24 \n\t"
+ "EOR r12, r12, r2 \n\t" // 14 14 2
+ "VSRI.I32 q11, q14, #24 \n\t"
+
+ "EOR r11, r11, r3 \n\t" // 15 15 3
+ "VADD.I32 q2, q2, q3 \n\t"
+ "ROR r12, r12, #16 \n\t" // 14 14
+ "VADD.I32 q6, q6, q7 \n\t"
+ "ROR r11, r11, #16 \n\t" // 15 15
+ "VADD.I32 q10, q10, q11 \n\t"
+ "ADD r8, r8, r12 \n\t" // 10 10 14
+ "VEOR q12, q1, q2 \n\t"
+ "ADD r9, r9, r11 \n\t" // 11 11 15
+ "VEOR q13, q5, q6 \n\t"
+ "EOR r6, r6, r8 \n\t" // 6 6 10
+ "VEOR q14, q9, q10 \n\t"
+ "EOR r7, r7, r9 \n\t" // 7 7 11
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q12, #7 \n\t"
+ "ROR r6, r6, #20 \n\t" // 6 6
+ "VSHL.I32 q5, q13, #7 \n\t"
+ "ROR r7, r7, #20 \n\t" // 7 7
+ "VSHL.I32 q9, q14, #7 \n\t"
+ "ADD r2, r2, r6 \n\t" // 2 2 6
+ "VSRI.I32 q1, q12, #25 \n\t"
+ "ADD r3, r3, r7 \n\t" // 3 3 7
+ "VSRI.I32 q5, q13, #25 \n\t"
+ "EOR r12, r12, r2 \n\t" // 14 14 2
+ "VSRI.I32 q9, q14, #25 \n\t"
+
+ // EVEN ROUND
+
+ "EOR r11, r11, r3 \n\t" // 15 15 3
+ "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one
+ "ROR r12, r12, #24 \n\t" // 14 14
+ "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+ "ROR r11, r11, #24 \n\t" // 15 15
+ "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three
+
+ "ADD r8, r8, r12 \n\t" // 10 10 14
+ "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one
+ "ADD r9, r9, r11 \n\t" // 11 11 15
+ "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+ "EOR r6, r6, r8 \n\t" // 6 6 10
+ "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three
+
+ "EOR r7, r7, r9 \n\t" // 7 7 11
+ "VEXT.8 q9, q9, q9, #4 \n\t" // permute elements left by one
+ "ROR r6, r6, #25 \n\t" // 6 6
+ "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two
+ "ROR r7, r7, #25 \n\t" // 7 7
+ "VEXT.8 q11, q11, q11, #12 \n\t" // permute elements left by three
+
+ // 0, 5, 10, 15
+ // 1, 6, 11, 12
+
+ "ADD r0, r0, r5 \n\t" // 0 0 5
+ "VADD.I32 q0, q0, q1 \n\t"
+ "ADD r1, r1, r6 \n\t" // 1 1 6
+ "VADD.I32 q4, q4, q5 \n\t"
+ "EOR r11, r11, r0 \n\t" // 15 15 0
+ "VADD.I32 q8, q8, q9 \n\t"
+ "EOR r10, r10, r1 \n\t" // 12 12 1
+ "VEOR q12, q3, q0 \n\t"
+ "ROR r11, r11, #16 \n\t" // 15 15
+ "VEOR q13, q7, q4 \n\t"
+ "ROR r10, r10, #16 \n\t" // 12 12
+ "VEOR q14, q11, q8 \n\t"
+ "ADD r8, r8, r11 \n\t" // 10 10 15
+ // a rotation by 16 bits can be done by reversing the 16-bit halves of each 32-bit word
+ "VREV32.16 q3, q12 \n\t"
+ "ADD r9, r9, r10 \n\t" // 11 11 12
+ "VREV32.16 q7, q13 \n\t"
+ "EOR r5, r5, r8 \n\t" // 5 5 10
+ "VREV32.16 q11, q14 \n\t"
+
+ "EOR r6, r6, r9 \n\t" // 6 6 11
+ "VADD.I32 q2, q2, q3 \n\t"
+ "ROR r5, r5, #20 \n\t" // 5 5
+ "VADD.I32 q6, q6, q7 \n\t"
+ "ROR r6, r6, #20 \n\t" // 6 6
+ "VADD.I32 q10, q10, q11 \n\t"
+ "ADD r0, r0, r5 \n\t" // 0 0 5
+ "VEOR q12, q1, q2 \n\t"
+ "ADD r1, r1, r6 \n\t" // 1 1 6
+ "VEOR q13, q5, q6 \n\t"
+ "EOR r11, r11, r0 \n\t" // 15 15 0
+ "VEOR q14, q9, q10 \n\t"
+ "EOR r10, r10, r1 \n\t" // 12 12 1
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q12, #12 \n\t"
+ "ROR r11, r11, #24 \n\t" // 15 15
+ "VSHL.I32 q5, q13, #12 \n\t"
+ "ROR r10, r10, #24 \n\t" // 12 12
+ "VSHL.I32 q9, q14, #12 \n\t"
+ "ADD r8, r8, r11 \n\t" // 10 10 15
+ "VSRI.I32 q1, q12, #20 \n\t"
+ "STR r11, %[x_15] \n\t"
+ "VSRI.I32 q5, q13, #20 \n\t"
+ "LDR r11, %[x_13] \n\t"
+ "VSRI.I32 q9, q14, #20 \n\t"
+
+ "ADD r9, r9, r10 \n\t" // 11 11 12
+ "VADD.I32 q0, q0, q1 \n\t"
+ "EOR r5, r5, r8 \n\t" // 5 5 10
+ "VADD.I32 q4, q4, q5 \n\t"
+ "STR r8, %[x_10] \n\t"
+ "VADD.I32 q8, q8, q9 \n\t"
+ "LDR r8, %[x_8] \n\t"
+ "VEOR q12, q3, q0 \n\t"
+ "EOR r6, r6, r9 \n\t" // 6 6 11
+ "VEOR q13, q7, q4 \n\t"
+ "STR r9, %[x_11] \n\t"
+ "VEOR q14, q11, q8 \n\t"
+ "LDR r9, %[x_9] \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q3, q12, #8 \n\t"
+ "ROR r5, r5, #25 \n\t" // 5 5
+ "VSHL.I32 q7, q13, #8 \n\t"
+ "ROR r6, r6, #25 \n\t" // 6 6
+ "VSHL.I32 q11, q14, #8 \n\t"
+
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 8 9 12 13 14
+
+ // 2, 7, 8, 13
+ // 3, 4, 9, 14
+
+ "ADD r2, r2, r7 \n\t" // 2 2 7
+ "VSRI.I32 q3, q12, #24 \n\t"
+ "ADD r3, r3, r4 \n\t" // 3 3 4
+ "VSRI.I32 q7, q13, #24 \n\t"
+ "EOR r11, r11, r2 \n\t" // 13 13 2
+ "VSRI.I32 q11, q14, #24 \n\t"
+
+ "EOR r12, r12, r3 \n\t" // 14 14 3
+ "VADD.I32 q2, q2, q3 \n\t"
+ "ROR r11, r11, #16 \n\t" // 13 13
+ "VADD.I32 q6, q6, q7 \n\t"
+ "ROR r12, r12, #16 \n\t" // 14 14
+ "VADD.I32 q10, q10, q11 \n\t"
+ "ADD r8, r8, r11 \n\t" // 8 8 13
+ "VEOR q12, q1, q2 \n\t"
+ "ADD r9, r9, r12 \n\t" // 9 9 14
+ "VEOR q13, q5, q6 \n\t"
+ "EOR r7, r7, r8 \n\t" // 7 7 8
+ "VEOR q14, q9, q10 \n\t"
+ "EOR r4, r4, r9 \n\t" // 4 4 9
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q12, #7 \n\t"
+ "ROR r7, r7, #20 \n\t" // 7 7
+ "VSHL.I32 q5, q13, #7 \n\t"
+ "ROR r4, r4, #20 \n\t" // 4 4
+ "VSHL.I32 q9, q14, #7 \n\t"
+ "ADD r2, r2, r7 \n\t" // 2 2 7
+ "VSRI.I32 q1, q12, #25 \n\t"
+ "ADD r3, r3, r4 \n\t" // 3 3 4
+ "VSRI.I32 q5, q13, #25 \n\t"
+ "EOR r11, r11, r2 \n\t" // 13 13 2
+ "VSRI.I32 q9, q14, #25 \n\t"
+
+ "EOR r12, r12, r3 \n\t" // 14 14 3
+ "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three
+ "ROR r11, r11, #24 \n\t" // 13 13
+ "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+ "ROR r12, r12, #24 \n\t" // 14 14
+ "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one
+
+ "ADD r8, r8, r11 \n\t" // 8 8 13
+ "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three
+ "ADD r9, r9, r12 \n\t" // 9 9 14
+ "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+ "EOR r7, r7, r8 \n\t" // 7 7 8
+ "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one
+
+ "EOR r4, r4, r9 \n\t" // 4 4 9
+ "VEXT.8 q9, q9, q9, #12 \n\t" // permute elements left by three
+ "ROR r7, r7, #25 \n\t" // 7 7
+ "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two
+ "ROR r4, r4, #25 \n\t" // 4 4
+ "VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one
+
+ "BNE L_chacha20_arm32_256_loop_%= \n\t"
+
+ "LDR r14, %[x_addr] \n\t" // load address of x to r14
+ // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+ // 0 1 2 3 4 5 6 7 8 9 12 13 14
+ "ADD r10, r10, #3 \n\t" // add three here to make later NEON easier
+ "STM r14, { r0-r9 } \n\t"
+ "STRD r10, r11, [r14, #4*12] \n\t"
+ "LDR r9, %[input] \n\t" // load input address
+ "STR r12, [r14, #4*14] \n\t"
+ "LDR r10, %[c] \n\t" // load c address
+
+ "VLDM r9, { q12-q15 } \n\t"
+ "LDR r12, %[m] \n\t" // load m address
+
+ "VADD.I32 q0, q0, q12 \n\t"
+ "VADD.I32 q1, q1, q13 \n\t"
+ "VADD.I32 q2, q2, q14 \n\t"
+ "VADD.I32 q3, q3, q15 \n\t"
+
+ "VADD.I32 q4, q4, q12 \n\t"
+ "VADD.I32 q5, q5, q13 \n\t"
+ "VADD.I32 q6, q6, q14 \n\t"
+ "VADD.I32 q7, q7, q15 \n\t"
+
+ "MOV r11, #1 \n\t"
+
+ "VADD.I32 q8, q8, q12 \n\t"
+ "VMOV.I32 q12, #0 \n\t"
+ "VADD.I32 q9, q9, q13 \n\t"
+ "VMOV.I32 d24[0], r11 \n\t"
+ "VADD.I32 q10, q10, q14 \n\t"
+ "VADD.I32 q11, q11, q15 \n\t"
+
+ "VADD.I32 q11, q11, q12 \n\t" // add one to counter
+ "VADD.I32 q7, q7, q12 \n\t" // add one to counter
+ "VADD.I32 q11, q11, q12 \n\t" // add one to counter
+
+ "VLDM r12!, { q12-q15 } \n\t" // load m
+ "VEOR q0, q0, q12 \n\t"
+ "VEOR q1, q1, q13 \n\t"
+ "VEOR q2, q2, q14 \n\t"
+ "VEOR q3, q3, q15 \n\t"
+ "VSTM r10!, { q0-q3 } \n\t" // store to c
+
+ "VLDM r14, { q0-q3 } \n\t " // load final block from x
+
+ "VLDM r12!, { q12-q15 } \n\t" // load m
+ "VEOR q4, q4, q12 \n\t"
+ "VEOR q5, q5, q13 \n\t"
+ "VEOR q6, q6, q14 \n\t"
+ "VEOR q7, q7, q15 \n\t"
+ "VSTM r10!, { q4-q7 } \n\t" // store to c
+
+ "VLDM r9, { q4-q7 } \n\t" // load input
+
+ "VLDM r12!, { q12-q15 } \n\t" // load m
+ "VEOR q8, q8, q12 \n\t"
+ "VEOR q9, q9, q13 \n\t"
+ "VEOR q10, q10, q14 \n\t"
+ "VEOR q11, q11, q15 \n\t"
+ "VSTM r10!, { q8-q11 } \n\t" // store to c
+
+ "VLDM r12!, { q12-q15 } \n\t" // load m
+ "VADD.I32 q0, q0, q4 \n\t"
+ "VADD.I32 q1, q1, q5 \n\t"
+ "VADD.I32 q2, q2, q6 \n\t"
+ "VADD.I32 q3, q3, q7 \n\t" // three was added earlier
+ "VEOR q0, q0, q12 \n\t"
+ "VEOR q1, q1, q13 \n\t"
+ "VEOR q2, q2, q14 \n\t"
+ "VEOR q3, q3, q15 \n\t"
+ "VSTM r10!, { q0-q3 } \n\t" // store to c
+
+ : [c] "+m" (c),
+ [x_0] "=m" (x),
+ [x_8] "=m" (x[8]),
+ [x_9] "=m" (x[9]),
+ [x_10] "=m" (x[10]),
+ [x_11] "=m" (x[11]),
+ [x_13] "=m" (x[13]),
+ [x_15] "=m" (x[15])
+ : [rounds] "I" (ROUNDS/2), [input] "m" (input),
+ [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES),
+ [m] "m" (m), [x_addr] "m" (x_addr)
+ : "memory", "cc",
+ "r0", "r1", "r2", "r3",
+ "r4", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11", "r12", "r14",
+ "q0", "q1", "q2", "q3", "q4",
+ "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+
+#endif /* __aarch64__ */
+ return CHACHA_CHUNK_BYTES * 4;
+}
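+
+/* The SHL/SRI (and VSHL/VSRI) pairs above emulate the 32-bit rotate that
+ * NEON lacks: a left shift followed by a shift-right-and-insert of the
+ * remaining bits. Per-lane sketch, kept under #if 0: */
+#if 0
+static word32 neon_rotl_sketch(word32 v, unsigned n) /* 0 < n < 32 */
+{
+ word32 t = v << n; /* SHL/VSHL t, v, #n */
+ t |= v >> (32u - n); /* SRI/VSRI t, v, #(32 - n) */
+ return t;
+}
+#endif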
+
+
+static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c)
+{
+#ifdef CHACHA_TEST
+ printf("Entering wc_Chacha_encrypt_128\n");
+#endif /* CHACHA_TEST */
+
+#ifdef __aarch64__
+ __asm__ __volatile__ (
+ /* Load incrementer register to modify counter */
+ "LD1 {v22.16B}, [%[L_chacha20_neon_inc_first_word]] \n\t"
+ /* Load index look-up for rotating left 8 bits */
+ "LD1 {v23.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+ /* Load state to encrypt */
+ "LD1 {v18.4S-v21.4S}, [%[input]] \n\t"
+ /* Load message */
+ "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t"
+ /* Move state into vector registers (x3) */
+ "MOV v0.16B, v18.16B \n\t"
+ "MOV v1.16B, v19.16B \n\t"
+ "MOV v2.16B, v20.16B \n\t"
+ "MOV v3.16B, v21.16B \n\t"
+ "MOV v4.16B, v18.16B \n\t"
+ "MOV v5.16B, v19.16B \n\t"
+ "MOV v6.16B, v20.16B \n\t"
+ "MOV v7.16B, v21.16B \n\t"
+ /* Add counter word */
+ "ADD v7.4S, v7.4S, v22.4S \n\t"
+ /* Set number of odd+even rounds to perform */
+ "MOV w3, #10 \n\t"
+ "\n"
+ "L_chacha20_arm64_128_loop_%=: \n\t"
+ "SUBS w3, w3, #1 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ "REV32 v7.8H, v7.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SHL v5.4S, v13.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ "SRI v5.4S, v13.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v23.16B \n\t"
+ "TBL v7.16B, { v7.16B }, v23.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SHL v5.4S, v13.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "SRI v5.4S, v13.4S, #25 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v5.16B, v5.16B, v5.16B, #4 \n\t"
+ "EXT v6.16B, v6.16B, v6.16B, #8 \n\t"
+ "EXT v7.16B, v7.16B, v7.16B, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ "REV32 v7.8H, v7.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SHL v5.4S, v13.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ "SRI v5.4S, v13.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "ADD v4.4S, v4.4S, v5.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "EOR v7.16B, v7.16B, v4.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v23.16B \n\t"
+ "TBL v7.16B, { v7.16B }, v23.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "ADD v6.4S, v6.4S, v7.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "EOR v13.16B, v5.16B, v6.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SHL v5.4S, v13.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "SRI v5.4S, v13.4S, #25 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v5.16B, v5.16B, v5.16B, #12 \n\t"
+ "EXT v6.16B, v6.16B, v6.16B, #8 \n\t"
+ "EXT v7.16B, v7.16B, v7.16B, #4 \n\t"
+ "BNE L_chacha20_arm64_128_loop_%= \n\t"
+ /* Add back state, XOR in message and store (load next block) */
+ "ADD v0.4S, v0.4S, v18.4S \n\t"
+ "ADD v1.4S, v1.4S, v19.4S \n\t"
+ "ADD v2.4S, v2.4S, v20.4S \n\t"
+ "ADD v3.4S, v3.4S, v21.4S \n\t"
+ "EOR v0.16B, v0.16B, v14.16B \n\t"
+ "EOR v1.16B, v1.16B, v15.16B \n\t"
+ "EOR v2.16B, v2.16B, v16.16B \n\t"
+ "EOR v3.16B, v3.16B, v17.16B \n\t"
+ "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t"
+ "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t"
+ "ADD v21.4S, v21.4S, v22.4S \n\t"
+ "ADD v4.4S, v4.4S, v18.4S \n\t"
+ "ADD v5.4S, v5.4S, v19.4S \n\t"
+ "ADD v6.4S, v6.4S, v20.4S \n\t"
+ "ADD v7.4S, v7.4S, v21.4S \n\t"
+ "EOR v4.16B, v4.16B, v14.16B \n\t"
+ "EOR v5.16B, v5.16B, v15.16B \n\t"
+ "EOR v6.16B, v6.16B, v16.16B \n\t"
+ "EOR v7.16B, v7.16B, v17.16B \n\t"
+ "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+ : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c)
+ : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8),
+ [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+ : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21"
+ );
+#else
+ __asm__ __volatile__ (
+ "MOV r11, %[rounds] \n\t"
+ "MOV r12, #1 \n\t"
+ "VLDM %[input], { q0-q3 } \n\t"
+ "VMOV.I32 q8, #0 \n\t"
+ "VMOV q4, q0 \n\t"
+ "VMOV.I32 d16[0], r12 \n\t"
+ "VMOV q5, q1 \n\t"
+ "VMOV q6, q2 \n\t"
+ "VADD.I32 q7, q3, q8 \n\t" // add one to counter
+
+ // store input
+ "VMOV q10, q0 \n\t"
+ "VMOV q11, q1 \n\t"
+ "VMOV q12, q2 \n\t"
+ "VMOV q13, q3 \n\t"
+ "\n"
+ "L_chacha20_arm32_128_loop_%=: \n\t"
+ "SUBS r11, r11, #1 \n\t"
+
+ // ODD ROUND
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VADD.I32 q4, q4, q5 \n\t"
+ "VEOR q8, q3, q0 \n\t"
+ "VEOR q9, q7, q4 \n\t"
+ // a rotation by 16 bits can be done by reversing the 16-bit halves of each 32-bit word
+ "VREV32.16 q3, q8 \n\t"
+ "VREV32.16 q7, q9 \n\t"
+
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VADD.I32 q6, q6, q7 \n\t"
+ "VEOR q8, q1, q2 \n\t"
+ "VEOR q9, q5, q6 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q8, #12 \n\t"
+ "VSHL.I32 q5, q9, #12 \n\t"
+ "VSRI.I32 q1, q8, #20 \n\t"
+ "VSRI.I32 q5, q9, #20 \n\t"
+
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VADD.I32 q4, q4, q5 \n\t"
+ "VEOR q8, q3, q0 \n\t"
+ "VEOR q9, q7, q4 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q3, q8, #8 \n\t"
+ "VSHL.I32 q7, q9, #8 \n\t"
+ "VSRI.I32 q3, q8, #24 \n\t"
+ "VSRI.I32 q7, q9, #24 \n\t"
+
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VADD.I32 q6, q6, q7 \n\t"
+ "VEOR q8, q1, q2 \n\t"
+ "VEOR q9, q5, q6 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q8, #7 \n\t"
+ "VSHL.I32 q5, q9, #7 \n\t"
+ "VSRI.I32 q1, q8, #25 \n\t"
+ "VSRI.I32 q5, q9, #25 \n\t"
+
+ // EVEN ROUND
+
+ "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one
+ "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+ "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three
+
+ "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one
+ "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+ "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three
+
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VADD.I32 q4, q4, q5 \n\t"
+ "VEOR q8, q3, q0 \n\t"
+ "VEOR q9, q7, q4 \n\t"
+ // a rotation by 16 bits can be done by reversing the 16-bit halves of each 32-bit word
+ "VREV32.16 q3, q8 \n\t"
+ "VREV32.16 q7, q9 \n\t"
+
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VADD.I32 q6, q6, q7 \n\t"
+ "VEOR q8, q1, q2 \n\t"
+ "VEOR q9, q5, q6 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q8, #12 \n\t"
+ "VSHL.I32 q5, q9, #12 \n\t"
+ "VSRI.I32 q1, q8, #20 \n\t"
+ "VSRI.I32 q5, q9, #20 \n\t"
+
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VADD.I32 q4, q4, q5 \n\t"
+ "VEOR q8, q3, q0 \n\t"
+ "VEOR q9, q7, q4 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q3, q8, #8 \n\t"
+ "VSHL.I32 q7, q9, #8 \n\t"
+ "VSRI.I32 q3, q8, #24 \n\t"
+ "VSRI.I32 q7, q9, #24 \n\t"
+
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VADD.I32 q6, q6, q7 \n\t"
+ "VEOR q8, q1, q2 \n\t"
+ "VEOR q9, q5, q6 \n\t"
+ // SIMD instructions don't support rotation, so we emulate it with shifts and a helper register
+ "VSHL.I32 q1, q8, #7 \n\t"
+ "VSHL.I32 q5, q9, #7 \n\t"
+ "VSRI.I32 q1, q8, #25 \n\t"
+ "VSRI.I32 q5, q9, #25 \n\t"
+
+ "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three
+ "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+ "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one
+
+ "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three
+ "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+ "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one
+
+ "BNE L_chacha20_arm32_128_loop_%= \n\t"
+
+ "VMOV.I32 q8, #0 \n\t"
+ "VADD.I32 q0, q0, q10 \n\t"
+ "VADD.I32 q1, q1, q11 \n\t"
+ "VMOV.I32 d16[0], r12 \n\t"
+ "VADD.I32 q2, q2, q12 \n\t"
+ "VADD.I32 q3, q3, q13 \n\t"
+
+ "VADD.I32 q13, q13, q8 \n\t" // add one to counter
+
+ "VADD.I32 q4, q4, q10 \n\t"
+ "VADD.I32 q5, q5, q11 \n\t"
+ "VADD.I32 q6, q6, q12 \n\t"
+ "VADD.I32 q7, q7, q13 \n\t"
+
+ "VLDM %[m], { q8-q15 } \n\t"
+ "VEOR q0, q0, q8 \n\t"
+ "VEOR q1, q1, q9 \n\t"
+ "VEOR q2, q2, q10 \n\t"
+ "VEOR q3, q3, q11 \n\t"
+ "VEOR q4, q4, q12 \n\t"
+ "VEOR q5, q5, q13 \n\t"
+ "VEOR q6, q6, q14 \n\t"
+ "VEOR q7, q7, q15 \n\t"
+ "VSTM %[c], { q0-q7 } \n\t"
+
+ : [c] "+r" (c), [m] "+r" (m)
+ : [rounds] "I" (ROUNDS/2), [input] "r" (input),
+ [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES)
+ : "memory", "cc",
+ "r11", "r12",
+ "q0", "q1", "q2", "q3", "q4",
+ "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+#endif /* __aarch64__ */
+ return CHACHA_CHUNK_BYTES * 2;
+}
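+
+/* A sketch of how these fixed-size kernels are meant to be driven: consume
+ * as many 320- and 256-byte chunks as possible, then fall back to the
+ * smaller kernels, advancing the block counter (state word 12) after each
+ * call. This only illustrates the call pattern and is not the actual
+ * driver, which follows later in the file; kept under #if 0. */
+#if 0
+static void chacha_dispatch_sketch(ChaCha* ctx, const byte* m, byte* c,
+ word32 bytes)
+{
+ int done;
+#ifdef __aarch64__
+ if (bytes >= CHACHA_CHUNK_BYTES * 5) {
+ word32 chunk = bytes - (bytes % (CHACHA_CHUNK_BYTES * 5));
+ wc_Chacha_encrypt_320(ctx->X, m, c, chunk);
+ ctx->X[12] += chunk / CHACHA_CHUNK_BYTES; /* block counter word */
+ m += chunk; c += chunk; bytes -= chunk;
+ }
+#endif
+ while (bytes >= CHACHA_CHUNK_BYTES * 4) {
+ done = wc_Chacha_encrypt_256(ctx->X, m, c);
+ ctx->X[12] += (word32)done / CHACHA_CHUNK_BYTES;
+ m += done; c += done; bytes -= (word32)done;
+ }
+ if (bytes >= CHACHA_CHUNK_BYTES * 2) {
+ done = wc_Chacha_encrypt_128(ctx->X, m, c);
+ ctx->X[12] += (word32)done / CHACHA_CHUNK_BYTES;
+ m += done; c += done; bytes -= (word32)done;
+ }
+ if (bytes > 0)
+ wc_Chacha_encrypt_64(ctx->X, m, c, bytes); /* defined below */
+}
+#endif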
+
+static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
+ byte* c, word32 bytes)
+{
+#ifdef CHACHA_TEST
+ printf("Entering wc_Chacha_encrypt_64 with %d bytes\n", bytes);
+#endif /* CHACHA_TEST */
+
+#ifdef __aarch64__
+ word64 bytes64 = (word64) bytes;
+ __asm__ __volatile__ (
+ /* Load index look-up for rotating left 8 bits */
+ "LD1 {v13.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+ "LD1 {v14.4S}, [%[L_chacha20_neon_inc_first_word]] \n\t"
+ /* Load state to encrypt */
+ "LD1 {v8.4S-v11.4S}, [%[input]] \n\t"
+ "\n"
+ "L_chacha20_arm64_64_loop_%=: \n\t"
+ /* Move state into vector registers (x3) */
+ "MOV v0.16B, v8.16B \n\t"
+ "MOV v1.16B, v9.16B \n\t"
+ "MOV v2.16B, v10.16B \n\t"
+ "MOV v3.16B, v11.16B \n\t"
+ /* Add counter word */
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #12 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #4 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "REV32 v3.8H, v3.8H \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #12 \n\t"
+ "SRI v1.4S, v12.4S, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "ADD v0.4S, v0.4S, v1.4S \n\t"
+ "EOR v3.16B, v3.16B, v0.16B \n\t"
+ "TBL v3.16B, { v3.16B }, v13.16B \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "ADD v2.4S, v2.4S, v3.4S \n\t"
+ "EOR v12.16B, v1.16B, v2.16B \n\t"
+ "SHL v1.4S, v12.4S, #7 \n\t"
+ "SRI v1.4S, v12.4S, #25 \n\t"
+ "EXT v3.16B, v3.16B, v3.16B, #4 \n\t"
+ "EXT v1.16B, v1.16B, v1.16B, #12 \n\t"
+ "EXT v2.16B, v2.16B, v2.16B, #8 \n\t"
+ /* Add back state */
+ "ADD v0.4S, v0.4S, v8.4S \n\t"
+ "ADD v1.4S, v1.4S, v9.4S \n\t"
+ "ADD v2.4S, v2.4S, v10.4S \n\t"
+ "ADD v3.4S, v3.4S, v11.4S \n\t"
+ "CMP %[bytes], #64 \n\t"
+ "BLT L_chacha20_arm64_64_lt_64_%= \n\t"
+ "LD1 {v4.4S-v7.4S}, [%[m]], #64 \n\t"
+ "EOR v4.16B, v4.16B, v0.16B \n\t"
+ "EOR v5.16B, v5.16B, v1.16B \n\t"
+ "EOR v6.16B, v6.16B, v2.16B \n\t"
+ "EOR v7.16B, v7.16B, v3.16B \n\t"
+ "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+ "SUBS %[bytes], %[bytes], #64 \n\t"
+ "ADD v11.4S, v11.4S, v14.4S \n\t"
+ "BNE L_chacha20_arm64_64_loop_%= \n\t"
+ "B L_chacha20_arm64_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm64_64_lt_64_%=: \n\t"
+ "CMP %[bytes], #32 \n\t"
+ "BLT L_chacha20_arm64_64_lt_32_%= \n\t"
+ "LD1 {v4.4S, v5.4S}, [%[m]], #32 \n\t"
+ "EOR v4.16B, v4.16B, v0.16B \n\t"
+ "EOR v5.16B, v5.16B, v1.16B \n\t"
+ "ST1 {v4.4S, v5.4S}, [%[c]], #32 \n\t"
+ "SUBS %[bytes], %[bytes], #32 \n\t"
+ "MOV v0.16B, v2.16B \n\t"
+ "MOV v1.16B, v3.16B \n\t"
+ "BEQ L_chacha20_arm64_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm64_64_lt_32_%=: \n\t"
+ "CMP %[bytes], #16 \n\t"
+ "BLT L_chacha20_arm64_64_lt_16_%= \n\t"
+ "LD1 {v4.4S}, [%[m]], #16 \n\t"
+ "EOR v4.16B, v4.16B, v0.16B \n\t"
+ "ST1 {v4.4S}, [%[c]], #16 \n\t"
+ "SUBS %[bytes], %[bytes], #16 \n\t"
+ "MOV v0.16B, v1.16B \n\t"
+ "BEQ L_chacha20_arm64_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm64_64_lt_16_%=: \n\t"
+ "CMP %[bytes], #8 \n\t"
+ "BLT L_chacha20_arm64_64_lt_8_%= \n\t"
+ "LD1 {v4.2S}, [%[m]], #8 \n\t"
+ "EOR v4.8B, v4.8B, v0.8B \n\t"
+ "ST1 {v4.2S}, [%[c]], #8 \n\t"
+ "SUBS %[bytes], %[bytes], #8 \n\t"
+ "MOV v0.D[0], v0.D[1] \n\t"
+ "BEQ L_chacha20_arm64_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm64_64_lt_8_%=: \n\t"
+ "MOV x4, v0.D[0] \n\t"
+        "LSL x5, %[bytes], #3 \n\t"
+        /* Step to the last valid index: the loop below runs from index
+         * bytes-1 down to 0, so no byte past the end is read or written. */
+        "SUB %[bytes], %[bytes], #1 \n\t"
+        "SUB x5, x5, #8 \n\t"
+ "\n"
+ "L_chacha20_arm64_64_loop_lt_8_%=: \n\t"
+ "LDRB w6, [%[m], %[bytes]] \n\t"
+ "ROR x7, x4, x5 \n\t"
+ "EOR w6, w6, w7 \n\t"
+ "STRB w6, [%[c], %[bytes]] \n\t"
+ "SUBS %[bytes], %[bytes], #1 \n\t"
+ "SUB x5, x5, #8 \n\t"
+ "BGE L_chacha20_arm64_64_loop_lt_8_%= \n\t"
+ "\n"
+ "L_chacha20_arm64_64_done_%=: \n\t"
+ : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64)
+ : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8),
+ [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+        : "memory", "cc", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3",
+          "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+          "v14" /* v12-v14 and the flags are written inside the block */
+ );
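+    /* Rotation idioms in the AArch64 block above: REV32 .8H swaps the
+     * 16-bit halves of each 32-bit lane (a rotate by 16), TBL with the
+     * L_chacha20_neon_rol8 byte table rotates each lane by 8, and the
+     * SHL/SRI pairs build the 12- and 7-bit rotates. */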
+#else
+ __asm__ __volatile__ (
+ /* Get the input state */
+ "VLDM %[input], { q8-q11 } \n\t"
+ /* Get the incrementer register */
+ "VLDM %[L_chacha20_neon_inc_first_word], { q14 } \n\t"
+ "\n"
+ "L_chacha20_arm32_64_outer_loop_%=: \n\t"
+ /* Copy over the input state */
+ "VMOV q0, q8 \n\t"
+ "VMOV q1, q9 \n\t"
+ "VMOV q2, q10 \n\t"
+ "VMOV q3, q11 \n\t"
+ /* Compute quarter rounds */
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Odd Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Odd->Even */
+ "VEXT.8 q1, q1, q1, #4 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #12 \n\t"
+ /* Even Round */
+ /* a += b; d ^= a; d <<<= 16; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VREV32.16 q3, q4 \n\t"
+ /* c += d; b ^= c; b <<<= 12; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #12 \n\t"
+ "VSRI.I32 q1, q4, #20 \n\t"
+ /* a += b; d ^= a; d <<<= 8; */
+ "VADD.I32 q0, q0, q1 \n\t"
+ "VEOR q4, q3, q0 \n\t"
+ "VSHL.I32 q3, q4, #8 \n\t"
+ "VSRI.I32 q3, q4, #24 \n\t"
+ /* c += d; b ^= c; b <<<= 7; */
+ "VADD.I32 q2, q2, q3 \n\t"
+ "VEOR q4, q1, q2 \n\t"
+ "VSHL.I32 q1, q4, #7 \n\t"
+ "VSRI.I32 q1, q4, #25 \n\t"
+ /* Permute Even->Odd */
+ "VEXT.8 q1, q1, q1, #12 \n\t"
+ "VEXT.8 q2, q2, q2, #8 \n\t"
+ "VEXT.8 q3, q3, q3, #4 \n\t"
+ /* Add back state */
+ "VADD.I32 q0, q0, q8 \n\t"
+ "VADD.I32 q1, q1, q9 \n\t"
+ "VADD.I32 q2, q2, q10 \n\t"
+ "VADD.I32 q3, q3, q11 \n\t"
+ "CMP %[bytes], #64 \n\t"
+ "BLT L_chacha20_arm32_64_lt_64_%= \n\t"
+ /* XOR full 64 byte block */
+ "VLDM %[m], { q4-q7 } \n\t"
+ "ADD %[m], %[m], #64 \n\t"
+ "VEOR q0, q0, q4 \n\t"
+ "VEOR q1, q1, q5 \n\t"
+ "VEOR q2, q2, q6 \n\t"
+ "VEOR q3, q3, q7 \n\t"
+ "VSTM %[c], { q0-q3 } \n\t"
+ "ADD %[c], %[c], #64 \n\t"
+ "SUBS %[bytes], %[bytes], #64 \n\t"
+ "VADD.I32 q11, q11, q14 \n\t"
+ "BNE L_chacha20_arm32_64_outer_loop_%= \n\t"
+ "B L_chacha20_arm32_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_64_%=: \n\t"
+ /* XOR 32 bytes */
+ "CMP %[bytes], #32 \n\t"
+ "BLT L_chacha20_arm32_64_lt_32_%= \n\t"
+ "VLDM %[m], { q4-q5 } \n\t"
+ "ADD %[m], %[m], #32 \n\t"
+ "VEOR q4, q4, q0 \n\t"
+ "VEOR q5, q5, q1 \n\t"
+ "VSTM %[c], { q4-q5 } \n\t"
+ "ADD %[c], %[c], #32 \n\t"
+ "SUBS %[bytes], %[bytes], #32 \n\t"
+ "VMOV q0, q2 \n\t"
+ "VMOV q1, q3 \n\t"
+ "BEQ L_chacha20_arm32_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_32_%=: \n\t"
+ /* XOR 16 bytes */
+ "CMP %[bytes], #16 \n\t"
+ "BLT L_chacha20_arm32_64_lt_16_%= \n\t"
+ "VLDM %[m], { q4 } \n\t"
+ "ADD %[m], %[m], #16 \n\t"
+ "VEOR q4, q4, q0 \n\t"
+ "VSTM %[c], { q4 } \n\t"
+ "ADD %[c], %[c], #16 \n\t"
+ "SUBS %[bytes], %[bytes], #16 \n\t"
+ "VMOV q0, q1 \n\t"
+ "BEQ L_chacha20_arm32_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_16_%=: \n\t"
+ /* XOR 8 bytes */
+ "CMP %[bytes], #8 \n\t"
+ "BLT L_chacha20_arm32_64_lt_8_%= \n\t"
+ "VLDR d8, [%[m], #0] \n\t"
+ "ADD %[m], %[m], #8 \n\t"
+ "VEOR d8, d8, d0 \n\t"
+ "VSTR d8, [%[c], #0] \n\t"
+ "ADD %[c], %[c], #8 \n\t"
+ "SUBS %[bytes], %[bytes], #8 \n\t"
+ "VMOV d0, d1 \n\t"
+ "BEQ L_chacha20_arm32_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_8_%=: \n\t"
+ /* XOR 4 bytes */
+ "CMP %[bytes], #4 \n\t"
+ "BLT L_chacha20_arm32_64_lt_4_%= \n\t"
+ "LDR r12, [%[m]], #4 \n\t"
+ "VMOV r14, d0[0] \n\t"
+ "EOR r12, r12, r14 \n\t"
+ "STR r12, [%[c]], #4 \n\t"
+ "SUBS %[bytes], %[bytes], #4 \n\t"
+ "VTRN.32 d0, d0 \n\t"
+ "BEQ L_chacha20_arm32_64_done_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_4_%=: \n\t"
+ /* XOR remaining bytes */
+ "VMOV r14, d0[0] \n\t"
+ "\n"
+ "L_chacha20_arm32_64_lt_4_loop_%=: \n\t"
+ "LDRB r12, [%[m]], #1 \n\t"
+ "EOR r12, r12, r14 \n\t"
+ "STRB r12, [%[c]], #1 \n\t"
+ "SUBS %[bytes], %[bytes], #1 \n\t"
+ "LSR r14, r14, #8 \n\t"
+ "BGT L_chacha20_arm32_64_lt_4_loop_%= \n\t"
+ "\n"
+ "L_chacha20_arm32_64_done_%=: \n\t"
+ : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes)
+ : [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+ : "memory", "cc",
+ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q14", "r12", "r14"
+ );
+#endif /* __aarch64__ */
+}
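+
+/* For reference, the scalar quarter round that the NEON code above applies
+ * to four columns (or diagonals) at once. A minimal sketch, kept out of the
+ * build with #if 0; the function name and ROTL32 macro are illustrative
+ * only. */
+#if 0
+#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+static void chacha_quarter_round_ref(word32* a, word32* b, word32* c,
+                                     word32* d)
+{
+    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
+    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
+    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
+    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
+}
+#endif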
+
+/**
+ * Encrypt/decrypt a stream of bytes by XORing it with the ChaCha20 key
+ * stream, dispatching to the widest NEON block routine the remaining
+ * length allows.
+ */
+static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
+ word32 bytes)
+{
+ int processed;
+
+#ifdef __aarch64__
+ if (bytes >= CHACHA_CHUNK_BYTES * 5) {
+ processed = (bytes / (CHACHA_CHUNK_BYTES * 5)) * CHACHA_CHUNK_BYTES * 5;
+ wc_Chacha_encrypt_320(ctx->X, m, c, processed);
+
+ bytes -= processed;
+ c += processed;
+ m += processed;
+ ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+ }
+ if (bytes >= CHACHA_CHUNK_BYTES * 4) {
+#else
+ while (bytes >= CHACHA_CHUNK_BYTES * 4) {
+#endif /*__aarch64__ */
+ processed = wc_Chacha_encrypt_256(ctx->X, m, c);
+
+ bytes -= processed;
+ c += processed;
+ m += processed;
+ ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+ }
+ if (bytes >= CHACHA_CHUNK_BYTES * 2) {
+ processed = wc_Chacha_encrypt_128(ctx->X, m, c);
+
+ bytes -= processed;
+ c += processed;
+ m += processed;
+ ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+ }
+ if (bytes > 0) {
+ wc_Chacha_encrypt_64(ctx->X, m, c, bytes);
+ if (bytes > 64)
+ ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
+ ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
+ }
+}
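+
+/* Worked example of the dispatch above (aarch64, 400-byte message):
+ * 400 >= 320 so wc_Chacha_encrypt_320() handles 320 bytes and the counter
+ * advances by 5; the remaining 80 bytes fall below the 256- and 128-byte
+ * thresholds, so wc_Chacha_encrypt_64() finishes, and since 80 > 64 the
+ * counter is incremented twice -- once per 64-byte keystream block
+ * (7 increments in total, matching ceil(400/64)). */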
+
+/**
+ * API to encrypt/decrypt a message of any size. ChaCha is a stream
+ * cipher, so the same XOR operation performs both directions.
+ */
+int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
+ word32 msglen)
+{
+ if (ctx == NULL || output == NULL || input == NULL)
+ return BAD_FUNC_ARG;
+
+ wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
+
+ return 0;
+}
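+
+/* Typical usage, as a sketch (wc_Chacha_SetKey() and wc_Chacha_SetIV() are
+ * declared in wolfssl/wolfcrypt/chacha.h; the buffer names and MSG_LEN here
+ * are illustrative only):
+ *
+ *     ChaCha ctx;
+ *     byte   ct[MSG_LEN];
+ *     int    ret;
+ *     if (wc_Chacha_SetKey(&ctx, key, 32) == 0 &&
+ *         wc_Chacha_SetIV(&ctx, iv, 0) == 0) {
+ *         ret = wc_Chacha_Process(&ctx, ct, msg, MSG_LEN);
+ *     }
+ *
+ * Decryption is the same call with the ciphertext as input. */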
+
+#endif /* HAVE_CHACHA */
+#endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S
new file mode 100644
index 0000000..891c6d8
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -0,0 +1,6715 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
+ */
+#ifdef __aarch64__
+ .text
+ .align 2
+ .globl fe_init
+ .type fe_init, %function
+fe_init:
+ ret
+ .size fe_init,.-fe_init
+ .text
+ .align 2
+ .globl fe_frombytes
+ .type fe_frombytes, %function
+fe_frombytes:
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ and x5, x5, #0x7fffffffffffffff
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_frombytes,.-fe_frombytes
+ .text
+ .align 2
+ .globl fe_tobytes
+ .type fe_tobytes, %function
+fe_tobytes:
+ mov x7, #19
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ adds x6, x2, x7
+ adcs x6, x3, xzr
+ adcs x6, x4, xzr
+ adc x6, x5, xzr
+ and x6, x7, x6, asr 63
+ adds x2, x2, x6
+ adcs x3, x3, xzr
+ adcs x4, x4, xzr
+ adc x5, x5, xzr
+ and x5, x5, #0x7fffffffffffffff
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_tobytes,.-fe_tobytes
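+    # Reduction sketch: with p = 2^255 - 19, bit 255 of (h + 19) is set
+    # exactly when h >= p. The asr-63 mask above therefore adds 19 only in
+    # that case, and clearing bit 255 yields h + 19 - 2^255 = h - p, the
+    # canonical representative in [0, p).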
+ .text
+ .align 2
+ .globl fe_1
+ .type fe_1, %function
+fe_1:
+ # Set one
+ mov x1, #1
+ stp x1, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ ret
+ .size fe_1,.-fe_1
+ .text
+ .align 2
+ .globl fe_0
+ .type fe_0, %function
+fe_0:
+ # Set zero
+ stp xzr, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ ret
+ .size fe_0,.-fe_0
+ .text
+ .align 2
+ .globl fe_copy
+ .type fe_copy, %function
+fe_copy:
+ # Copy
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ stp x2, x3, [x0]
+ stp x4, x5, [x0, #16]
+ ret
+ .size fe_copy,.-fe_copy
+ .text
+ .align 2
+ .globl fe_sub
+ .type fe_sub, %function
+fe_sub:
+ # Sub
+ ldp x3, x4, [x1]
+ ldp x5, x6, [x1, #16]
+ ldp x7, x8, [x2]
+ ldp x9, x10, [x2, #16]
+ subs x3, x3, x7
+ sbcs x4, x4, x8
+ sbcs x5, x5, x9
+ sbcs x6, x6, x10
+ mov x12, #-19
+ csetm x11, cc
+ # Mask the modulus
+ and x12, x11, x12
+ and x13, x11, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x3, x3, x12
+ adcs x4, x4, x11
+ adcs x5, x5, x11
+ adc x6, x6, x13
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ret
+ .size fe_sub,.-fe_sub
+ .text
+ .align 2
+ .globl fe_add
+ .type fe_add, %function
+fe_add:
+ # Add
+ ldp x3, x4, [x1]
+ ldp x5, x6, [x1, #16]
+ ldp x7, x8, [x2]
+ ldp x9, x10, [x2, #16]
+ adds x3, x3, x7
+ adcs x4, x4, x8
+ adcs x5, x5, x9
+ adc x6, x6, x10
+ mov x12, #-19
+ asr x11, x6, #63
+ # Mask the modulus
+ and x12, x11, x12
+ and x13, x11, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x3, x3, x12
+ sbcs x4, x4, x11
+ sbcs x5, x5, x11
+ sbc x6, x6, x13
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ret
+ .size fe_add,.-fe_add
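+    # In fe_add and fe_sub above, the masked limbs form exactly
+    # p = 2^255 - 19 = {0xffffffffffffffed, 0xffffffffffffffff,
+    #                   0xffffffffffffffff, 0x7fffffffffffffff}:
+    # fe_add subtracts p when the sum sets bit 255 and fe_sub adds p back
+    # when the subtraction borrows, keeping results in range without a
+    # branch on secret data.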
+ .text
+ .align 2
+ .globl fe_neg
+ .type fe_neg, %function
+fe_neg:
+ ldp x2, x3, [x1]
+ ldp x4, x5, [x1, #16]
+ mov x6, #-19
+ mov x7, #-1
+ mov x8, #-1
+ mov x9, #0x7fffffffffffffff
+ subs x6, x6, x2
+ sbcs x7, x7, x3
+ sbcs x8, x8, x4
+ sbc x9, x9, x5
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ ret
+ .size fe_neg,.-fe_neg
+ .text
+ .align 2
+ .globl fe_isnonzero
+ .type fe_isnonzero, %function
+fe_isnonzero:
+ mov x6, #19
+ ldp x1, x2, [x0]
+ ldp x3, x4, [x0, #16]
+ adds x5, x1, x6
+ adcs x5, x2, xzr
+ adcs x5, x3, xzr
+ adc x5, x4, xzr
+ and x5, x6, x5, asr 63
+ adds x1, x1, x5
+ adcs x2, x2, xzr
+ adcs x3, x3, xzr
+ adc x4, x4, xzr
+ and x4, x4, #0x7fffffffffffffff
+ orr x0, x1, x2
+ orr x3, x3, x4
+ orr x0, x0, x3
+ ret
+ .size fe_isnonzero,.-fe_isnonzero
+ .text
+ .align 2
+ .globl fe_isnegative
+ .type fe_isnegative, %function
+fe_isnegative:
+ mov x6, #19
+ ldp x1, x2, [x0]
+ ldp x3, x4, [x0, #16]
+ adds x5, x1, x6
+ adcs x5, x2, xzr
+ adcs x5, x3, xzr
+ adc x5, x4, xzr
+ and x0, x1, #1
+ eor x0, x0, x5, lsr 63
+ ret
+ .size fe_isnegative,.-fe_isnegative
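+    # The sign of a field element is the low bit of its canonical form.
+    # Instead of fully reducing, fe_isnegative XORs the low bit of h with
+    # bit 255 of (h + 19): that bit is set exactly when h >= p, and
+    # subtracting the odd modulus p flips the parity.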
+ .text
+ .align 2
+ .globl fe_cmov_table
+ .type fe_cmov_table, %function
+fe_cmov_table:
+ stp x29, x30, [sp, #-128]!
+ add x29, sp, #0
+ str x17, [x29, #40]
+ str x19, [x29, #48]
+ stp x20, x21, [x29, #56]
+ stp x22, x23, [x29, #72]
+ stp x24, x25, [x29, #88]
+ stp x26, x27, [x29, #104]
+ str x28, [x29, #120]
+ str x0, [x29, #16]
+ sxtb x2, w2
+ sbfx x3, x2, #7, #1
+ eor x0, x2, x3
+ sub x0, x0, x3
+ mov x4, #1
+ mov x5, xzr
+ mov x6, xzr
+ mov x7, xzr
+ mov x8, #1
+ mov x9, xzr
+ mov x10, xzr
+ mov x11, xzr
+ mov x12, xzr
+ mov x13, xzr
+ mov x14, xzr
+ mov x15, xzr
+ cmp x0, #1
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x1, #32]
+ ldp x23, x24, [x1, #48]
+ ldp x25, x26, [x1, #64]
+ ldp x27, x28, [x1, #80]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #2
+ ldp x16, x17, [x1, #96]
+ ldp x19, x20, [x1, #112]
+ ldp x21, x22, [x1, #128]
+ ldp x23, x24, [x1, #144]
+ ldp x25, x26, [x1, #160]
+ ldp x27, x28, [x1, #176]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #3
+ ldp x16, x17, [x1, #192]
+ ldp x19, x20, [x1, #208]
+ ldp x21, x22, [x1, #224]
+ ldp x23, x24, [x1, #240]
+ ldp x25, x26, [x1, #256]
+ ldp x27, x28, [x1, #272]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #4
+ ldp x16, x17, [x1, #288]
+ ldp x19, x20, [x1, #304]
+ ldp x21, x22, [x1, #320]
+ ldp x23, x24, [x1, #336]
+ ldp x25, x26, [x1, #352]
+ ldp x27, x28, [x1, #368]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ add x1, x1, #0x180
+ cmp x0, #5
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x1, #32]
+ ldp x23, x24, [x1, #48]
+ ldp x25, x26, [x1, #64]
+ ldp x27, x28, [x1, #80]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #6
+ ldp x16, x17, [x1, #96]
+ ldp x19, x20, [x1, #112]
+ ldp x21, x22, [x1, #128]
+ ldp x23, x24, [x1, #144]
+ ldp x25, x26, [x1, #160]
+ ldp x27, x28, [x1, #176]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #7
+ ldp x16, x17, [x1, #192]
+ ldp x19, x20, [x1, #208]
+ ldp x21, x22, [x1, #224]
+ ldp x23, x24, [x1, #240]
+ ldp x25, x26, [x1, #256]
+ ldp x27, x28, [x1, #272]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ cmp x0, #8
+ ldp x16, x17, [x1, #288]
+ ldp x19, x20, [x1, #304]
+ ldp x21, x22, [x1, #320]
+ ldp x23, x24, [x1, #336]
+ ldp x25, x26, [x1, #352]
+ ldp x27, x28, [x1, #368]
+ csel x4, x16, x4, eq
+ csel x5, x17, x5, eq
+ csel x6, x19, x6, eq
+ csel x7, x20, x7, eq
+ csel x8, x21, x8, eq
+ csel x9, x22, x9, eq
+ csel x10, x23, x10, eq
+ csel x11, x24, x11, eq
+ csel x12, x25, x12, eq
+ csel x13, x26, x13, eq
+ csel x14, x27, x14, eq
+ csel x15, x28, x15, eq
+ mov x16, #-19
+ mov x17, #-1
+ mov x19, #-1
+ mov x20, #0x7fffffffffffffff
+ subs x16, x16, x12
+ sbcs x17, x17, x13
+ sbcs x19, x19, x14
+ sbc x20, x20, x15
+ cmp x2, #0
+ mov x3, x4
+ csel x4, x8, x4, lt
+ csel x8, x3, x8, lt
+ mov x3, x5
+ csel x5, x9, x5, lt
+ csel x9, x3, x9, lt
+ mov x3, x6
+ csel x6, x10, x6, lt
+ csel x10, x3, x10, lt
+ mov x3, x7
+ csel x7, x11, x7, lt
+ csel x11, x3, x11, lt
+ csel x12, x16, x12, lt
+ csel x13, x17, x13, lt
+ csel x14, x19, x14, lt
+ csel x15, x20, x15, lt
+ ldr x0, [x29, #16]
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ stp x8, x9, [x0, #32]
+ stp x10, x11, [x0, #48]
+ stp x12, x13, [x0, #64]
+ stp x14, x15, [x0, #80]
+ ldr x17, [x29, #40]
+ ldr x19, [x29, #48]
+ ldp x20, x21, [x29, #56]
+ ldp x22, x23, [x29, #72]
+ ldp x24, x25, [x29, #88]
+ ldp x26, x27, [x29, #104]
+ ldr x28, [x29, #120]
+ ldp x29, x30, [sp], #0x80
+ ret
+ .size fe_cmov_table,.-fe_cmov_table
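+    # fe_cmov_table is a constant-time lookup: all eight precomputed entries
+    # are loaded and csel keeps only the one matching |b|, so the access
+    # pattern is independent of the secret index. For negative b the entry
+    # is negated by swapping its first two field elements and replacing the
+    # third with p - t (the subtraction just before the final csel block).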
+ .text
+ .align 2
+ .globl fe_mul
+ .type fe_mul, %function
+fe_mul:
+ stp x29, x30, [sp, #-64]!
+ add x29, sp, #0
+ str x17, [x29, #24]
+ str x19, [x29, #32]
+ stp x20, x21, [x29, #40]
+ str x22, [x29, #56]
+ # Multiply
+ ldp x14, x15, [x1]
+ ldp x16, x17, [x1, #16]
+ ldp x19, x20, [x2]
+ ldp x21, x22, [x2, #16]
+ # A[0] * B[0]
+ mul x6, x14, x19
+ umulh x7, x14, x19
+ # A[0] * B[1]
+ mul x3, x14, x20
+ umulh x8, x14, x20
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x19
+ umulh x4, x15, x19
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x21
+ umulh x4, x14, x21
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x20
+ umulh x4, x15, x20
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x10, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x19
+ umulh x4, x16, x19
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x10, x10, xzr
+ # A[0] * B[3]
+ mul x3, x14, x22
+ umulh x4, x14, x22
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x21
+ umulh x4, x15, x21
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[2] * B[1]
+ mul x3, x16, x20
+ umulh x4, x16, x20
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[3] * B[0]
+ mul x3, x17, x19
+ umulh x4, x17, x19
+ adds x9, x9, x3
+ adcs x10, x10, x4
+ adc x11, x11, xzr
+ # A[1] * B[3]
+ mul x3, x15, x22
+ umulh x4, x15, x22
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x21
+ umulh x4, x16, x21
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, x12, xzr
+ # A[3] * B[1]
+ mul x3, x17, x20
+ umulh x4, x17, x20
+ adds x10, x10, x3
+ adcs x11, x11, x4
+ adc x12, x12, xzr
+ # A[2] * B[3]
+ mul x3, x16, x22
+ umulh x4, x16, x22
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x21
+ umulh x4, x17, x21
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, x13, xzr
+ # A[3] * B[3]
+ mul x3, x17, x22
+ umulh x4, x17, x22
+ adds x12, x12, x3
+ adc x13, x13, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x10
+ umulh x10, x3, x10
+ adds x6, x6, x4
+ mul x4, x3, x11
+ umulh x11, x3, x11
+ adcs x7, x7, x4
+ mul x4, x3, x12
+ umulh x12, x3, x12
+ adcs x8, x8, x4
+ mul x4, x3, x13
+ umulh x5, x3, x13
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x10
+ adcs x8, x8, x11
+ adcs x9, x9, x12
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ ldr x17, [x29, #24]
+ ldr x19, [x29, #32]
+ ldp x20, x21, [x29, #40]
+ ldr x22, [x29, #56]
+ ldp x29, x30, [sp], #0x40
+ ret
+ .size fe_mul,.-fe_mul
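+    # Reduction used by fe_mul: the 512-bit product h = lo + 2^255*hi is
+    # folded with the congruence 2^255 == 19 (mod p), giving
+    # h == lo + 19*hi (mod p). The extr instructions shift the top half
+    # down, the "multiply top half by 19" block folds it in, and the two
+    # trailing carry fix-ups fold any remaining bit 255 the same way.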
+ .text
+ .align 2
+ .globl fe_sq
+ .type fe_sq, %function
+fe_sq:
+ # Square
+ ldp x13, x14, [x1]
+ ldp x15, x16, [x1, #16]
+ # A[0] * A[1]
+ mul x6, x13, x14
+ umulh x7, x13, x14
+ # A[0] * A[2]
+ mul x2, x13, x15
+ umulh x8, x13, x15
+ adds x7, x7, x2
+ adc x8, x8, xzr
+ # A[0] * A[3]
+ mul x2, x13, x16
+ umulh x9, x13, x16
+ adds x8, x8, x2
+ adc x9, x9, xzr
+ # A[1] * A[2]
+ mul x2, x14, x15
+ umulh x3, x14, x15
+ adds x8, x8, x2
+ adcs x9, x9, x3
+ adc x10, xzr, xzr
+ # A[1] * A[3]
+ mul x2, x14, x16
+ umulh x3, x14, x16
+ adds x9, x9, x2
+ adc x10, x10, x3
+ # A[2] * A[3]
+ mul x2, x15, x16
+ umulh x11, x15, x16
+ adds x10, x10, x2
+ adc x11, x11, xzr
+ # Double
+ adds x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adcs x11, x11, x11
+ adc x12, xzr, xzr
+ # A[0] * A[0]
+ mul x5, x13, x13
+ umulh x4, x13, x13
+ # A[1] * A[1]
+ mul x2, x14, x14
+ umulh x3, x14, x14
+ adds x6, x6, x4
+ adcs x7, x7, x2
+ adc x4, x3, xzr
+ # A[2] * A[2]
+ mul x2, x15, x15
+ umulh x3, x15, x15
+ adds x8, x8, x4
+ adcs x9, x9, x2
+ adc x4, x3, xzr
+ # A[3] * A[3]
+ mul x2, x16, x16
+ umulh x3, x16, x16
+ adds x10, x10, x4
+ adcs x11, x11, x2
+ adc x12, x12, x3
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x12, x12, x11, #63
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ and x8, x8, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x2, #19
+ mul x3, x2, x9
+ umulh x9, x2, x9
+ adds x5, x5, x3
+ mul x3, x2, x10
+ umulh x10, x2, x10
+ adcs x6, x6, x3
+ mul x3, x2, x11
+ umulh x11, x2, x11
+ adcs x7, x7, x3
+ mul x3, x2, x12
+ umulh x4, x2, x12
+ adcs x8, x8, x3
+ adc x4, x4, xzr
+ # Add remaining product results in
+ adds x6, x6, x9
+ adcs x7, x7, x10
+ adcs x8, x8, x11
+ adc x4, x4, xzr
+ # Overflow
+ extr x4, x4, x8, #63
+ mul x4, x4, x2
+ and x8, x8, #0x7fffffffffffffff
+ adds x5, x5, x4
+ adcs x6, x6, xzr
+ adcs x7, x7, xzr
+ adc x8, x8, xzr
+ # Reduce if top bit set
+ and x4, x2, x8, asr 63
+ and x8, x8, #0x7fffffffffffffff
+ adds x5, x5, x4
+ adcs x6, x6, xzr
+ adcs x7, x7, xzr
+ adc x8, x8, xzr
+ # Store
+ stp x5, x6, [x0]
+ stp x7, x8, [x0, #16]
+ ret
+ .size fe_sq,.-fe_sq
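+    # fe_sq exploits symmetry: squaring four limbs needs only ten distinct
+    # products instead of sixteen, since a_i*a_j appears twice when i != j.
+    # The six cross products are computed once and doubled, then the four
+    # squares a_i^2 are added in before the same 19-fold reduction as
+    # fe_mul.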
+ .text
+ .align 2
+ .globl fe_invert
+ .type fe_invert, %function
+fe_invert:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x20, [x29, #168]
+ # Invert
+ str x0, [x29, #144]
+ str x1, [x29, #152]
+ add x0, x29, #16
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x1, x29, #48
+ bl fe_sq
+ ldr x1, [x29, #152]
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #16
+ add x1, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #48
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x20, #4
+ add x1, x29, #0x50
+L_fe_invert1:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert1
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ mov x20, #9
+ add x1, x29, #0x50
+L_fe_invert2:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert2
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x20, #19
+ add x1, x29, #0x70
+L_fe_invert3:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert3
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x20, #10
+ add x1, x29, #0x50
+L_fe_invert4:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert4
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ mov x20, #49
+ add x1, x29, #0x50
+L_fe_invert5:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert5
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x20, #0x63
+ add x1, x29, #0x70
+L_fe_invert6:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert6
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x20, #50
+ add x1, x29, #0x50
+L_fe_invert7:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert7
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x20, #5
+ add x1, x29, #48
+L_fe_invert8:
+ bl fe_sq
+ sub x20, x20, #1
+ cmp x20, #0
+ bne L_fe_invert8
+ ldr x0, [x29, #144]
+ add x2, x29, #16
+ bl fe_mul
+ ldr x20, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_invert,.-fe_invert
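+    # fe_invert computes a^(p-2) mod p (Fermat's little theorem;
+    # p - 2 = 2^255 - 21) with the standard Curve25519 addition chain:
+    # squaring runs of 5, 10, 20, 10, 50, 100, 50 and 5 (the loop counters
+    # 4, 9, 19, 10, 49, 0x63, 50 and 5 above, some with one extra fe_sq
+    # outside the loop) interleaved with multiplies by previously computed
+    # powers.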
+ .text
+ .align 2
+ .globl curve25519
+ .type curve25519, %function
+curve25519:
+ stp x29, x30, [sp, #-288]!
+ add x29, sp, #0
+ str x17, [x29, #200]
+ str x19, [x29, #208]
+ stp x20, x21, [x29, #216]
+ stp x22, x23, [x29, #232]
+ stp x24, x25, [x29, #248]
+ stp x26, x27, [x29, #264]
+ str x28, [x29, #280]
+ mov x23, xzr
+ str x0, [x29, #176]
+ str x2, [x29, #184]
+ # Copy
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x29, #80]
+ stp x8, x9, [x29, #96]
+ # Set one
+ mov x2, #1
+ stp x2, xzr, [x0]
+ stp xzr, xzr, [x0, #16]
+ # Set zero
+ stp xzr, xzr, [x29, #16]
+ stp xzr, xzr, [x29, #32]
+ # Set one
+ mov x2, #1
+ stp x2, xzr, [x29, #48]
+ stp xzr, xzr, [x29, #64]
+ mov x25, #62
+ mov x24, #24
+L_curve25519_words:
+L_curve25519_bits:
+ ldr x2, [x1, x24]
+ lsr x2, x2, x25
+ and x2, x2, #1
+ eor x23, x23, x2
+ # Conditional Swap
+ cmp x23, #1
+ ldp x10, x11, [x0]
+ ldp x12, x13, [x0, #16]
+ ldp x6, x7, [x29, #80]
+ ldp x8, x9, [x29, #96]
+ csel x14, x10, x6, eq
+ csel x10, x6, x10, eq
+ csel x15, x11, x7, eq
+ csel x11, x7, x11, eq
+ csel x16, x12, x8, eq
+ csel x12, x8, x12, eq
+ csel x17, x13, x9, eq
+ csel x13, x9, x13, eq
+ # Conditional Swap
+ cmp x23, #1
+ ldp x19, x20, [x29, #16]
+ ldp x21, x22, [x29, #32]
+ ldp x6, x7, [x29, #48]
+ ldp x8, x9, [x29, #64]
+ csel x5, x19, x6, eq
+ csel x19, x6, x19, eq
+ csel x26, x20, x7, eq
+ csel x20, x7, x20, eq
+ csel x27, x21, x8, eq
+ csel x21, x8, x21, eq
+ csel x28, x22, x9, eq
+ csel x22, x9, x22, eq
+ mov x23, x2
+ # Add
+ adds x6, x10, x19
+ adcs x7, x11, x20
+ adcs x8, x12, x21
+ adc x9, x13, x22
+ mov x3, #-19
+ asr x2, x9, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x6, x6, x3
+ sbcs x7, x7, x2
+ sbcs x8, x8, x2
+ sbc x9, x9, x4
+ # Sub
+ subs x19, x10, x19
+ sbcs x20, x11, x20
+ sbcs x21, x12, x21
+ sbcs x22, x13, x22
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x19, x19, x3
+ adcs x20, x20, x2
+ adcs x21, x21, x2
+ adc x22, x22, x4
+ stp x19, x20, [x29, #144]
+ stp x21, x22, [x29, #160]
+ # Add
+ adds x10, x14, x5
+ adcs x11, x15, x26
+ adcs x12, x16, x27
+ adc x13, x17, x28
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Sub
+ subs x14, x14, x5
+ sbcs x15, x15, x26
+ sbcs x16, x16, x27
+ sbcs x17, x17, x28
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x14, x14, x3
+ adcs x15, x15, x2
+ adcs x16, x16, x2
+ adc x17, x17, x4
+ # Multiply
+ # A[0] * B[0]
+ mul x19, x14, x6
+ umulh x20, x14, x6
+ # A[0] * B[1]
+ mul x3, x14, x7
+ umulh x21, x14, x7
+ adds x20, x20, x3
+ adc x21, x21, xzr
+ # A[1] * B[0]
+ mul x3, x15, x6
+ umulh x4, x15, x6
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x8
+ umulh x4, x14, x8
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # A[1] * B[1]
+ mul x3, x15, x7
+ umulh x4, x15, x7
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x6
+ umulh x4, x16, x6
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x9
+ umulh x4, x14, x9
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x8
+ umulh x4, x15, x8
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x7
+ umulh x4, x16, x7
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x6
+ umulh x4, x17, x6
+ adds x22, x22, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x9
+ umulh x4, x15, x9
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x8
+ umulh x4, x16, x8
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x7
+ umulh x4, x17, x7
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x9
+ umulh x4, x16, x9
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x8
+ umulh x4, x17, x8
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x9
+ umulh x4, x17, x9
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x22, #63
+ and x22, x22, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x19, x19, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x20, x20, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x21, x21, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x22, x22, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x20, x20, x2
+ adcs x21, x21, x26
+ adcs x22, x22, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x22, #63
+ mul x5, x5, x3
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Reduce if top bit set
+ and x5, x3, x22, asr 63
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Store
+ stp x19, x20, [x29, #112]
+ stp x21, x22, [x29, #128]
+ # Multiply
+ ldp x2, x26, [x29, #144]
+ ldp x27, x28, [x29, #160]
+ # A[0] * B[0]
+ mul x19, x10, x2
+ umulh x20, x10, x2
+ # A[0] * B[1]
+ mul x3, x10, x26
+ umulh x21, x10, x26
+ adds x20, x20, x3
+ adc x21, x21, xzr
+ # A[1] * B[0]
+ mul x3, x11, x2
+ umulh x4, x11, x2
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x10, x27
+ umulh x4, x10, x27
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # A[1] * B[1]
+ mul x3, x11, x26
+ umulh x4, x11, x26
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x14, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x12, x2
+ umulh x4, x12, x2
+ adds x21, x21, x3
+ adcs x22, x22, x4
+ adc x14, x14, xzr
+ # A[0] * B[3]
+ mul x3, x10, x28
+ umulh x4, x10, x28
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x11, x27
+ umulh x4, x11, x27
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[2] * B[1]
+ mul x3, x12, x26
+ umulh x4, x12, x26
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[3] * B[0]
+ mul x3, x13, x2
+ umulh x4, x13, x2
+ adds x22, x22, x3
+ adcs x14, x14, x4
+ adc x15, x15, xzr
+ # A[1] * B[3]
+ mul x3, x11, x28
+ umulh x4, x11, x28
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x12, x27
+ umulh x4, x12, x27
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, x16, xzr
+ # A[3] * B[1]
+ mul x3, x13, x26
+ umulh x4, x13, x26
+ adds x14, x14, x3
+ adcs x15, x15, x4
+ adc x16, x16, xzr
+ # A[2] * B[3]
+ mul x3, x12, x28
+ umulh x4, x12, x28
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x13, x27
+ umulh x4, x13, x27
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, x17, xzr
+ # A[3] * B[3]
+ mul x3, x13, x28
+ umulh x4, x13, x28
+ adds x16, x16, x3
+ adc x17, x17, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ extr x15, x15, x14, #63
+ extr x14, x14, x22, #63
+ and x22, x22, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x14
+ umulh x14, x3, x14
+ adds x19, x19, x4
+ mul x4, x3, x15
+ umulh x15, x3, x15
+ adcs x20, x20, x4
+ mul x4, x3, x16
+ umulh x16, x3, x16
+ adcs x21, x21, x4
+ mul x4, x3, x17
+ umulh x5, x3, x17
+ adcs x22, x22, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x20, x20, x14
+ adcs x21, x21, x15
+ adcs x22, x22, x16
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x22, #63
+ mul x5, x5, x3
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+ # Reduce if top bit set
+ and x5, x3, x22, asr 63
+ and x22, x22, #0x7fffffffffffffff
+ adds x19, x19, x5
+ adcs x20, x20, xzr
+ adcs x21, x21, xzr
+ adc x22, x22, xzr
+    # Store skipped - product kept in x19..x22 for later use
+ # Square
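+    # Off-diagonal products A[i]*A[j] (i < j) are computed once and
+    # doubled, then the diagonal squares A[i]*A[i] are added in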
+ # A[0] * A[1]
+ mul x11, x2, x26
+ umulh x12, x2, x26
+ # A[0] * A[2]
+ mul x3, x2, x27
+ umulh x13, x2, x27
+ adds x12, x12, x3
+ adc x13, x13, xzr
+ # A[0] * A[3]
+ mul x3, x2, x28
+ umulh x14, x2, x28
+ adds x13, x13, x3
+ adc x14, x14, xzr
+ # A[1] * A[2]
+ mul x3, x26, x27
+ umulh x4, x26, x27
+ adds x13, x13, x3
+ adcs x14, x14, x4
+ adc x15, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x26, x28
+ umulh x4, x26, x28
+ adds x14, x14, x3
+ adc x15, x15, x4
+ # A[2] * A[3]
+ mul x3, x27, x28
+ umulh x16, x27, x28
+ adds x15, x15, x3
+ adc x16, x16, xzr
+ # Double
+ adds x11, x11, x11
+ adcs x12, x12, x12
+ adcs x13, x13, x13
+ adcs x14, x14, x14
+ adcs x15, x15, x15
+ adcs x16, x16, x16
+ adc x17, xzr, xzr
+ # A[0] * A[0]
+ mul x10, x2, x2
+ umulh x5, x2, x2
+ # A[1] * A[1]
+ mul x3, x26, x26
+ umulh x4, x26, x26
+ adds x11, x11, x5
+ adcs x12, x12, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x27, x27
+ umulh x4, x27, x27
+ adds x13, x13, x5
+ adcs x14, x14, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x28, x28
+ umulh x4, x28, x28
+ adds x15, x15, x5
+ adcs x16, x16, x3
+ adc x17, x17, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ and x13, x13, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x14
+ umulh x14, x3, x14
+ adds x10, x10, x4
+ mul x4, x3, x15
+ umulh x15, x3, x15
+ adcs x11, x11, x4
+ mul x4, x3, x16
+ umulh x16, x3, x16
+ adcs x12, x12, x4
+ mul x4, x3, x17
+ umulh x5, x3, x17
+ adcs x13, x13, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x11, x11, x14
+ adcs x12, x12, x15
+ adcs x13, x13, x16
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x13, #63
+ mul x5, x5, x3
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Reduce if top bit set
+ and x5, x3, x13, asr 63
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+    # Store skipped - result kept in x10..x13 for the multiply below
+ # Square
+ # A[0] * A[1]
+ mul x15, x6, x7
+ umulh x16, x6, x7
+ # A[0] * A[2]
+ mul x3, x6, x8
+ umulh x17, x6, x8
+ adds x16, x16, x3
+ adc x17, x17, xzr
+ # A[0] * A[3]
+ mul x3, x6, x9
+ umulh x2, x6, x9
+ adds x17, x17, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x7, x8
+ umulh x4, x7, x8
+ adds x17, x17, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x7, x9
+ umulh x4, x7, x9
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x8, x9
+ umulh x27, x8, x9
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x15, x15, x15
+ adcs x16, x16, x16
+ adcs x17, x17, x17
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x14, x6, x6
+ umulh x5, x6, x6
+ # A[1] * A[1]
+ mul x3, x7, x7
+ umulh x4, x7, x7
+ adds x15, x15, x5
+ adcs x16, x16, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x8, x8
+ umulh x4, x8, x8
+ adds x17, x17, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x9, x9
+ umulh x4, x9, x9
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x17, #63
+ and x17, x17, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x14, x14, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x15, x15, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x16, x16, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x17, x17, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x15, x15, x2
+ adcs x16, x16, x26
+ adcs x17, x17, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x17, #63
+ mul x5, x5, x3
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Reduce if top bit set
+ and x5, x3, x17, asr 63
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+    # Store skipped - result kept in x14..x17 for the multiply below
+ # Multiply
+ # A[0] * B[0]
+ mul x6, x14, x10
+ umulh x7, x14, x10
+ # A[0] * B[1]
+ mul x3, x14, x11
+ umulh x8, x14, x11
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x10
+ umulh x4, x15, x10
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x12
+ umulh x4, x14, x12
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x10
+ umulh x4, x16, x10
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x13
+ umulh x4, x14, x13
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x12
+ umulh x4, x15, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x11
+ umulh x4, x16, x11
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x10
+ umulh x4, x17, x10
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x13
+ umulh x4, x15, x13
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x12
+ umulh x4, x16, x12
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x11
+ umulh x4, x17, x11
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x13
+ umulh x4, x16, x13
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x12
+ umulh x4, x17, x12
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x13
+ umulh x4, x17, x13
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x0]
+ stp x8, x9, [x0, #16]
+ # Sub
+ subs x14, x14, x10
+ sbcs x15, x15, x11
+ sbcs x16, x16, x12
+ sbcs x17, x17, x13
+ mov x3, #-19
+ csetm x2, cc
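+    # csetm: x2 = all-ones if the subtraction borrowed (carry clear),
+    # else zero; used to mask the modulus limbs below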
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x14, x14, x3
+ adcs x15, x15, x2
+ adcs x16, x16, x2
+ adc x17, x17, x4
+ # Multiply by 121666
+ mov x5, #0xdb42
+ movk x5, #1, lsl 16
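+    # x5 = 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of
+    # the X25519 Montgomery ladder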
+ mul x6, x14, x5
+ umulh x7, x14, x5
+ mul x3, x15, x5
+ umulh x4, x15, x5
+ adds x7, x7, x3
+ adc x8, xzr, x4
+ mul x3, x16, x5
+ umulh x4, x16, x5
+ adds x8, x8, x3
+ adc x9, xzr, x4
+ mul x3, x17, x5
+ umulh x4, x17, x5
+ adds x9, x9, x3
+ adc x4, xzr, x4
+ mov x5, #19
+ extr x4, x4, x9, #63
+ mul x4, x4, x5
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x4
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Add
+ adds x10, x10, x6
+ adcs x11, x11, x7
+ adcs x12, x12, x8
+ adc x13, x13, x9
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Multiply
+ # A[0] * B[0]
+ mul x6, x14, x10
+ umulh x7, x14, x10
+ # A[0] * B[1]
+ mul x3, x14, x11
+ umulh x8, x14, x11
+ adds x7, x7, x3
+ adc x8, x8, xzr
+ # A[1] * B[0]
+ mul x3, x15, x10
+ umulh x4, x15, x10
+ adds x7, x7, x3
+ adcs x8, x8, x4
+ adc x9, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x12
+ umulh x4, x14, x12
+ adds x8, x8, x3
+ adc x9, x9, x4
+ # A[1] * B[1]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x10
+ umulh x4, x16, x10
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x13
+ umulh x4, x14, x13
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x12
+ umulh x4, x15, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x11
+ umulh x4, x16, x11
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x10
+ umulh x4, x17, x10
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x13
+ umulh x4, x15, x13
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x12
+ umulh x4, x16, x12
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x11
+ umulh x4, x17, x11
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x13
+ umulh x4, x16, x13
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x12
+ umulh x4, x17, x12
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x13
+ umulh x4, x17, x13
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x29, #16]
+ stp x8, x9, [x29, #32]
+ # Add
+ ldp x6, x7, [x29, #112]
+ ldp x8, x9, [x29, #128]
+ adds x10, x6, x19
+ adcs x11, x7, x20
+ adcs x12, x8, x21
+ adc x13, x9, x22
+ mov x3, #-19
+ asr x2, x13, #63
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x10, x10, x3
+ sbcs x11, x11, x2
+ sbcs x12, x12, x2
+ sbc x13, x13, x4
+ # Sub
+ subs x19, x6, x19
+ sbcs x20, x7, x20
+ sbcs x21, x8, x21
+ sbcs x22, x9, x22
+ mov x3, #-19
+ csetm x2, cc
+ # Mask the modulus
+ and x3, x2, x3
+ and x4, x2, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x19, x19, x3
+ adcs x20, x20, x2
+ adcs x21, x21, x2
+ adc x22, x22, x4
+ # Square
+ # A[0] * A[1]
+ mul x7, x10, x11
+ umulh x8, x10, x11
+ # A[0] * A[2]
+ mul x3, x10, x12
+ umulh x9, x10, x12
+ adds x8, x8, x3
+ adc x9, x9, xzr
+ # A[0] * A[3]
+ mul x3, x10, x13
+ umulh x2, x10, x13
+ adds x9, x9, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x11, x12
+ umulh x4, x11, x12
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x11, x13
+ umulh x4, x11, x13
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x12, x13
+ umulh x27, x12, x13
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x6, x10, x10
+ umulh x5, x10, x10
+ # A[1] * A[1]
+ mul x3, x11, x11
+ umulh x4, x11, x11
+ adds x7, x7, x5
+ adcs x8, x8, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x12, x12
+ umulh x4, x12, x12
+ adds x9, x9, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x13, x13
+ umulh x4, x13, x13
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Store
+ stp x6, x7, [x29, #80]
+ stp x8, x9, [x29, #96]
+ # Square
+ # A[0] * A[1]
+ mul x7, x19, x20
+ umulh x8, x19, x20
+ # A[0] * A[2]
+ mul x3, x19, x21
+ umulh x9, x19, x21
+ adds x8, x8, x3
+ adc x9, x9, xzr
+ # A[0] * A[3]
+ mul x3, x19, x22
+ umulh x2, x19, x22
+ adds x9, x9, x3
+ adc x2, x2, xzr
+ # A[1] * A[2]
+ mul x3, x20, x21
+ umulh x4, x20, x21
+ adds x9, x9, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * A[3]
+ mul x3, x20, x22
+ umulh x4, x20, x22
+ adds x2, x2, x3
+ adc x26, x26, x4
+ # A[2] * A[3]
+ mul x3, x21, x22
+ umulh x27, x21, x22
+ adds x26, x26, x3
+ adc x27, x27, xzr
+ # Double
+ adds x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x2, x2, x2
+ adcs x26, x26, x26
+ adcs x27, x27, x27
+ adc x28, xzr, xzr
+ # A[0] * A[0]
+ mul x6, x19, x19
+ umulh x5, x19, x19
+ # A[1] * A[1]
+ mul x3, x20, x20
+ umulh x4, x20, x20
+ adds x7, x7, x5
+ adcs x8, x8, x3
+ adc x5, x4, xzr
+ # A[2] * A[2]
+ mul x3, x21, x21
+ umulh x4, x21, x21
+ adds x9, x9, x5
+ adcs x2, x2, x3
+ adc x5, x4, xzr
+ # A[3] * A[3]
+ mul x3, x22, x22
+ umulh x4, x22, x22
+ adds x26, x26, x5
+ adcs x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x9, #63
+ and x9, x9, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x6, x6, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x7, x7, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x8, x8, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x9, x9, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x7, x7, x2
+ adcs x8, x8, x26
+ adcs x9, x9, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x9, #63
+ mul x5, x5, x3
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+ # Reduce if top bit set
+ and x5, x3, x9, asr 63
+ and x9, x9, #0x7fffffffffffffff
+ adds x6, x6, x5
+ adcs x7, x7, xzr
+ adcs x8, x8, xzr
+ adc x9, x9, xzr
+    # Store skipped - result kept in x6..x9 for the multiply below
+ ldr x2, [x29, #184]
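+    # Ladder step z3 = u * (DA - CB)^2: x2 points at the base-point u
+    # coordinate saved by the function prologue at [x29, #184]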
+ # Multiply
+ ldp x14, x15, [x2]
+ ldp x16, x17, [x2, #16]
+ # A[0] * B[0]
+ mul x10, x14, x6
+ umulh x11, x14, x6
+ # A[0] * B[1]
+ mul x3, x14, x7
+ umulh x12, x14, x7
+ adds x11, x11, x3
+ adc x12, x12, xzr
+ # A[1] * B[0]
+ mul x3, x15, x6
+ umulh x4, x15, x6
+ adds x11, x11, x3
+ adcs x12, x12, x4
+ adc x13, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x14, x8
+ umulh x4, x14, x8
+ adds x12, x12, x3
+ adc x13, x13, x4
+ # A[1] * B[1]
+ mul x3, x15, x7
+ umulh x4, x15, x7
+ adds x12, x12, x3
+ adcs x13, x13, x4
+ adc x2, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x16, x6
+ umulh x4, x16, x6
+ adds x12, x12, x3
+ adcs x13, x13, x4
+ adc x2, x2, xzr
+ # A[0] * B[3]
+ mul x3, x14, x9
+ umulh x4, x14, x9
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x15, x8
+ umulh x4, x15, x8
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[2] * B[1]
+ mul x3, x16, x7
+ umulh x4, x16, x7
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[3] * B[0]
+ mul x3, x17, x6
+ umulh x4, x17, x6
+ adds x13, x13, x3
+ adcs x2, x2, x4
+ adc x26, x26, xzr
+ # A[1] * B[3]
+ mul x3, x15, x9
+ umulh x4, x15, x9
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x16, x8
+ umulh x4, x16, x8
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[3] * B[1]
+ mul x3, x17, x7
+ umulh x4, x17, x7
+ adds x2, x2, x3
+ adcs x26, x26, x4
+ adc x27, x27, xzr
+ # A[2] * B[3]
+ mul x3, x16, x9
+ umulh x4, x16, x9
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x17, x8
+ umulh x4, x17, x8
+ adds x26, x26, x3
+ adcs x27, x27, x4
+ adc x28, x28, xzr
+ # A[3] * B[3]
+ mul x3, x17, x9
+ umulh x4, x17, x9
+ adds x27, x27, x3
+ adc x28, x28, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x28, x28, x27, #63
+ extr x27, x27, x26, #63
+ extr x26, x26, x2, #63
+ extr x2, x2, x13, #63
+ and x13, x13, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x2
+ umulh x2, x3, x2
+ adds x10, x10, x4
+ mul x4, x3, x26
+ umulh x26, x3, x26
+ adcs x11, x11, x4
+ mul x4, x3, x27
+ umulh x27, x3, x27
+ adcs x12, x12, x4
+ mul x4, x3, x28
+ umulh x5, x3, x28
+ adcs x13, x13, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x11, x11, x2
+ adcs x12, x12, x26
+ adcs x13, x13, x27
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x13, #63
+ mul x5, x5, x3
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Reduce if top bit set
+ and x5, x3, x13, asr 63
+ and x13, x13, #0x7fffffffffffffff
+ adds x10, x10, x5
+ adcs x11, x11, xzr
+ adcs x12, x12, xzr
+ adc x13, x13, xzr
+ # Store
+ stp x10, x11, [x29, #48]
+ stp x12, x13, [x29, #64]
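+    # Ladder loop control: x25 counts bits 63..0 of the current scalar
+    # word, x24 is that word's byte offset (stepping down by 8)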
+ sub x25, x25, #1
+ cmp x25, #0
+ bge L_curve25519_bits
+ mov x25, #63
+ sub x24, x24, #8
+ cmp x24, #0
+ bge L_curve25519_words
+ # Invert
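+    # Fermat inversion: z^(p-2) = z^(2^255 - 21), computed with the
+    # standard fixed chain of 254 squarings and 11 multiplies (the
+    # loop counts 4, 9, 19, 10, 49, 99, 50 and 5 below)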
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x0, x29, #0x50
+ add x1, x29, #48
+ bl fe_sq
+ add x1, x29, #0x50
+ bl fe_sq
+ add x1, x29, #16
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #48
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ add x0, x29, #0x50
+ add x1, x29, #0x50
+ add x2, x29, #0x70
+ bl fe_mul
+ add x0, x29, #0x70
+ bl fe_sq
+ mov x24, #4
+ add x1, x29, #0x70
+L_curve25519_inv_1:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_1
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ add x1, x29, #0x50
+ bl fe_sq
+ mov x24, #9
+ add x1, x29, #0x70
+L_curve25519_inv_2:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_2
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x90
+ bl fe_sq
+ mov x24, #19
+ add x1, x29, #0x90
+L_curve25519_inv_3:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_3
+ add x0, x29, #0x70
+ add x2, x29, #0x70
+ bl fe_mul
+ mov x24, #10
+ add x1, x29, #0x70
+L_curve25519_inv_4:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_4
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x70
+ add x1, x29, #0x50
+ bl fe_sq
+ mov x24, #49
+ add x1, x29, #0x70
+L_curve25519_inv_5:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_5
+ add x2, x29, #0x50
+ bl fe_mul
+ add x0, x29, #0x90
+ bl fe_sq
+ mov x24, #0x63
+ add x1, x29, #0x90
+L_curve25519_inv_6:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_6
+ add x0, x29, #0x70
+ add x2, x29, #0x70
+ bl fe_mul
+ mov x24, #50
+ add x1, x29, #0x70
+L_curve25519_inv_7:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_7
+ add x0, x29, #0x50
+ add x2, x29, #0x50
+ bl fe_mul
+ mov x24, #5
+ add x1, x29, #0x50
+L_curve25519_inv_8:
+ bl fe_sq
+ sub x24, x24, #1
+ cmp x24, #0
+ bne L_curve25519_inv_8
+ add x0, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ ldr x0, [x29, #176]
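+    # Final step: multiply the ladder's x2 by the inverted z2 to get
+    # the affine u coordinate of the result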
+ # Multiply
+ ldp x6, x7, [x0]
+ ldp x8, x9, [x0, #16]
+ ldp x10, x11, [x29, #16]
+ ldp x12, x13, [x29, #32]
+ # A[0] * B[0]
+ mul x14, x6, x10
+ umulh x15, x6, x10
+ # A[0] * B[1]
+ mul x3, x6, x11
+ umulh x16, x6, x11
+ adds x15, x15, x3
+ adc x16, x16, xzr
+ # A[1] * B[0]
+ mul x3, x7, x10
+ umulh x4, x7, x10
+ adds x15, x15, x3
+ adcs x16, x16, x4
+ adc x17, xzr, xzr
+ # A[0] * B[2]
+ mul x3, x6, x12
+ umulh x4, x6, x12
+ adds x16, x16, x3
+ adc x17, x17, x4
+ # A[1] * B[1]
+ mul x3, x7, x11
+ umulh x4, x7, x11
+ adds x16, x16, x3
+ adcs x17, x17, x4
+ adc x19, xzr, xzr
+ # A[2] * B[0]
+ mul x3, x8, x10
+ umulh x4, x8, x10
+ adds x16, x16, x3
+ adcs x17, x17, x4
+ adc x19, x19, xzr
+ # A[0] * B[3]
+ mul x3, x6, x13
+ umulh x4, x6, x13
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, xzr, xzr
+ # A[1] * B[2]
+ mul x3, x7, x12
+ umulh x4, x7, x12
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[2] * B[1]
+ mul x3, x8, x11
+ umulh x4, x8, x11
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[3] * B[0]
+ mul x3, x9, x10
+ umulh x4, x9, x10
+ adds x17, x17, x3
+ adcs x19, x19, x4
+ adc x20, x20, xzr
+ # A[1] * B[3]
+ mul x3, x7, x13
+ umulh x4, x7, x13
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, xzr, xzr
+ # A[2] * B[2]
+ mul x3, x8, x12
+ umulh x4, x8, x12
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, x21, xzr
+ # A[3] * B[1]
+ mul x3, x9, x11
+ umulh x4, x9, x11
+ adds x19, x19, x3
+ adcs x20, x20, x4
+ adc x21, x21, xzr
+ # A[2] * B[3]
+ mul x3, x8, x13
+ umulh x4, x8, x13
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, xzr, xzr
+ # A[3] * B[2]
+ mul x3, x9, x12
+ umulh x4, x9, x12
+ adds x20, x20, x3
+ adcs x21, x21, x4
+ adc x22, x22, xzr
+ # A[3] * B[3]
+ mul x3, x9, x13
+ umulh x4, x9, x13
+ adds x21, x21, x3
+ adc x22, x22, x4
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x22, x22, x21, #63
+ extr x21, x21, x20, #63
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ and x17, x17, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x3, #19
+ mul x4, x3, x19
+ umulh x19, x3, x19
+ adds x14, x14, x4
+ mul x4, x3, x20
+ umulh x20, x3, x20
+ adcs x15, x15, x4
+ mul x4, x3, x21
+ umulh x21, x3, x21
+ adcs x16, x16, x4
+ mul x4, x3, x22
+ umulh x5, x3, x22
+ adcs x17, x17, x4
+ adc x5, x5, xzr
+ # Add remaining product results in
+ adds x15, x15, x19
+ adcs x16, x16, x20
+ adcs x17, x17, x21
+ adc x5, x5, xzr
+ # Overflow
+ extr x5, x5, x17, #63
+ mul x5, x5, x3
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Reduce if top bit set
+ and x5, x3, x17, asr 63
+ and x17, x17, #0x7fffffffffffffff
+ adds x14, x14, x5
+ adcs x15, x15, xzr
+ adcs x16, x16, xzr
+ adc x17, x17, xzr
+ # Store
+ stp x14, x15, [x0]
+ stp x16, x17, [x0, #16]
+ mov x0, xzr
+ ldr x17, [x29, #200]
+ ldr x19, [x29, #208]
+ ldp x20, x21, [x29, #216]
+ ldp x22, x23, [x29, #232]
+ ldp x24, x25, [x29, #248]
+ ldp x26, x27, [x29, #264]
+ ldr x28, [x29, #280]
+ ldp x29, x30, [sp], #0x120
+ ret
+ .size curve25519,.-curve25519
+ .text
+ .align 2
+ .globl fe_pow22523
+ .type fe_pow22523, %function
+fe_pow22523:
+ stp x29, x30, [sp, #-144]!
+ add x29, sp, #0
+ str x21, [x29, #136]
+ # pow22523
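+    # Computes z^((p-5)/8) = z^(2^252 - 3), the power used when taking
+    # square roots for Ed25519 point decompression; same chain as the
+    # inversion above but ending in 2 squarings and a multiply by z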
+ str x0, [x29, #112]
+ str x1, [x29, #120]
+ add x0, x29, #16
+ bl fe_sq
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ add x1, x29, #48
+ bl fe_sq
+ ldr x1, [x29, #120]
+ add x2, x29, #48
+ bl fe_mul
+ add x0, x29, #16
+ add x1, x29, #16
+ add x2, x29, #48
+ bl fe_mul
+ bl fe_sq
+ add x1, x29, #48
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #4
+ add x1, x29, #48
+L_fe_pow22523_1:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_1
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #9
+ add x1, x29, #48
+L_fe_pow22523_2:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_2
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x21, #19
+ add x1, x29, #0x50
+L_fe_pow22523_3:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_3
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x21, #10
+ add x1, x29, #48
+L_fe_pow22523_4:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_4
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #48
+ add x1, x29, #16
+ bl fe_sq
+ mov x21, #49
+ add x1, x29, #48
+L_fe_pow22523_5:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_5
+ add x2, x29, #16
+ bl fe_mul
+ add x0, x29, #0x50
+ bl fe_sq
+ mov x21, #0x63
+ add x1, x29, #0x50
+L_fe_pow22523_6:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_6
+ add x0, x29, #48
+ add x2, x29, #48
+ bl fe_mul
+ mov x21, #50
+ add x1, x29, #48
+L_fe_pow22523_7:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_7
+ add x0, x29, #16
+ add x2, x29, #16
+ bl fe_mul
+ mov x21, #2
+ add x1, x29, #16
+L_fe_pow22523_8:
+ bl fe_sq
+ sub x21, x21, #1
+ cmp x21, #0
+ bne L_fe_pow22523_8
+ ldr x0, [x29, #112]
+ ldr x2, [x29, #120]
+ bl fe_mul
+ ldr x21, [x29, #136]
+ ldp x29, x30, [sp], #0x90
+ ret
+ .size fe_pow22523,.-fe_pow22523
+ .text
+ .align 2
+ .globl fe_ge_to_p2
+ .type fe_ge_to_p2, %function
+fe_ge_to_p2:
+ stp x29, x30, [sp, #-112]!
+ add x29, sp, #0
+ str x17, [x29, #72]
+ str x19, [x29, #80]
+ stp x20, x21, [x29, #88]
+ str x22, [x29, #104]
+ str x1, [x29, #16]
+ str x2, [x29, #24]
+ str x3, [x29, #32]
+ str x4, [x29, #40]
+ str x5, [x29, #48]
+ str x6, [x29, #56]
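+    # Completed (P1xP1) to projective (P2) coordinates:
+    # rx = px*pt, ry = py*pz, rz = pz*pt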
+ ldr x1, [x29, #32]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x20, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x12, x15
+ umulh x21, x12, x15
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x11, x17
+ umulh x21, x11, x17
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x12, x16
+ umulh x21, x12, x16
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x13, x15
+ umulh x21, x13, x15
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x11, x19
+ umulh x21, x11, x19
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x12, x17
+ umulh x21, x12, x17
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x13, x16
+ umulh x21, x13, x16
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x14, x15
+ umulh x21, x14, x15
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x12, x19
+ umulh x21, x12, x19
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x13, x17
+ umulh x21, x13, x17
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x14, x16
+ umulh x21, x14, x16
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x13, x19
+ umulh x21, x13, x19
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x14, x17
+ umulh x21, x14, x17
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x14, x19
+ umulh x21, x14, x19
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #16]
+ ldr x1, [x29, #40]
+ ldr x2, [x29, #48]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x20, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x12, x15
+ umulh x21, x12, x15
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x11, x17
+ umulh x21, x11, x17
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x12, x16
+ umulh x21, x12, x16
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x13, x15
+ umulh x21, x13, x15
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x11, x19
+ umulh x21, x11, x19
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x12, x17
+ umulh x21, x12, x17
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x13, x16
+ umulh x21, x13, x16
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x14, x15
+ umulh x21, x14, x15
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x12, x19
+ umulh x21, x12, x19
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x13, x17
+ umulh x21, x13, x17
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x14, x16
+ umulh x21, x14, x16
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x13, x19
+ umulh x21, x13, x19
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x14, x17
+ umulh x21, x14, x17
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x14, x19
+ umulh x21, x14, x19
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #24]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x2]
+ ldp x13, x14, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x15, x11
+ umulh x4, x15, x11
+ # A[0] * B[1]
+ mul x20, x15, x12
+ umulh x5, x15, x12
+ adds x4, x4, x20
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x20, x16, x11
+ umulh x21, x16, x11
+ adds x4, x4, x20
+ adcs x5, x5, x21
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x20, x15, x13
+ umulh x21, x15, x13
+ adds x5, x5, x20
+ adc x6, x6, x21
+ # A[1] * B[1]
+ mul x20, x16, x12
+ umulh x21, x16, x12
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x20, x17, x11
+ umulh x21, x17, x11
+ adds x5, x5, x20
+ adcs x6, x6, x21
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x20, x15, x14
+ umulh x21, x15, x14
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x20, x16, x13
+ umulh x21, x16, x13
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x20, x17, x12
+ umulh x21, x17, x12
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x20, x19, x11
+ umulh x21, x19, x11
+ adds x6, x6, x20
+ adcs x7, x7, x21
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x20, x16, x14
+ umulh x21, x16, x14
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x20, x17, x13
+ umulh x21, x17, x13
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x20, x19, x12
+ umulh x21, x19, x12
+ adds x7, x7, x20
+ adcs x8, x8, x21
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x20, x17, x14
+ umulh x21, x17, x14
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x20, x19, x13
+ umulh x21, x19, x13
+ adds x8, x8, x20
+ adcs x9, x9, x21
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x20, x19, x14
+ umulh x21, x19, x14
+ adds x9, x9, x20
+ adc x10, x10, x21
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x20, #19
+ mul x21, x20, x7
+ umulh x7, x20, x7
+ adds x3, x3, x21
+ mul x21, x20, x8
+ umulh x8, x20, x8
+ adcs x4, x4, x21
+ mul x21, x20, x9
+ umulh x9, x20, x9
+ adcs x5, x5, x21
+ mul x21, x20, x10
+ umulh x22, x20, x10
+ adcs x6, x6, x21
+ adc x22, x22, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x22, x22, xzr
+ # Overflow
+ extr x22, x22, x6, #63
+ mul x22, x22, x20
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x22, x20, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x22
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x17, [x29, #72]
+ ldr x19, [x29, #80]
+ ldp x20, x21, [x29, #88]
+ ldr x22, [x29, #104]
+ ldp x29, x30, [sp], #0x70
+ ret
+ .size fe_ge_to_p2,.-fe_ge_to_p2
+ .text
+ .align 2
+ .globl fe_ge_to_p3
+ .type fe_ge_to_p3, %function
+fe_ge_to_p3:
+ stp x29, x30, [sp, #-160]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ str x26, [x29, #152]
+ str x1, [x29, #16]
+ str x2, [x29, #24]
+ str x3, [x29, #32]
+ str x4, [x29, #40]
+ str x5, [x29, #48]
+ str x6, [x29, #56]
+ str x7, [x29, #64]
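+    # Completed (P1xP1) to extended (P3) coordinates:
+    # rx = px*pt, rt = px*py, ry = py*pz, rz = pz*pt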
+ ldr x1, [x29, #40]
+ ldr x2, [x29, #64]
+ # Multiply
+ ldp x11, x12, [x1]
+ ldp x13, x14, [x1, #16]
+ ldp x15, x16, [x2]
+ ldp x17, x19, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x24, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x15
+ umulh x25, x12, x15
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x17
+ umulh x25, x11, x17
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x16
+ umulh x25, x12, x16
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x15
+ umulh x25, x13, x15
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x19
+ umulh x25, x11, x19
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x17
+ umulh x25, x12, x17
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x16
+ umulh x25, x13, x16
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x15
+ umulh x25, x14, x15
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x19
+ umulh x25, x12, x19
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x17
+ umulh x25, x13, x17
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x16
+ umulh x25, x14, x16
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x19
+ umulh x25, x13, x19
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x17
+ umulh x25, x14, x17
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x19
+ umulh x25, x14, x19
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #48]
+ # Multiply
+ ldp x20, x21, [x2]
+ ldp x22, x23, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x11, x20
+ umulh x4, x11, x20
+ # A[0] * B[1]
+ mul x24, x11, x21
+ umulh x5, x11, x21
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x20
+ umulh x25, x12, x20
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x22
+ umulh x25, x11, x22
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x21
+ umulh x25, x12, x21
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x20
+ umulh x25, x13, x20
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x23
+ umulh x25, x11, x23
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x22
+ umulh x25, x12, x22
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x21
+ umulh x25, x13, x21
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x20
+ umulh x25, x14, x20
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x23
+ umulh x25, x12, x23
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x22
+ umulh x25, x13, x22
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x21
+ umulh x25, x14, x21
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x23
+ umulh x25, x13, x23
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x22
+ umulh x25, x14, x22
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x23
+ umulh x25, x14, x23
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #16]
+ ldr x2, [x29, #56]
+ # Multiply
+ ldp x11, x12, [x2]
+ ldp x13, x14, [x2, #16]
+ # A[0] * B[0]
+ mul x3, x20, x11
+ umulh x4, x20, x11
+ # A[0] * B[1]
+ mul x24, x20, x12
+ umulh x5, x20, x12
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x21, x11
+ umulh x25, x21, x11
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x20, x13
+ umulh x25, x20, x13
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x21, x12
+ umulh x25, x21, x12
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x22, x11
+ umulh x25, x22, x11
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x20, x14
+ umulh x25, x20, x14
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x21, x13
+ umulh x25, x21, x13
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x22, x12
+ umulh x25, x22, x12
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x23, x11
+ umulh x25, x23, x11
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x21, x14
+ umulh x25, x21, x14
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x22, x13
+ umulh x25, x22, x13
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x23, x12
+ umulh x25, x23, x12
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x22, x14
+ umulh x25, x22, x14
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x23, x13
+ umulh x25, x23, x13
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x23, x14
+ umulh x25, x23, x14
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x0, [x29, #24]
+ # Multiply
+ # A[0] * B[0]
+ mul x3, x11, x15
+ umulh x4, x11, x15
+ # A[0] * B[1]
+ mul x24, x11, x16
+ umulh x5, x11, x16
+ adds x4, x4, x24
+ adc x5, x5, xzr
+ # A[1] * B[0]
+ mul x24, x12, x15
+ umulh x25, x12, x15
+ adds x4, x4, x24
+ adcs x5, x5, x25
+ adc x6, xzr, xzr
+ # A[0] * B[2]
+ mul x24, x11, x17
+ umulh x25, x11, x17
+ adds x5, x5, x24
+ adc x6, x6, x25
+ # A[1] * B[1]
+ mul x24, x12, x16
+ umulh x25, x12, x16
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, xzr, xzr
+ # A[2] * B[0]
+ mul x24, x13, x15
+ umulh x25, x13, x15
+ adds x5, x5, x24
+ adcs x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * B[3]
+ mul x24, x11, x19
+ umulh x25, x11, x19
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, xzr, xzr
+ # A[1] * B[2]
+ mul x24, x12, x17
+ umulh x25, x12, x17
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[2] * B[1]
+ mul x24, x13, x16
+ umulh x25, x13, x16
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[3] * B[0]
+ mul x24, x14, x15
+ umulh x25, x14, x15
+ adds x6, x6, x24
+ adcs x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * B[3]
+ mul x24, x12, x19
+ umulh x25, x12, x19
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, xzr, xzr
+ # A[2] * B[2]
+ mul x24, x13, x17
+ umulh x25, x13, x17
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[3] * B[1]
+ mul x24, x14, x16
+ umulh x25, x14, x16
+ adds x7, x7, x24
+ adcs x8, x8, x25
+ adc x9, x9, xzr
+ # A[2] * B[3]
+ mul x24, x13, x19
+ umulh x25, x13, x19
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, xzr, xzr
+ # A[3] * B[2]
+ mul x24, x14, x17
+ umulh x25, x14, x17
+ adds x8, x8, x24
+ adcs x9, x9, x25
+ adc x10, x10, xzr
+ # A[3] * B[3]
+ mul x24, x14, x19
+ umulh x25, x14, x19
+ adds x9, x9, x24
+ adc x10, x10, x25
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ extr x7, x7, x6, #63
+ and x6, x6, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x24, #19
+ mul x25, x24, x7
+ umulh x7, x24, x7
+ adds x3, x3, x25
+ mul x25, x24, x8
+ umulh x8, x24, x8
+ adcs x4, x4, x25
+ mul x25, x24, x9
+ umulh x9, x24, x9
+ adcs x5, x5, x25
+ mul x25, x24, x10
+ umulh x26, x24, x10
+ adcs x6, x6, x25
+ adc x26, x26, xzr
+ # Add remaining product results in
+ adds x4, x4, x7
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adc x26, x26, xzr
+ # Overflow
+ extr x26, x26, x6, #63
+ mul x26, x26, x24
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Reduce if top bit set
+ and x26, x24, x6, asr 63
+ and x6, x6, #0x7fffffffffffffff
+ adds x3, x3, x26
+ adcs x4, x4, xzr
+ adcs x5, x5, xzr
+ adc x6, x6, xzr
+ # Store
+ stp x3, x4, [x0]
+ stp x5, x6, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldr x26, [x29, #152]
+ ldp x29, x30, [sp], #0xa0
+ ret
+ .size fe_ge_to_p3,.-fe_ge_to_p3
+ .text
+ .align 2
+ .globl fe_ge_dbl
+ .type fe_ge_dbl, %function
+fe_ge_dbl:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
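+    # Twisted Edwards doubling into completed coordinates:
+    # rx = (px+py)^2 - (px^2 + py^2), ry = px^2 + py^2,
+    # rz = py^2 - px^2, rt = 2*pz^2 - rz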
+ ldr x1, [x29, #48]
+ # Square
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ # A[0] * A[1]
+ mul x5, x12, x13
+ umulh x6, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x7, x12, x14
+ adds x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x8, x12, x15
+ adds x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x8, x8, x25
+ adc x9, x9, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x10, x14, x15
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # Double
+ adds x5, x5, x5
+ adcs x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, xzr, xzr
+ # A[0] * A[0]
+ mul x4, x12, x12
+ umulh x27, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x5, x5, x27
+ adcs x6, x6, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x7, x7, x27
+ adcs x8, x8, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x9, x9, x27
+ adcs x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #56]
+ # Square
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * A[1]
+ mul x9, x21, x22
+ umulh x10, x21, x22
+ # A[0] * A[2]
+ mul x25, x21, x23
+ umulh x11, x21, x23
+ adds x10, x10, x25
+ adc x11, x11, xzr
+ # A[0] * A[3]
+ mul x25, x21, x24
+ umulh x16, x21, x24
+ adds x11, x11, x25
+ adc x16, x16, xzr
+ # A[1] * A[2]
+ mul x25, x22, x23
+ umulh x26, x22, x23
+ adds x11, x11, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x22, x24
+ umulh x26, x22, x24
+ adds x16, x16, x25
+ adc x17, x17, x26
+ # A[2] * A[3]
+ mul x25, x23, x24
+ umulh x19, x23, x24
+ adds x17, x17, x25
+ adc x19, x19, xzr
+ # Double
+ adds x9, x9, x9
+ adcs x10, x10, x10
+ adcs x11, x11, x11
+ adcs x16, x16, x16
+ adcs x17, x17, x17
+ adcs x19, x19, x19
+ adc x20, xzr, xzr
+ # A[0] * A[0]
+ mul x8, x21, x21
+ umulh x27, x21, x21
+ # A[1] * A[1]
+ mul x25, x22, x22
+ umulh x26, x22, x22
+ adds x9, x9, x27
+ adcs x10, x10, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x23, x23
+ umulh x26, x23, x23
+ adds x11, x11, x27
+ adcs x16, x16, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x24, x24
+ umulh x26, x24, x24
+ adds x17, x17, x27
+ adcs x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x8, x8, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x9, x9, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x10, x10, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x16
+ adcs x10, x10, x17
+ adcs x11, x11, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ ldr x0, [x29, #24]
+ # Add
+ adds x12, x12, x21
+ adcs x13, x13, x22
+ adcs x14, x14, x23
+ adc x15, x15, x24
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ ldr x0, [x29, #40]
+ # Square
+ # A[0] * A[1]
+ mul x17, x12, x13
+ umulh x19, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x20, x12, x14
+ adds x19, x19, x25
+ adc x20, x20, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x21, x12, x15
+ adds x20, x20, x25
+ adc x21, x21, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x20, x20, x25
+ adcs x21, x21, x26
+ adc x22, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x21, x21, x25
+ adc x22, x22, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x23, x14, x15
+ adds x22, x22, x25
+ adc x23, x23, xzr
+ # Double
+ adds x17, x17, x17
+ adcs x19, x19, x19
+ adcs x20, x20, x20
+ adcs x21, x21, x21
+ adcs x22, x22, x22
+ adcs x23, x23, x23
+ adc x24, xzr, xzr
+ # A[0] * A[0]
+ mul x16, x12, x12
+ umulh x27, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x17, x17, x27
+ adcs x19, x19, x25
+ adc x27, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x20, x20, x27
+ adcs x21, x21, x25
+ adc x27, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x22, x22, x27
+ adcs x23, x23, x25
+ adc x24, x24, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x24, x24, x23, #63
+ extr x23, x23, x22, #63
+ extr x22, x22, x21, #63
+ extr x21, x21, x20, #63
+ and x20, x20, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x21
+ umulh x21, x25, x21
+ adds x16, x16, x26
+ mul x26, x25, x22
+ umulh x22, x25, x22
+ adcs x17, x17, x26
+ mul x26, x25, x23
+ umulh x23, x25, x23
+ adcs x19, x19, x26
+ mul x26, x25, x24
+ umulh x27, x25, x24
+ adcs x20, x20, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x17, x17, x21
+ adcs x19, x19, x22
+ adcs x20, x20, x23
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x20, #63
+ mul x27, x27, x25
+ and x20, x20, #0x7fffffffffffffff
+ adds x16, x16, x27
+ adcs x17, x17, xzr
+ adcs x19, x19, xzr
+ adc x20, x20, xzr
+ # Reduce if top bit set
+ and x27, x25, x20, asr 63
+ and x20, x20, #0x7fffffffffffffff
+ adds x16, x16, x27
+ adcs x17, x17, xzr
+ adcs x19, x19, xzr
+ adc x20, x20, xzr
+ # Store
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #32]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x21, x8, x4
+ sbcs x22, x9, x5
+ sbcs x23, x10, x6
+ sbcs x24, x11, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x21, x21, x25
+ adcs x22, x22, x28
+ adcs x23, x23, x28
+ adc x24, x24, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x21, x22, [x1]
+ stp x23, x24, [x1, #16]
+ ldr x0, [x29, #16]
+ # Sub
+ subs x16, x16, x12
+ sbcs x17, x17, x13
+ sbcs x19, x19, x14
+ sbcs x20, x20, x15
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #64]
+ # Square * 2
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ # A[0] * A[1]
+ mul x5, x12, x13
+ umulh x6, x12, x13
+ # A[0] * A[2]
+ mul x25, x12, x14
+ umulh x7, x12, x14
+ adds x6, x6, x25
+ adc x7, x7, xzr
+ # A[0] * A[3]
+ mul x25, x12, x15
+ umulh x8, x12, x15
+ adds x7, x7, x25
+ adc x8, x8, xzr
+ # A[1] * A[2]
+ mul x25, x13, x14
+ umulh x26, x13, x14
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * A[3]
+ mul x25, x13, x15
+ umulh x26, x13, x15
+ adds x8, x8, x25
+ adc x9, x9, x26
+ # A[2] * A[3]
+ mul x25, x14, x15
+ umulh x10, x14, x15
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # Double
+ adds x5, x5, x5
+ adcs x6, x6, x6
+ adcs x7, x7, x7
+ adcs x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, xzr, xzr
+ # A[0] * A[0]
+ mul x4, x12, x12
+ umulh x28, x12, x12
+ # A[1] * A[1]
+ mul x25, x13, x13
+ umulh x26, x13, x13
+ adds x5, x5, x28
+ adcs x6, x6, x25
+ adc x28, x26, xzr
+ # A[2] * A[2]
+ mul x25, x14, x14
+ umulh x26, x14, x14
+ adds x7, x7, x28
+ adcs x8, x8, x25
+ adc x28, x26, xzr
+ # A[3] * A[3]
+ mul x25, x15, x15
+ umulh x26, x15, x15
+ adds x9, x9, x28
+ adcs x10, x10, x25
+ adc x11, x11, x26
+ # Double and Reduce
+ mov x25, #0x169
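+    # 0x169 = 361 = 19^2: after doubling, bits at 2^510 and above wrap
+    # twice, since 2^510 = (2^255)^2 = 19^2 (mod p)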
+ # Move top half into t4-t7 and remove top bit from t3
+ lsr x28, x11, #61
+ extr x11, x11, x10, #62
+ extr x10, x10, x9, #62
+ extr x9, x9, x8, #62
+ extr x8, x8, x7, #62
+ extr x7, x7, x6, #63
+ extr x6, x6, x5, #63
+ extr x5, x5, x4, #63
+ lsl x4, x4, #1
+ and x7, x7, #0x7fffffffffffffff
+    # Top half limbs were shifted by two bits, bottom half by only
+    # one, so x11 also needs its top bit masked off
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top bits by 19*19
+ mul x28, x28, x25
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x4, x4, x28
+ adcs x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
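+	# Same fold as above, with one extra step for the doubled square: the
+	# bits at or above 2^510 are scaled by 0x169 = 361 = 19^2, since
+	# 2^510 == (2^255)^2 == 19^2 (mod 2^255 - 19).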
+ # Store
+ ldr x0, [x29, #40]
+ # Sub
+ subs x4, x4, x21
+ sbcs x5, x5, x22
+ sbcs x6, x6, x23
+ sbcs x7, x7, x24
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x4, x4, x25
+ adcs x5, x5, x28
+ adcs x6, x6, x28
+ adc x7, x7, x26
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_dbl,.-fe_ge_dbl
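+# The fe_ge_* routines below all share one frame layout: x29/x30 and the
+# working registers x17 and x19-x28 are spilled into a 176-byte (0xb0)
+# frame, the eight pointer arguments passed in x0-x7 are saved at
+# [x29, #16]-[x29, #72] so they survive the long register allocation, and
+# any further fe pointers are read from the caller's stack at
+# [x29, #176] upward.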
+ .text
+ .align 2
+ .globl fe_ge_madd
+ .type fe_ge_madd, %function
+fe_ge_madd:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #184]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #176]
+ ldr x3, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x3]
+ ldp x23, x24, [x3, #16]
+ # A[0] * B[0]
+ mul x4, x16, x21
+ umulh x5, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x6, x16, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #64]
+ # Double
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ adds x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, x11, x11
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x8, x4
+ sbcs x17, x9, x5
+ sbcs x19, x10, x6
+ sbcs x20, x11, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_madd,.-fe_ge_madd
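+# The conditional modulus correction used throughout is branch-free:
+# after an add, the sign bit of the top limb (asr #63) selects whether to
+# subtract the modulus once; after a sub, csetm turns the borrow into an
+# all-ones mask that selects whether to add it once. The masked limbs
+# -19, -1, -1, 0x7fffffffffffffff are exactly p = 2^255 - 19, so the
+# correction costs four adds/subs regardless of the operands.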
+ .text
+ .align 2
+ .globl fe_ge_msub
+ .type fe_ge_msub, %function
+fe_ge_msub:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #184]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #176]
+ ldr x3, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x3]
+ ldp x23, x24, [x3, #16]
+ # A[0] * B[0]
+ mul x4, x16, x21
+ umulh x5, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x6, x16, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #64]
+ # Double
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ adds x8, x8, x8
+ adcs x9, x9, x9
+ adcs x10, x10, x10
+ adc x11, x11, x11
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x8, x4
+ adcs x13, x9, x5
+ adcs x14, x10, x6
+ adc x15, x11, x7
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x8, x4
+ sbcs x17, x9, x5
+ sbcs x19, x10, x6
+ sbcs x20, x11, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x1]
+ stp x14, x15, [x1, #16]
+ stp x16, x17, [x0]
+ stp x19, x20, [x0, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_msub,.-fe_ge_msub
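+# fe_ge_msub mirrors fe_ge_madd: the two stacked multiplicands trade
+# places (the first product reads [x29, #192] where fe_ge_madd read
+# [x29, #184], and vice versa) and the final sum/difference pair is
+# stored to swapped destinations, as expected when the precomputed point
+# is negated.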
+ .text
+ .align 2
+ .globl fe_ge_add
+ .type fe_ge_add, %function
+fe_ge_add:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #200]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #48]
+ ldr x1, [x29, #64]
+ ldr x2, [x29, #176]
+ # Multiply
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ ldp x16, x17, [x2]
+ ldp x19, x20, [x2, #16]
+ # A[0] * B[0]
+ mul x4, x12, x16
+ umulh x5, x12, x16
+ # A[0] * B[1]
+ mul x25, x12, x17
+ umulh x6, x12, x17
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x13, x16
+ umulh x26, x13, x16
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x12, x19
+ umulh x26, x12, x19
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x13, x17
+ umulh x26, x13, x17
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x14, x16
+ umulh x26, x14, x16
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x12, x20
+ umulh x26, x12, x20
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x13, x19
+ umulh x26, x13, x19
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x14, x17
+ umulh x26, x14, x17
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x15, x16
+ umulh x26, x15, x16
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x13, x20
+ umulh x26, x13, x20
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x14, x19
+ umulh x26, x14, x19
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x15, x17
+ umulh x26, x15, x17
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x14, x20
+ umulh x26, x14, x20
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x15, x19
+ umulh x26, x15, x19
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x15, x20
+ umulh x26, x15, x20
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #48]
+ # Double
+ adds x4, x4, x4
+ adcs x5, x5, x5
+ adcs x6, x6, x6
+ adc x7, x7, x7
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #184]
+ ldr x2, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x8, x16, x21
+ umulh x9, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x10, x16, x22
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, x12, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, x15, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x12
+ umulh x12, x25, x12
+ adds x8, x8, x26
+ mul x26, x25, x13
+ umulh x13, x25, x13
+ adcs x9, x9, x26
+ mul x26, x25, x14
+ umulh x14, x25, x14
+ adcs x10, x10, x26
+ mul x26, x25, x15
+ umulh x27, x25, x15
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x12
+ adcs x10, x10, x13
+ adcs x11, x11, x14
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store
+ ldr x0, [x29, #32]
+ ldr x1, [x29, #40]
+ # Add
+ adds x12, x4, x8
+ adcs x13, x5, x9
+ adcs x14, x6, x10
+ adc x15, x7, x11
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x4, x8
+ sbcs x17, x5, x9
+ sbcs x19, x6, x10
+ sbcs x20, x7, x11
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_add,.-fe_ge_add
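+# fe_ge_sub stands in the same relation to fe_ge_add: the multiplicands
+# at [x29, #192] and [x29, #200] are exchanged and the last
+# sum/difference pair is written to swapped destinations.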
+ .text
+ .align 2
+ .globl fe_ge_sub
+ .type fe_ge_sub, %function
+fe_ge_sub:
+ stp x29, x30, [sp, #-176]!
+ add x29, sp, #0
+ str x17, [x29, #88]
+ str x19, [x29, #96]
+ stp x20, x21, [x29, #104]
+ stp x22, x23, [x29, #120]
+ stp x24, x25, [x29, #136]
+ stp x26, x27, [x29, #152]
+ str x28, [x29, #168]
+ str x0, [x29, #16]
+ str x1, [x29, #24]
+ str x2, [x29, #32]
+ str x3, [x29, #40]
+ str x4, [x29, #48]
+ str x5, [x29, #56]
+ str x6, [x29, #64]
+ str x7, [x29, #72]
+ ldr x2, [x29, #56]
+ ldr x3, [x29, #48]
+ # Add
+ ldp x12, x13, [x2]
+ ldp x14, x15, [x2, #16]
+ ldp x16, x17, [x3]
+ ldp x19, x20, [x3, #16]
+ adds x4, x12, x16
+ adcs x5, x13, x17
+ adcs x6, x14, x19
+ adc x7, x15, x20
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ # Sub
+ subs x8, x12, x16
+ sbcs x9, x13, x17
+ sbcs x10, x14, x19
+ sbcs x11, x15, x20
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x8, x8, x25
+ adcs x9, x9, x28
+ adcs x10, x10, x28
+ adc x11, x11, x26
+ ldr x0, [x29, #32]
+ ldr x2, [x29, #200]
+ # Multiply
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x12, x4, x21
+ umulh x13, x4, x21
+ # A[0] * B[1]
+ mul x25, x4, x22
+ umulh x14, x4, x22
+ adds x13, x13, x25
+ adc x14, x14, xzr
+ # A[1] * B[0]
+ mul x25, x5, x21
+ umulh x26, x5, x21
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x4, x23
+ umulh x26, x4, x23
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # A[1] * B[1]
+ mul x25, x5, x22
+ umulh x26, x5, x22
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x6, x21
+ umulh x26, x6, x21
+ adds x14, x14, x25
+ adcs x15, x15, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x4, x24
+ umulh x26, x4, x24
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x5, x23
+ umulh x26, x5, x23
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x6, x22
+ umulh x26, x6, x22
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x7, x21
+ umulh x26, x7, x21
+ adds x15, x15, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x5, x24
+ umulh x26, x5, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x6, x23
+ umulh x26, x6, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x7, x22
+ umulh x26, x7, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x6, x24
+ umulh x26, x6, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x7, x23
+ umulh x26, x7, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x7, x24
+ umulh x26, x7, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x15, #63
+ and x15, x15, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x12, x12, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x13, x13, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x14, x14, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x15, x15, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x13, x13, x16
+ adcs x14, x14, x17
+ adcs x15, x15, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x15, #63
+ mul x27, x27, x25
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Reduce if top bit set
+ and x27, x25, x15, asr 63
+ and x15, x15, #0x7fffffffffffffff
+ adds x12, x12, x27
+ adcs x13, x13, xzr
+ adcs x14, x14, xzr
+ adc x15, x15, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #192]
+ # Multiply
+ ldp x21, x22, [x1]
+ ldp x23, x24, [x1, #16]
+ # A[0] * B[0]
+ mul x4, x8, x21
+ umulh x5, x8, x21
+ # A[0] * B[1]
+ mul x25, x8, x22
+ umulh x6, x8, x22
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x9, x21
+ umulh x26, x9, x21
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x8, x23
+ umulh x26, x8, x23
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x9, x22
+ umulh x26, x9, x22
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x10, x21
+ umulh x26, x10, x21
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x16, x16, xzr
+ # A[0] * B[3]
+ mul x25, x8, x24
+ umulh x26, x8, x24
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x9, x23
+ umulh x26, x9, x23
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[2] * B[1]
+ mul x25, x10, x22
+ umulh x26, x10, x22
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[3] * B[0]
+ mul x25, x11, x21
+ umulh x26, x11, x21
+ adds x7, x7, x25
+ adcs x16, x16, x26
+ adc x17, x17, xzr
+ # A[1] * B[3]
+ mul x25, x9, x24
+ umulh x26, x9, x24
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x10, x23
+ umulh x26, x10, x23
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[3] * B[1]
+ mul x25, x11, x22
+ umulh x26, x11, x22
+ adds x16, x16, x25
+ adcs x17, x17, x26
+ adc x19, x19, xzr
+ # A[2] * B[3]
+ mul x25, x10, x24
+ umulh x26, x10, x24
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x11, x23
+ umulh x26, x11, x23
+ adds x17, x17, x25
+ adcs x19, x19, x26
+ adc x20, x20, xzr
+ # A[3] * B[3]
+ mul x25, x11, x24
+ umulh x26, x11, x24
+ adds x19, x19, x25
+ adc x20, x20, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x20, x20, x19, #63
+ extr x19, x19, x17, #63
+ extr x17, x17, x16, #63
+ extr x16, x16, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x16
+ umulh x16, x25, x16
+ adds x4, x4, x26
+ mul x26, x25, x17
+ umulh x17, x25, x17
+ adcs x5, x5, x26
+ mul x26, x25, x19
+ umulh x19, x25, x19
+ adcs x6, x6, x26
+ mul x26, x25, x20
+ umulh x27, x25, x20
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x16
+ adcs x6, x6, x17
+ adcs x7, x7, x19
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #24]
+ ldr x1, [x29, #16]
+ # Add
+ adds x8, x12, x4
+ adcs x9, x13, x5
+ adcs x10, x14, x6
+ adc x11, x15, x7
+ mov x25, #-19
+ asr x28, x11, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x8, x8, x25
+ sbcs x9, x9, x28
+ sbcs x10, x10, x28
+ sbc x11, x11, x26
+ # Sub
+ subs x16, x12, x4
+ sbcs x17, x13, x5
+ sbcs x19, x14, x6
+ sbcs x20, x15, x7
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x8, x9, [x0]
+ stp x10, x11, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x0, [x29, #48]
+ ldr x1, [x29, #64]
+ ldr x2, [x29, #176]
+ # Multiply
+ ldp x12, x13, [x1]
+ ldp x14, x15, [x1, #16]
+ ldp x16, x17, [x2]
+ ldp x19, x20, [x2, #16]
+ # A[0] * B[0]
+ mul x4, x12, x16
+ umulh x5, x12, x16
+ # A[0] * B[1]
+ mul x25, x12, x17
+ umulh x6, x12, x17
+ adds x5, x5, x25
+ adc x6, x6, xzr
+ # A[1] * B[0]
+ mul x25, x13, x16
+ umulh x26, x13, x16
+ adds x5, x5, x25
+ adcs x6, x6, x26
+ adc x7, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x12, x19
+ umulh x26, x12, x19
+ adds x6, x6, x25
+ adc x7, x7, x26
+ # A[1] * B[1]
+ mul x25, x13, x17
+ umulh x26, x13, x17
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x14, x16
+ umulh x26, x14, x16
+ adds x6, x6, x25
+ adcs x7, x7, x26
+ adc x8, x8, xzr
+ # A[0] * B[3]
+ mul x25, x12, x20
+ umulh x26, x12, x20
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x13, x19
+ umulh x26, x13, x19
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[2] * B[1]
+ mul x25, x14, x17
+ umulh x26, x14, x17
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[3] * B[0]
+ mul x25, x15, x16
+ umulh x26, x15, x16
+ adds x7, x7, x25
+ adcs x8, x8, x26
+ adc x9, x9, xzr
+ # A[1] * B[3]
+ mul x25, x13, x20
+ umulh x26, x13, x20
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x14, x19
+ umulh x26, x14, x19
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[3] * B[1]
+ mul x25, x15, x17
+ umulh x26, x15, x17
+ adds x8, x8, x25
+ adcs x9, x9, x26
+ adc x10, x10, xzr
+ # A[2] * B[3]
+ mul x25, x14, x20
+ umulh x26, x14, x20
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x15, x19
+ umulh x26, x15, x19
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, x11, xzr
+ # A[3] * B[3]
+ mul x25, x15, x20
+ umulh x26, x15, x20
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x11, x11, x10, #63
+ extr x10, x10, x9, #63
+ extr x9, x9, x8, #63
+ extr x8, x8, x7, #63
+ and x7, x7, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x8
+ umulh x8, x25, x8
+ adds x4, x4, x26
+ mul x26, x25, x9
+ umulh x9, x25, x9
+ adcs x5, x5, x26
+ mul x26, x25, x10
+ umulh x10, x25, x10
+ adcs x6, x6, x26
+ mul x26, x25, x11
+ umulh x27, x25, x11
+ adcs x7, x7, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x5, x5, x8
+ adcs x6, x6, x9
+ adcs x7, x7, x10
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x7, #63
+ mul x27, x27, x25
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Reduce if top bit set
+ and x27, x25, x7, asr 63
+ and x7, x7, #0x7fffffffffffffff
+ adds x4, x4, x27
+ adcs x5, x5, xzr
+ adcs x6, x6, xzr
+ adc x7, x7, xzr
+ # Store
+ ldr x0, [x29, #48]
+ # Double
+ adds x4, x4, x4
+ adcs x5, x5, x5
+ adcs x6, x6, x6
+ adc x7, x7, x7
+ mov x25, #-19
+ asr x28, x7, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x4, x4, x25
+ sbcs x5, x5, x28
+ sbcs x6, x6, x28
+ sbc x7, x7, x26
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #184]
+ ldr x2, [x29, #72]
+ # Multiply
+ ldp x16, x17, [x1]
+ ldp x19, x20, [x1, #16]
+ ldp x21, x22, [x2]
+ ldp x23, x24, [x2, #16]
+ # A[0] * B[0]
+ mul x8, x16, x21
+ umulh x9, x16, x21
+ # A[0] * B[1]
+ mul x25, x16, x22
+ umulh x10, x16, x22
+ adds x9, x9, x25
+ adc x10, x10, xzr
+ # A[1] * B[0]
+ mul x25, x17, x21
+ umulh x26, x17, x21
+ adds x9, x9, x25
+ adcs x10, x10, x26
+ adc x11, xzr, xzr
+ # A[0] * B[2]
+ mul x25, x16, x23
+ umulh x26, x16, x23
+ adds x10, x10, x25
+ adc x11, x11, x26
+ # A[1] * B[1]
+ mul x25, x17, x22
+ umulh x26, x17, x22
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, xzr, xzr
+ # A[2] * B[0]
+ mul x25, x19, x21
+ umulh x26, x19, x21
+ adds x10, x10, x25
+ adcs x11, x11, x26
+ adc x12, x12, xzr
+ # A[0] * B[3]
+ mul x25, x16, x24
+ umulh x26, x16, x24
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, xzr, xzr
+ # A[1] * B[2]
+ mul x25, x17, x23
+ umulh x26, x17, x23
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[2] * B[1]
+ mul x25, x19, x22
+ umulh x26, x19, x22
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[3] * B[0]
+ mul x25, x20, x21
+ umulh x26, x20, x21
+ adds x11, x11, x25
+ adcs x12, x12, x26
+ adc x13, x13, xzr
+ # A[1] * B[3]
+ mul x25, x17, x24
+ umulh x26, x17, x24
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, xzr, xzr
+ # A[2] * B[2]
+ mul x25, x19, x23
+ umulh x26, x19, x23
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[3] * B[1]
+ mul x25, x20, x22
+ umulh x26, x20, x22
+ adds x12, x12, x25
+ adcs x13, x13, x26
+ adc x14, x14, xzr
+ # A[2] * B[3]
+ mul x25, x19, x24
+ umulh x26, x19, x24
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, xzr, xzr
+ # A[3] * B[2]
+ mul x25, x20, x23
+ umulh x26, x20, x23
+ adds x13, x13, x25
+ adcs x14, x14, x26
+ adc x15, x15, xzr
+ # A[3] * B[3]
+ mul x25, x20, x24
+ umulh x26, x20, x24
+ adds x14, x14, x25
+ adc x15, x15, x26
+ # Reduce
+ # Move top half into t4-t7 and remove top bit from t3
+ extr x15, x15, x14, #63
+ extr x14, x14, x13, #63
+ extr x13, x13, x12, #63
+ extr x12, x12, x11, #63
+ and x11, x11, #0x7fffffffffffffff
+ # Multiply top half by 19
+ mov x25, #19
+ mul x26, x25, x12
+ umulh x12, x25, x12
+ adds x8, x8, x26
+ mul x26, x25, x13
+ umulh x13, x25, x13
+ adcs x9, x9, x26
+ mul x26, x25, x14
+ umulh x14, x25, x14
+ adcs x10, x10, x26
+ mul x26, x25, x15
+ umulh x27, x25, x15
+ adcs x11, x11, x26
+ adc x27, x27, xzr
+ # Add remaining product results in
+ adds x9, x9, x12
+ adcs x10, x10, x13
+ adcs x11, x11, x14
+ adc x27, x27, xzr
+ # Overflow
+ extr x27, x27, x11, #63
+ mul x27, x27, x25
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Reduce if top bit set
+ and x27, x25, x11, asr 63
+ and x11, x11, #0x7fffffffffffffff
+ adds x8, x8, x27
+ adcs x9, x9, xzr
+ adcs x10, x10, xzr
+ adc x11, x11, xzr
+ # Store
+ ldr x0, [x29, #40]
+ ldr x1, [x29, #32]
+ # Add
+ adds x12, x4, x8
+ adcs x13, x5, x9
+ adcs x14, x6, x10
+ adc x15, x7, x11
+ mov x25, #-19
+ asr x28, x15, #63
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Sub modulus (if overflow)
+ subs x12, x12, x25
+ sbcs x13, x13, x28
+ sbcs x14, x14, x28
+ sbc x15, x15, x26
+ # Sub
+ subs x16, x4, x8
+ sbcs x17, x5, x9
+ sbcs x19, x6, x10
+ sbcs x20, x7, x11
+ mov x25, #-19
+ csetm x28, cc
+ # Mask the modulus
+ and x25, x28, x25
+ and x26, x28, #0x7fffffffffffffff
+ # Add modulus (if underflow)
+ adds x16, x16, x25
+ adcs x17, x17, x28
+ adcs x19, x19, x28
+ adc x20, x20, x26
+ stp x12, x13, [x0]
+ stp x14, x15, [x0, #16]
+ stp x16, x17, [x1]
+ stp x19, x20, [x1, #16]
+ ldr x17, [x29, #88]
+ ldr x19, [x29, #96]
+ ldp x20, x21, [x29, #104]
+ ldp x22, x23, [x29, #120]
+ ldp x24, x25, [x29, #136]
+ ldp x26, x27, [x29, #152]
+ ldr x28, [x29, #168]
+ ldp x29, x30, [sp], #0xb0
+ ret
+ .size fe_ge_sub,.-fe_ge_sub
+#endif /* __aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c
new file mode 100644
index 0000000..d1ab4c8
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.c
@@ -0,0 +1,6725 @@
+/* armv8-curve25519
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c
+ */
+#ifdef __aarch64__
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/fe_operations.h>
+#include <stdint.h>
+
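+/* The generated fe_init is deliberately empty: this implementation has no
+ * tables or hardware state to set up, and the function presumably exists
+ * only so the common fe_operations interface stays uniform. */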
+void fe_init()
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ :
+ :
+ : "memory"
+ );
+}
+
+void fe_frombytes(fe out, const unsigned char* in)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "ldp x2, x3, [%x[in]]\n\t"
+ "ldp x4, x5, [%x[in], #16]\n\t"
+ "and x5, x5, #0x7fffffffffffffff\n\t"
+ "stp x2, x3, [%x[out]]\n\t"
+ "stp x4, x5, [%x[out], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [out] "+r" (out), [in] "+r" (in)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6"
+ );
+}
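+/* fe_frombytes assumes a little-endian target: the 32 input bytes are
+ * loaded directly as four 64-bit limbs and only bit 255 is cleared,
+ * matching the curve25519 convention of ignoring the top bit. */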
+
+void fe_tobytes(unsigned char* out, const fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x7, #19\n\t"
+ "ldp x2, x3, [%x[n]]\n\t"
+ "ldp x4, x5, [%x[n], #16]\n\t"
+ "adds x6, x2, x7\n\t"
+ "adcs x6, x3, xzr\n\t"
+ "adcs x6, x4, xzr\n\t"
+ "adc x6, x5, xzr\n\t"
+ "and x6, x7, x6, asr 63\n\t"
+ "adds x2, x2, x6\n\t"
+ "adcs x3, x3, xzr\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adc x5, x5, xzr\n\t"
+ "and x5, x5, #0x7fffffffffffffff\n\t"
+ "stp x2, x3, [%x[out]]\n\t"
+ "stp x4, x5, [%x[out], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [out] "+r" (out), [n] "+r" (n)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7"
+ );
+}
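+
+/* The store canonicalises first: adding 19 and checking bit 255 of the sum
+ * decides whether n >= p = 2^255 - 19, in which case p is removed by adding
+ * 19 and clearing bit 255. A portable sketch under the same four-limb
+ * assumption (and n < 2^255, which the other fe routines maintain), kept
+ * out of the build with #if 0: */
+#if 0
+static void fe_tobytes_sketch(unsigned char* out, const fe n)
+{
+    unsigned __int128 c = (unsigned __int128)n[0] + 19;
+    uint64_t t[4], fold;
+    int i, j;
+    /* Probe: does n + 19 reach bit 255? Equivalent to n >= p. */
+    c = (c >> 64) + n[1];
+    c = (c >> 64) + n[2];
+    c = (c >> 64) + n[3];
+    fold = 19 & (uint64_t)((int64_t)(uint64_t)c >> 63);
+    /* If so, adding 19 and dropping bit 255 subtracts p. */
+    c = (unsigned __int128)n[0] + fold;
+    t[0] = (uint64_t)c;
+    c = (c >> 64) + n[1]; t[1] = (uint64_t)c;
+    c = (c >> 64) + n[2]; t[2] = (uint64_t)c;
+    c = (c >> 64) + n[3]; t[3] = (uint64_t)c & 0x7fffffffffffffffULL;
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 8; j++)
+            out[i * 8 + j] = (unsigned char)(t[i] >> (8 * j));
+}
+#endif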
+
+void fe_1(fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Set one */
+ "mov x1, #1\n\t"
+ "stp x1, xzr, [%x[n]]\n\t"
+ "stp xzr, xzr, [%x[n], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory", "x1"
+ );
+}
+
+void fe_0(fe n)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Set zero */
+ "stp xzr, xzr, [%x[n]]\n\t"
+ "stp xzr, xzr, [%x[n], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [n] "+r" (n)
+ :
+ : "memory"
+ );
+}
+
+void fe_copy(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Copy */
+ "ldp x2, x3, [%x[a]]\n\t"
+ "ldp x4, x5, [%x[a], #16]\n\t"
+ "stp x2, x3, [%x[r]]\n\t"
+ "stp x4, x5, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5"
+ );
+}
+
+void fe_sub(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Sub */
+ "ldp x3, x4, [%x[a]]\n\t"
+ "ldp x5, x6, [%x[a], #16]\n\t"
+ "ldp x7, x8, [%x[b]]\n\t"
+ "ldp x9, x10, [%x[b], #16]\n\t"
+ "subs x3, x3, x7\n\t"
+ "sbcs x4, x4, x8\n\t"
+ "sbcs x5, x5, x9\n\t"
+ "sbcs x6, x6, x10\n\t"
+ "mov x12, #-19\n\t"
+ "csetm x11, cc\n\t"
+ /* Mask the modulus */
+ "and x12, x11, x12\n\t"
+ "and x13, x11, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x3, x3, x12\n\t"
+ "adcs x4, x4, x11\n\t"
+ "adcs x5, x5, x11\n\t"
+ "adc x6, x6, x13\n\t"
+ "stp x3, x4, [%x[r]]\n\t"
+ "stp x5, x6, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
+ );
+}
+
+void fe_add(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Add */
+ "ldp x3, x4, [%x[a]]\n\t"
+ "ldp x5, x6, [%x[a], #16]\n\t"
+ "ldp x7, x8, [%x[b]]\n\t"
+ "ldp x9, x10, [%x[b], #16]\n\t"
+ "adds x3, x3, x7\n\t"
+ "adcs x4, x4, x8\n\t"
+ "adcs x5, x5, x9\n\t"
+ "adc x6, x6, x10\n\t"
+ "mov x12, #-19\n\t"
+ "asr x11, x6, #63\n\t"
+ /* Mask the modulus */
+ "and x12, x11, x12\n\t"
+ "and x13, x11, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x3, x3, x12\n\t"
+ "sbcs x4, x4, x11\n\t"
+ "sbcs x5, x5, x11\n\t"
+ "sbc x6, x6, x13\n\t"
+ "stp x3, x4, [%x[r]]\n\t"
+ "stp x5, x6, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
+ );
+}
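+
+/* fe_sub and fe_add share one pattern: do the plain 256-bit subtract/add,
+ * turn the borrow (csetm) or the overflow bit 255 (asr) into an all-ones
+ * mask, and add or subtract p = 2^255 - 19 under that mask, with no branch
+ * on secret data. A sketch of the addition, kept out of the build: */
+#if 0
+static void fe_add_sketch(fe r, const fe a, const fe b)
+{
+    static const uint64_t p[4] = {
+        0xffffffffffffffedULL, 0xffffffffffffffffULL,
+        0xffffffffffffffffULL, 0x7fffffffffffffffULL
+    };
+    unsigned __int128 c = 0;
+    uint64_t m;
+    int i;
+    for (i = 0; i < 4; i++) {
+        c += (unsigned __int128)a[i] + b[i];
+        r[i] = (uint64_t)c;
+        c >>= 64;
+    }
+    /* Bit 255 set means the sum reached 2^255 > p; m is then all-ones. */
+    m = (uint64_t)((int64_t)r[3] >> 63);
+    c = 0; /* reused as the borrow */
+    for (i = 0; i < 4; i++) {
+        unsigned __int128 d = (unsigned __int128)r[i] - (p[i] & m)
+                            - (uint64_t)c;
+        r[i] = (uint64_t)d;
+        c = (d >> 64) & 1;
+    }
+}
+#endif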
+
+void fe_neg(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "ldp x2, x3, [%x[a]]\n\t"
+ "ldp x4, x5, [%x[a], #16]\n\t"
+ "mov x6, #-19\n\t"
+ "mov x7, #-1\n\t"
+ "mov x8, #-1\n\t"
+ "mov x9, #0x7fffffffffffffff\n\t"
+ "subs x6, x6, x2\n\t"
+ "sbcs x7, x7, x3\n\t"
+ "sbcs x8, x8, x4\n\t"
+ "sbc x9, x9, x5\n\t"
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9"
+ );
+}
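+
+/* The four mov constants above are the limbs of p = 2^255 - 19, so the
+ * subtraction yields p - a, which is -a (mod p). */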
+
+int fe_isnonzero(const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x6, #19\n\t"
+ "ldp x1, x2, [%x[a]]\n\t"
+ "ldp x3, x4, [%x[a], #16]\n\t"
+ "adds x5, x1, x6\n\t"
+ "adcs x5, x2, xzr\n\t"
+ "adcs x5, x3, xzr\n\t"
+ "adc x5, x4, xzr\n\t"
+ "and x5, x6, x5, asr 63\n\t"
+ "adds x1, x1, x5\n\t"
+ "adcs x2, x2, xzr\n\t"
+ "adcs x3, x3, xzr\n\t"
+ "adc x4, x4, xzr\n\t"
+ "and x4, x4, #0x7fffffffffffffff\n\t"
+ "orr %x[a], x1, x2\n\t"
+ "orr x3, x3, x4\n\t"
+ "orr %x[a], %x[a], x3\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
+ );
+ return (uint32_t)(size_t)a;
+}
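+
+/* The add-19/fold pair canonicalises a first, so that p itself (congruent
+ * to zero) does not read as nonzero, before OR-ing all four limbs into the
+ * return value. */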
+
+int fe_isnegative(const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x6, #19\n\t"
+ "ldp x1, x2, [%x[a]]\n\t"
+ "ldp x3, x4, [%x[a], #16]\n\t"
+ "adds x5, x1, x6\n\t"
+ "adcs x5, x2, xzr\n\t"
+ "adcs x5, x3, xzr\n\t"
+ "adc x5, x4, xzr\n\t"
+ "and %x[a], x1, #1\n\t"
+ "eor %x[a], %x[a], x5, lsr 63\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [a] "+r" (a)
+ :
+ : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
+ );
+ return (uint32_t)(size_t)a;
+}
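+
+/* "Negative" is the usual Ed25519 convention: the low bit of the canonical
+ * form. The add-19 probe leaves (a >= p) in bit 63 of x5, and since p is
+ * odd, XOR-ing that bit with the low bit of a gives the canonical parity
+ * without materialising a - p. */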
+
+void fe_cmov_table(fe* r, fe* base, signed char b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-32]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[r], [x29, #16]\n\t"
+ "sxtb %x[b], %w[b]\n\t"
+ "sbfx x3, %x[b], #7, #1\n\t"
+ "eor %x[r], %x[b], x3\n\t"
+ "sub %x[r], %x[r], x3\n\t"
+ "mov x4, #1\n\t"
+ "mov x5, xzr\n\t"
+ "mov x6, xzr\n\t"
+ "mov x7, xzr\n\t"
+ "mov x8, #1\n\t"
+ "mov x9, xzr\n\t"
+ "mov x10, xzr\n\t"
+ "mov x11, xzr\n\t"
+ "mov x12, xzr\n\t"
+ "mov x13, xzr\n\t"
+ "mov x14, xzr\n\t"
+ "mov x15, xzr\n\t"
+ "cmp %x[r], #1\n\t"
+ "ldp x16, x17, [%x[base]]\n\t"
+ "ldp x19, x20, [%x[base], #16]\n\t"
+ "ldp x21, x22, [%x[base], #32]\n\t"
+ "ldp x23, x24, [%x[base], #48]\n\t"
+ "ldp x25, x26, [%x[base], #64]\n\t"
+ "ldp x27, x28, [%x[base], #80]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #2\n\t"
+ "ldp x16, x17, [%x[base], #96]\n\t"
+ "ldp x19, x20, [%x[base], #112]\n\t"
+ "ldp x21, x22, [%x[base], #128]\n\t"
+ "ldp x23, x24, [%x[base], #144]\n\t"
+ "ldp x25, x26, [%x[base], #160]\n\t"
+ "ldp x27, x28, [%x[base], #176]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #3\n\t"
+ "ldp x16, x17, [%x[base], #192]\n\t"
+ "ldp x19, x20, [%x[base], #208]\n\t"
+ "ldp x21, x22, [%x[base], #224]\n\t"
+ "ldp x23, x24, [%x[base], #240]\n\t"
+ "ldp x25, x26, [%x[base], #256]\n\t"
+ "ldp x27, x28, [%x[base], #272]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #4\n\t"
+ "ldp x16, x17, [%x[base], #288]\n\t"
+ "ldp x19, x20, [%x[base], #304]\n\t"
+ "ldp x21, x22, [%x[base], #320]\n\t"
+ "ldp x23, x24, [%x[base], #336]\n\t"
+ "ldp x25, x26, [%x[base], #352]\n\t"
+ "ldp x27, x28, [%x[base], #368]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "add %x[base], %x[base], #0x180\n\t"
+ "cmp %x[r], #5\n\t"
+ "ldp x16, x17, [%x[base]]\n\t"
+ "ldp x19, x20, [%x[base], #16]\n\t"
+ "ldp x21, x22, [%x[base], #32]\n\t"
+ "ldp x23, x24, [%x[base], #48]\n\t"
+ "ldp x25, x26, [%x[base], #64]\n\t"
+ "ldp x27, x28, [%x[base], #80]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #6\n\t"
+ "ldp x16, x17, [%x[base], #96]\n\t"
+ "ldp x19, x20, [%x[base], #112]\n\t"
+ "ldp x21, x22, [%x[base], #128]\n\t"
+ "ldp x23, x24, [%x[base], #144]\n\t"
+ "ldp x25, x26, [%x[base], #160]\n\t"
+ "ldp x27, x28, [%x[base], #176]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #7\n\t"
+ "ldp x16, x17, [%x[base], #192]\n\t"
+ "ldp x19, x20, [%x[base], #208]\n\t"
+ "ldp x21, x22, [%x[base], #224]\n\t"
+ "ldp x23, x24, [%x[base], #240]\n\t"
+ "ldp x25, x26, [%x[base], #256]\n\t"
+ "ldp x27, x28, [%x[base], #272]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "cmp %x[r], #8\n\t"
+ "ldp x16, x17, [%x[base], #288]\n\t"
+ "ldp x19, x20, [%x[base], #304]\n\t"
+ "ldp x21, x22, [%x[base], #320]\n\t"
+ "ldp x23, x24, [%x[base], #336]\n\t"
+ "ldp x25, x26, [%x[base], #352]\n\t"
+ "ldp x27, x28, [%x[base], #368]\n\t"
+ "csel x4, x16, x4, eq\n\t"
+ "csel x5, x17, x5, eq\n\t"
+ "csel x6, x19, x6, eq\n\t"
+ "csel x7, x20, x7, eq\n\t"
+ "csel x8, x21, x8, eq\n\t"
+ "csel x9, x22, x9, eq\n\t"
+ "csel x10, x23, x10, eq\n\t"
+ "csel x11, x24, x11, eq\n\t"
+ "csel x12, x25, x12, eq\n\t"
+ "csel x13, x26, x13, eq\n\t"
+ "csel x14, x27, x14, eq\n\t"
+ "csel x15, x28, x15, eq\n\t"
+ "mov x16, #-19\n\t"
+ "mov x17, #-1\n\t"
+ "mov x19, #-1\n\t"
+ "mov x20, #0x7fffffffffffffff\n\t"
+ "subs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "sbcs x19, x19, x14\n\t"
+ "sbc x20, x20, x15\n\t"
+ "cmp %x[b], #0\n\t"
+ "mov x3, x4\n\t"
+ "csel x4, x8, x4, lt\n\t"
+ "csel x8, x3, x8, lt\n\t"
+ "mov x3, x5\n\t"
+ "csel x5, x9, x5, lt\n\t"
+ "csel x9, x3, x9, lt\n\t"
+ "mov x3, x6\n\t"
+ "csel x6, x10, x6, lt\n\t"
+ "csel x10, x3, x10, lt\n\t"
+ "mov x3, x7\n\t"
+ "csel x7, x11, x7, lt\n\t"
+ "csel x11, x3, x11, lt\n\t"
+ "csel x12, x16, x12, lt\n\t"
+ "csel x13, x17, x13, lt\n\t"
+ "csel x14, x19, x14, lt\n\t"
+ "csel x15, x20, x15, lt\n\t"
+ "ldr %x[r], [x29, #16]\n\t"
+ "stp x4, x5, [%x[r]]\n\t"
+ "stp x6, x7, [%x[r], #16]\n\t"
+ "stp x8, x9, [%x[r], #32]\n\t"
+ "stp x10, x11, [%x[r], #48]\n\t"
+ "stp x12, x13, [%x[r], #64]\n\t"
+ "stp x14, x15, [%x[r], #80]\n\t"
+ "ldp x29, x30, [sp], #32\n\t"
+ : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
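+
+/* The code above is a constant-time table lookup: all eight entries (three
+ * field elements each: y+x, y-x and xy*2d) are loaded, and csel keeps the
+ * one whose index equals |b|, with |b| == 0 selecting the neutral element
+ * (1, 1, 0). For negative b the first two elements are swapped and the
+ * third is negated mod p. A sketch, assuming fe is four 64-bit limbs and
+ * the entries are laid out consecutively; kept out of the build: */
+#if 0
+static void fe_cmov_table_sketch(fe* r, fe* base, signed char b)
+{
+    static const uint64_t p[4] = {
+        0xffffffffffffffedULL, 0xffffffffffffffffULL,
+        0xffffffffffffffffULL, 0x7fffffffffffffffULL
+    };
+    int64_t v = b;
+    uint64_t neg = (uint64_t)(v >> 7);                    /* all-ones iff b < 0 */
+    uint64_t idx = (uint64_t)((v ^ (v >> 7)) - (v >> 7)); /* |b| */
+    uint64_t t[12] = { 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
+    uint64_t n[4], s;
+    uint64_t bw = 0;
+    unsigned __int128 w;
+    unsigned i, j;
+    for (i = 0; i < 8; i++) {
+        /* m is all-ones exactly when idx == i + 1. */
+        uint64_t x = idx ^ (i + 1);
+        uint64_t m = (uint64_t)0 - (1 ^ ((x | ((uint64_t)0 - x)) >> 63));
+        const uint64_t* e = (const uint64_t*)(base + 3 * i);
+        for (j = 0; j < 12; j++)
+            t[j] ^= m & (t[j] ^ e[j]);
+    }
+    for (j = 0; j < 4; j++) {         /* conditional swap of y+x and y-x */
+        s = neg & (t[j] ^ t[4 + j]);
+        t[j] ^= s;
+        t[4 + j] ^= s;
+    }
+    for (j = 0; j < 4; j++) {         /* n = p - xy*2d */
+        w = (unsigned __int128)p[j] - t[8 + j] - bw;
+        n[j] = (uint64_t)w;
+        bw = (uint64_t)(w >> 64) & 1;
+    }
+    for (j = 0; j < 4; j++)           /* conditional negate */
+        t[8 + j] ^= neg & (t[8 + j] ^ n[j]);
+    for (j = 0; j < 12; j++)
+        ((uint64_t*)r)[j] = t[j];
+}
+#endif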
+
+void fe_mul(fe r, const fe a, const fe b)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Multiply */
+ "ldp x14, x15, [%x[a]]\n\t"
+ "ldp x16, x17, [%x[a], #16]\n\t"
+ "ldp x19, x20, [%x[b]]\n\t"
+ "ldp x21, x22, [%x[b], #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x6, x14, x19\n\t"
+ "umulh x7, x14, x19\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x20\n\t"
+ "umulh x8, x14, x20\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x19\n\t"
+ "umulh x4, x15, x19\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x21\n\t"
+ "umulh x4, x14, x21\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x20\n\t"
+ "umulh x4, x15, x20\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x19\n\t"
+ "umulh x4, x16, x19\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x22\n\t"
+ "umulh x4, x14, x22\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x21\n\t"
+ "umulh x4, x15, x21\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x20\n\t"
+ "umulh x4, x16, x20\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x19\n\t"
+ "umulh x4, x17, x19\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs x10, x10, x4\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x22\n\t"
+ "umulh x4, x15, x22\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x21\n\t"
+ "umulh x4, x16, x21\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x20\n\t"
+ "umulh x4, x17, x20\n\t"
+ "adds x10, x10, x3\n\t"
+ "adcs x11, x11, x4\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x22\n\t"
+ "umulh x4, x16, x22\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x21\n\t"
+ "umulh x4, x17, x21\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x22\n\t"
+ "umulh x4, x17, x22\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x10\n\t"
+ "umulh x10, x3, x10\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x11\n\t"
+ "umulh x11, x3, x11\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x12\n\t"
+ "umulh x12, x3, x12\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x13\n\t"
+ "umulh x5, x3, x13\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, x10\n\t"
+ "adcs x8, x8, x11\n\t"
+ "adcs x9, x9, x12\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22"
+ );
+}
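+
+/* After the 4x4 schoolbook multiply the 512-bit product is reduced with
+ * 2^255 = 19 (mod p): split at bit 255, fold the high part back in times
+ * 19, and repeat for the stray carries. A sketch of just the reduction,
+ * taking the product in t[0..7] least significant limb first; kept out of
+ * the build: */
+#if 0
+static void fe_reduce_sketch(fe r, const uint64_t t[8])
+{
+    unsigned __int128 c;
+    uint64_t top[4], fold;
+    int i;
+    /* top = t >> 255 (the product is < 2^510, so four limbs suffice). */
+    top[0] = (t[3] >> 63) | (t[4] << 1);
+    top[1] = (t[4] >> 63) | (t[5] << 1);
+    top[2] = (t[5] >> 63) | (t[6] << 1);
+    top[3] = (t[6] >> 63) | (t[7] << 1);
+    /* r = (t mod 2^255) + 19 * top */
+    c = 0;
+    for (i = 0; i < 4; i++) {
+        c += (unsigned __int128)19 * top[i]
+           + (i == 3 ? t[3] & 0x7fffffffffffffffULL : t[i]);
+        r[i] = (uint64_t)c;
+        c >>= 64;
+    }
+    /* Fold the leftover carry and bit 255 back in the same way... */
+    fold = ((uint64_t)c << 1) | (r[3] >> 63);
+    r[3] &= 0x7fffffffffffffffULL;
+    c = (unsigned __int128)fold * 19;
+    for (i = 0; i < 4; i++) {
+        c += r[i];
+        r[i] = (uint64_t)c;
+        c >>= 64;
+    }
+    /* ...and once more for the single bit that can reappear. */
+    fold = 19 & (uint64_t)((int64_t)r[3] >> 63);
+    r[3] &= 0x7fffffffffffffffULL;
+    c = fold;
+    for (i = 0; i < 4; i++) {
+        c += r[i];
+        r[i] = (uint64_t)c;
+        c >>= 64;
+    }
+}
+#endif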
+
+void fe_sq(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Square */
+ "ldp x13, x14, [%x[a]]\n\t"
+ "ldp x15, x16, [%x[a], #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x6, x13, x14\n\t"
+ "umulh x7, x13, x14\n\t"
+ /* A[0] * A[2] */
+ "mul x2, x13, x15\n\t"
+ "umulh x8, x13, x15\n\t"
+ "adds x7, x7, x2\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x2, x13, x16\n\t"
+ "umulh x9, x13, x16\n\t"
+ "adds x8, x8, x2\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x2, x14, x15\n\t"
+ "umulh x3, x14, x15\n\t"
+ "adds x8, x8, x2\n\t"
+ "adcs x9, x9, x3\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x2, x14, x16\n\t"
+ "umulh x3, x14, x16\n\t"
+ "adds x9, x9, x2\n\t"
+ "adc x10, x10, x3\n\t"
+ /* A[2] * A[3] */
+ "mul x2, x15, x16\n\t"
+ "umulh x11, x15, x16\n\t"
+ "adds x10, x10, x2\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Double */
+ "adds x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adcs x11, x11, x11\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x5, x13, x13\n\t"
+ "umulh x4, x13, x13\n\t"
+ /* A[1] * A[1] */
+ "mul x2, x14, x14\n\t"
+ "umulh x3, x14, x14\n\t"
+ "adds x6, x6, x4\n\t"
+ "adcs x7, x7, x2\n\t"
+ "adc x4, x3, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x2, x15, x15\n\t"
+ "umulh x3, x15, x15\n\t"
+ "adds x8, x8, x4\n\t"
+ "adcs x9, x9, x2\n\t"
+ "adc x4, x3, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x2, x16, x16\n\t"
+ "umulh x3, x16, x16\n\t"
+ "adds x10, x10, x4\n\t"
+ "adcs x11, x11, x2\n\t"
+ "adc x12, x12, x3\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x12, x12, x11, #63\n\t"
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x2, #19\n\t"
+ "mul x3, x2, x9\n\t"
+ "umulh x9, x2, x9\n\t"
+ "adds x5, x5, x3\n\t"
+ "mul x3, x2, x10\n\t"
+ "umulh x10, x2, x10\n\t"
+ "adcs x6, x6, x3\n\t"
+ "mul x3, x2, x11\n\t"
+ "umulh x11, x2, x11\n\t"
+ "adcs x7, x7, x3\n\t"
+ "mul x3, x2, x12\n\t"
+ "umulh x4, x2, x12\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x4, x4, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adcs x8, x8, x11\n\t"
+ "adc x4, x4, xzr\n\t"
+ /* Overflow */
+ "extr x4, x4, x8, #63\n\t"
+ "mul x4, x4, x2\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ "adds x5, x5, x4\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x4, x2, x8, asr 63\n\t"
+ "and x8, x8, #0x7fffffffffffffff\n\t"
+ "adds x5, x5, x4\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* Store */
+ "stp x5, x6, [%x[r]]\n\t"
+ "stp x7, x8, [%x[r], #16]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16"
+ );
+}
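+
+/* Squaring halves the multiply count: only the a[i]*a[j], i < j, cross
+ * products are formed, the running sum is doubled, and the diagonal
+ * a[i]*a[i] terms are added before the same 19-fold reduction as fe_mul. */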
+
+void fe_invert(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-160]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* Invert */
+ "str %x[r], [x29, #144]\n\t"
+ "str %x[a], [x29, #152]\n\t"
+ "add x0, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "ldr x1, [x29, #152]\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #16\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #4\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert1_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #9\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert2_%=\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #19\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_fe_invert3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert3_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #10\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert4_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #49\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert5_%=\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x20, #0x63\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_fe_invert6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert6_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #50\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_invert7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert7_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x20, #5\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_invert8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x20, x20, #1\n\t"
+ "cmp x20, #0\n\t"
+ "bne L_fe_invert8_%=\n\t"
+ "ldr x0, [x29, #144]\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "ldp x29, x30, [sp], #0xa0\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x20"
+ );
+}
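+
+/* Inversion is Fermat's little theorem: r = a^(p-2) = a^(2^255 - 21). The
+ * bl fe_sq / bl fe_mul schedule above is the standard Curve25519 addition
+ * chain for that exponent; the same schedule written against the C API
+ * (one fe_sq per squaring in each loop), kept out of the build: */
+#if 0
+static void fe_invert_sketch(fe r, const fe a)
+{
+    fe t0, t1, t2, t3;
+    int i;
+    fe_sq(t0, a);                            /* a^2 */
+    fe_sq(t1, t0);
+    fe_sq(t1, t1);                           /* a^8 */
+    fe_mul(t1, a, t1);                       /* a^9 */
+    fe_mul(t0, t0, t1);                      /* a^11 */
+    fe_sq(t2, t0);                           /* a^22 */
+    fe_mul(t1, t1, t2);                      /* a^31 = a^(2^5 - 1) */
+    fe_sq(t2, t1);
+    for (i = 0; i < 4; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                      /* a^(2^10 - 1) */
+    fe_sq(t2, t1);
+    for (i = 0; i < 9; i++) fe_sq(t2, t2);
+    fe_mul(t2, t2, t1);                      /* a^(2^20 - 1) */
+    fe_sq(t3, t2);
+    for (i = 0; i < 19; i++) fe_sq(t3, t3);
+    fe_mul(t2, t3, t2);                      /* a^(2^40 - 1) */
+    for (i = 0; i < 10; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                      /* a^(2^50 - 1) */
+    fe_sq(t2, t1);
+    for (i = 0; i < 49; i++) fe_sq(t2, t2);
+    fe_mul(t2, t2, t1);                      /* a^(2^100 - 1) */
+    fe_sq(t3, t2);
+    for (i = 0; i < 99; i++) fe_sq(t3, t3);
+    fe_mul(t2, t3, t2);                      /* a^(2^200 - 1) */
+    for (i = 0; i < 50; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                      /* a^(2^250 - 1) */
+    for (i = 0; i < 5; i++) fe_sq(t1, t1);
+    fe_mul(r, t1, t0);                       /* a^(2^255 - 21) = a^(p-2) */
+}
+#endif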
+
+int curve25519(byte* r, byte* n, byte* a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-192]!\n\t"
+ "add x29, sp, #0\n\t"
+ "mov x23, xzr\n\t"
+ "str %x[r], [x29, #176]\n\t"
+ "str %x[a], [x29, #184]\n\t"
+ /* Copy */
+ "ldp x6, x7, [%x[a]]\n\t"
+ "ldp x8, x9, [%x[a], #16]\n\t"
+ "stp x6, x7, [x29, #80]\n\t"
+ "stp x8, x9, [x29, #96]\n\t"
+ /* Set one */
+ "mov %x[a], #1\n\t"
+ "stp %x[a], xzr, [%x[r]]\n\t"
+ "stp xzr, xzr, [%x[r], #16]\n\t"
+ /* Set zero */
+ "stp xzr, xzr, [x29, #16]\n\t"
+ "stp xzr, xzr, [x29, #32]\n\t"
+ /* Set one */
+ "mov %x[a], #1\n\t"
+ "stp %x[a], xzr, [x29, #48]\n\t"
+ "stp xzr, xzr, [x29, #64]\n\t"
+ "mov x25, #62\n\t"
+ "mov x24, #24\n\t"
+ "\n"
+ "L_curve25519_words_%=: \n\t"
+ "\n"
+ "L_curve25519_bits_%=: \n\t"
+ "ldr %x[a], [%x[n], x24]\n\t"
+ "lsr %x[a], %x[a], x25\n\t"
+ "and %x[a], %x[a], #1\n\t"
+ "eor x23, x23, %x[a]\n\t"
+ /* Conditional Swap */
+ "cmp x23, #1\n\t"
+ "ldp x10, x11, [%x[r]]\n\t"
+ "ldp x12, x13, [%x[r], #16]\n\t"
+ "ldp x6, x7, [x29, #80]\n\t"
+ "ldp x8, x9, [x29, #96]\n\t"
+ "csel x14, x10, x6, eq\n\t"
+ "csel x10, x6, x10, eq\n\t"
+ "csel x15, x11, x7, eq\n\t"
+ "csel x11, x7, x11, eq\n\t"
+ "csel x16, x12, x8, eq\n\t"
+ "csel x12, x8, x12, eq\n\t"
+ "csel x17, x13, x9, eq\n\t"
+ "csel x13, x9, x13, eq\n\t"
+ /* Conditional Swap */
+ "cmp x23, #1\n\t"
+ "ldp x19, x20, [x29, #16]\n\t"
+ "ldp x21, x22, [x29, #32]\n\t"
+ "ldp x6, x7, [x29, #48]\n\t"
+ "ldp x8, x9, [x29, #64]\n\t"
+ "csel x5, x19, x6, eq\n\t"
+ "csel x19, x6, x19, eq\n\t"
+ "csel x26, x20, x7, eq\n\t"
+ "csel x20, x7, x20, eq\n\t"
+ "csel x27, x21, x8, eq\n\t"
+ "csel x21, x8, x21, eq\n\t"
+ "csel x28, x22, x9, eq\n\t"
+ "csel x22, x9, x22, eq\n\t"
+ "mov x23, %x[a]\n\t"
+ /* Add */
+ "adds x6, x10, x19\n\t"
+ "adcs x7, x11, x20\n\t"
+ "adcs x8, x12, x21\n\t"
+ "adc x9, x13, x22\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x9, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x6, x6, x3\n\t"
+ "sbcs x7, x7, %x[a]\n\t"
+ "sbcs x8, x8, %x[a]\n\t"
+ "sbc x9, x9, x4\n\t"
+ /* Sub */
+ "subs x19, x10, x19\n\t"
+ "sbcs x20, x11, x20\n\t"
+ "sbcs x21, x12, x21\n\t"
+ "sbcs x22, x13, x22\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, %x[a]\n\t"
+ "adcs x21, x21, %x[a]\n\t"
+ "adc x22, x22, x4\n\t"
+ "stp x19, x20, [x29, #144]\n\t"
+ "stp x21, x22, [x29, #160]\n\t"
+ /* Add */
+ "adds x10, x14, x5\n\t"
+ "adcs x11, x15, x26\n\t"
+ "adcs x12, x16, x27\n\t"
+ "adc x13, x17, x28\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Sub */
+ "subs x14, x14, x5\n\t"
+ "sbcs x15, x15, x26\n\t"
+ "sbcs x16, x16, x27\n\t"
+ "sbcs x17, x17, x28\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, %x[a]\n\t"
+ "adcs x16, x16, %x[a]\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x19, x14, x6\n\t"
+ "umulh x20, x14, x6\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x7\n\t"
+ "umulh x21, x14, x7\n\t"
+ "adds x20, x20, x3\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x6\n\t"
+ "umulh x4, x15, x6\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x8\n\t"
+ "umulh x4, x14, x8\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x7\n\t"
+ "umulh x4, x15, x7\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x6\n\t"
+ "umulh x4, x16, x6\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x9\n\t"
+ "umulh x4, x14, x9\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x8\n\t"
+ "umulh x4, x15, x8\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x7\n\t"
+ "umulh x4, x16, x7\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x6\n\t"
+ "umulh x4, x17, x6\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x9\n\t"
+ "umulh x4, x15, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x8\n\t"
+ "umulh x4, x16, x8\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x7\n\t"
+ "umulh x4, x17, x7\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x9\n\t"
+ "umulh x4, x16, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x8\n\t"
+ "umulh x4, x17, x8\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x9\n\t"
+ "umulh x4, x17, x9\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x22, #63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x19, x19, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x20, x20, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x21, x21, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x20, x20, %x[a]\n\t"
+ "adcs x21, x21, x26\n\t"
+ "adcs x22, x22, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x22, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x22, asr 63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Store */
+ "stp x19, x20, [x29, #112]\n\t"
+ "stp x21, x22, [x29, #128]\n\t"
+ /* Multiply */
+ "ldp %x[a], x26, [x29, #144]\n\t"
+ "ldp x27, x28, [x29, #160]\n\t"
+ /* A[0] * B[0] */
+ "mul x19, x10, %x[a]\n\t"
+ "umulh x20, x10, %x[a]\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x10, x26\n\t"
+ "umulh x21, x10, x26\n\t"
+ "adds x20, x20, x3\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x11, %x[a]\n\t"
+ "umulh x4, x11, %x[a]\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x10, x27\n\t"
+ "umulh x4, x10, x27\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x11, x26\n\t"
+ "umulh x4, x11, x26\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x12, %x[a]\n\t"
+ "umulh x4, x12, %x[a]\n\t"
+ "adds x21, x21, x3\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x10, x28\n\t"
+ "umulh x4, x10, x28\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x11, x27\n\t"
+ "umulh x4, x11, x27\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x12, x26\n\t"
+ "umulh x4, x12, x26\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x13, %x[a]\n\t"
+ "umulh x4, x13, %x[a]\n\t"
+ "adds x22, x22, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x11, x28\n\t"
+ "umulh x4, x11, x28\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x12, x27\n\t"
+ "umulh x4, x12, x27\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x13, x26\n\t"
+ "umulh x4, x13, x26\n\t"
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, x4\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x12, x28\n\t"
+ "umulh x4, x12, x28\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x13, x27\n\t"
+ "umulh x4, x13, x27\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x13, x28\n\t"
+ "umulh x4, x13, x28\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x22, #63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x14\n\t"
+ "umulh x14, x3, x14\n\t"
+ "adds x19, x19, x4\n\t"
+ "mul x4, x3, x15\n\t"
+ "umulh x15, x3, x15\n\t"
+ "adcs x20, x20, x4\n\t"
+ "mul x4, x3, x16\n\t"
+ "umulh x16, x3, x16\n\t"
+ "adcs x21, x21, x4\n\t"
+ "mul x4, x3, x17\n\t"
+ "umulh x5, x3, x17\n\t"
+ "adcs x22, x22, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x20, x20, x14\n\t"
+ "adcs x21, x21, x15\n\t"
+ "adcs x22, x22, x16\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x22, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x22, asr 63\n\t"
+ "and x22, x22, #0x7fffffffffffffff\n\t"
+ "adds x19, x19, x5\n\t"
+ "adcs x20, x20, xzr\n\t"
+ "adcs x21, x21, xzr\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Store */
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x11, %x[a], x26\n\t"
+ "umulh x12, %x[a], x26\n\t"
+ /* A[0] * A[2] */
+ "mul x3, %x[a], x27\n\t"
+ "umulh x13, %x[a], x27\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, %x[a], x28\n\t"
+ "umulh x14, %x[a], x28\n\t"
+ "adds x13, x13, x3\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x26, x27\n\t"
+ "umulh x4, x26, x27\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs x14, x14, x4\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x26, x28\n\t"
+ "umulh x4, x26, x28\n\t"
+ "adds x14, x14, x3\n\t"
+ "adc x15, x15, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x27, x28\n\t"
+ "umulh x16, x27, x28\n\t"
+ "adds x15, x15, x3\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* Double */
+ "adds x11, x11, x11\n\t"
+ "adcs x12, x12, x12\n\t"
+ "adcs x13, x13, x13\n\t"
+ "adcs x14, x14, x14\n\t"
+ "adcs x15, x15, x15\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x10, %x[a], %x[a]\n\t"
+ "umulh x5, %x[a], %x[a]\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x26, x26\n\t"
+ "umulh x4, x26, x26\n\t"
+ "adds x11, x11, x5\n\t"
+ "adcs x12, x12, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x27, x27\n\t"
+ "umulh x4, x27, x27\n\t"
+ "adds x13, x13, x5\n\t"
+ "adcs x14, x14, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x28, x28\n\t"
+ "umulh x4, x28, x28\n\t"
+ "adds x15, x15, x5\n\t"
+ "adcs x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x14\n\t"
+ "umulh x14, x3, x14\n\t"
+ "adds x10, x10, x4\n\t"
+ "mul x4, x3, x15\n\t"
+ "umulh x15, x3, x15\n\t"
+ "adcs x11, x11, x4\n\t"
+ "mul x4, x3, x16\n\t"
+ "umulh x16, x3, x16\n\t"
+ "adcs x12, x12, x4\n\t"
+ "mul x4, x3, x17\n\t"
+ "umulh x5, x3, x17\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x11, x11, x14\n\t"
+ "adcs x12, x12, x15\n\t"
+ "adcs x13, x13, x16\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x13, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x13, asr 63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Store */
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x15, x6, x7\n\t"
+ "umulh x16, x6, x7\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x6, x8\n\t"
+ "umulh x17, x6, x8\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x6, x9\n\t"
+ "umulh %x[a], x6, x9\n\t"
+ "adds x17, x17, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x7, x8\n\t"
+ "umulh x4, x7, x8\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x7, x9\n\t"
+ "umulh x4, x7, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x8, x9\n\t"
+ "umulh x27, x8, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x15, x15, x15\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adcs x17, x17, x17\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x14, x6, x6\n\t"
+ "umulh x5, x6, x6\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x7, x7\n\t"
+ "umulh x4, x7, x7\n\t"
+ "adds x15, x15, x5\n\t"
+ "adcs x16, x16, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x8, x8\n\t"
+ "umulh x4, x8, x8\n\t"
+ "adds x17, x17, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x9, x9\n\t"
+ "umulh x4, x9, x9\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x17, #63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x14, x14, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x15, x15, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x16, x16, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x15, x15, %x[a]\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adcs x17, x17, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x17, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x17, asr 63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Store */
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x6, x14, x10\n\t"
+ "umulh x7, x14, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x11\n\t"
+ "umulh x8, x14, x11\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x10\n\t"
+ "umulh x4, x15, x10\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x12\n\t"
+ "umulh x4, x14, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x10\n\t"
+ "umulh x4, x16, x10\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x13\n\t"
+ "umulh x4, x14, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x12\n\t"
+ "umulh x4, x15, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x11\n\t"
+ "umulh x4, x16, x11\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x10\n\t"
+ "umulh x4, x17, x10\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x13\n\t"
+ "umulh x4, x15, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x12\n\t"
+ "umulh x4, x16, x12\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x11\n\t"
+ "umulh x4, x17, x11\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x13\n\t"
+ "umulh x4, x16, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x12\n\t"
+ "umulh x4, x17, x12\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x13\n\t"
+ "umulh x4, x17, x13\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [%x[r]]\n\t"
+ "stp x8, x9, [%x[r], #16]\n\t"
+ /* Sub */
+ "subs x14, x14, x10\n\t"
+ "sbcs x15, x15, x11\n\t"
+ "sbcs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x14, x14, x3\n\t"
+ "adcs x15, x15, %x[a]\n\t"
+ "adcs x16, x16, %x[a]\n\t"
+ "adc x17, x17, x4\n\t"
+ /* Multiply by 121666 */
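+ /* 121666 = (486662 + 2) / 4 = (A + 2) / 4 for Curve25519's
+  * A = 486662: the a24 constant of the x-only doubling formula. */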
+ "mov x5, #0xdb42\n\t"
+ "movk x5, #1, lsl 16\n\t"
+ "mul x6, x14, x5\n\t"
+ "umulh x7, x14, x5\n\t"
+ "mul x3, x15, x5\n\t"
+ "umulh x4, x15, x5\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, xzr, x4\n\t"
+ "mul x3, x16, x5\n\t"
+ "umulh x4, x16, x5\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, xzr, x4\n\t"
+ "mul x3, x17, x5\n\t"
+ "umulh x4, x17, x5\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc x4, xzr, x4\n\t"
+ "mov x5, #19\n\t"
+ "extr x4, x4, x9, #63\n\t"
+ "mul x4, x4, x5\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x4\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Add */
+ "adds x10, x10, x6\n\t"
+ "adcs x11, x11, x7\n\t"
+ "adcs x12, x12, x8\n\t"
+ "adc x13, x13, x9\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x6, x14, x10\n\t"
+ "umulh x7, x14, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x11\n\t"
+ "umulh x8, x14, x11\n\t"
+ "adds x7, x7, x3\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x10\n\t"
+ "umulh x4, x15, x10\n\t"
+ "adds x7, x7, x3\n\t"
+ "adcs x8, x8, x4\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x12\n\t"
+ "umulh x4, x14, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x10\n\t"
+ "umulh x4, x16, x10\n\t"
+ "adds x8, x8, x3\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x13\n\t"
+ "umulh x4, x14, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x12\n\t"
+ "umulh x4, x15, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x11\n\t"
+ "umulh x4, x16, x11\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x10\n\t"
+ "umulh x4, x17, x10\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x13\n\t"
+ "umulh x4, x15, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x12\n\t"
+ "umulh x4, x16, x12\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x11\n\t"
+ "umulh x4, x17, x11\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x13\n\t"
+ "umulh x4, x16, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x12\n\t"
+ "umulh x4, x17, x12\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x13\n\t"
+ "umulh x4, x17, x13\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [x29, #16]\n\t"
+ "stp x8, x9, [x29, #32]\n\t"
+ /* Add */
+ "ldp x6, x7, [x29, #112]\n\t"
+ "ldp x8, x9, [x29, #128]\n\t"
+ "adds x10, x6, x19\n\t"
+ "adcs x11, x7, x20\n\t"
+ "adcs x12, x8, x21\n\t"
+ "adc x13, x9, x22\n\t"
+ "mov x3, #-19\n\t"
+ "asr %x[a], x13, #63\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x10, x10, x3\n\t"
+ "sbcs x11, x11, %x[a]\n\t"
+ "sbcs x12, x12, %x[a]\n\t"
+ "sbc x13, x13, x4\n\t"
+ /* Sub */
+ "subs x19, x6, x19\n\t"
+ "sbcs x20, x7, x20\n\t"
+ "sbcs x21, x8, x21\n\t"
+ "sbcs x22, x9, x22\n\t"
+ "mov x3, #-19\n\t"
+ "csetm %x[a], cc\n\t"
+ /* Mask the modulus */
+ "and x3, %x[a], x3\n\t"
+ "and x4, %x[a], #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, %x[a]\n\t"
+ "adcs x21, x21, %x[a]\n\t"
+ "adc x22, x22, x4\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x7, x10, x11\n\t"
+ "umulh x8, x10, x11\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x10, x12\n\t"
+ "umulh x9, x10, x12\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x10, x13\n\t"
+ "umulh %x[a], x10, x13\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x11, x12\n\t"
+ "umulh x4, x11, x12\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x11, x13\n\t"
+ "umulh x4, x11, x13\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x12, x13\n\t"
+ "umulh x27, x12, x13\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x6, x10, x10\n\t"
+ "umulh x5, x10, x10\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x11, x11\n\t"
+ "umulh x4, x11, x11\n\t"
+ "adds x7, x7, x5\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x12, x12\n\t"
+ "umulh x4, x12, x12\n\t"
+ "adds x9, x9, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x13, x13\n\t"
+ "umulh x4, x13, x13\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "stp x6, x7, [x29, #80]\n\t"
+ "stp x8, x9, [x29, #96]\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x7, x19, x20\n\t"
+ "umulh x8, x19, x20\n\t"
+ /* A[0] * A[2] */
+ "mul x3, x19, x21\n\t"
+ "umulh x9, x19, x21\n\t"
+ "adds x8, x8, x3\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x3, x19, x22\n\t"
+ "umulh %x[a], x19, x22\n\t"
+ "adds x9, x9, x3\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x3, x20, x21\n\t"
+ "umulh x4, x20, x21\n\t"
+ "adds x9, x9, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x3, x20, x22\n\t"
+ "umulh x4, x20, x22\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adc x26, x26, x4\n\t"
+ /* A[2] * A[3] */
+ "mul x3, x21, x22\n\t"
+ "umulh x27, x21, x22\n\t"
+ "adds x26, x26, x3\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Double */
+ "adds x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs %x[a], %x[a], %x[a]\n\t"
+ "adcs x26, x26, x26\n\t"
+ "adcs x27, x27, x27\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x6, x19, x19\n\t"
+ "umulh x5, x19, x19\n\t"
+ /* A[1] * A[1] */
+ "mul x3, x20, x20\n\t"
+ "umulh x4, x20, x20\n\t"
+ "adds x7, x7, x5\n\t"
+ "adcs x8, x8, x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x3, x21, x21\n\t"
+ "umulh x4, x21, x21\n\t"
+ "adds x9, x9, x5\n\t"
+ "adcs %x[a], %x[a], x3\n\t"
+ "adc x5, x4, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x3, x22, x22\n\t"
+ "umulh x4, x22, x22\n\t"
+ "adds x26, x26, x5\n\t"
+ "adcs x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x9, #63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x6, x6, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x7, x7, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x8, x8, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x9, x9, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x7, x7, %x[a]\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adcs x9, x9, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x9, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x9, asr 63\n\t"
+ "and x9, x9, #0x7fffffffffffffff\n\t"
+ "adds x6, x6, x5\n\t"
+ "adcs x7, x7, xzr\n\t"
+ "adcs x8, x8, xzr\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* Store */
+ "ldr %x[a], [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x14, x15, [%x[a]]\n\t"
+ "ldp x16, x17, [%x[a], #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x10, x14, x6\n\t"
+ "umulh x11, x14, x6\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x14, x7\n\t"
+ "umulh x12, x14, x7\n\t"
+ "adds x11, x11, x3\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x15, x6\n\t"
+ "umulh x4, x15, x6\n\t"
+ "adds x11, x11, x3\n\t"
+ "adcs x12, x12, x4\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x14, x8\n\t"
+ "umulh x4, x14, x8\n\t"
+ "adds x12, x12, x3\n\t"
+ "adc x13, x13, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x15, x7\n\t"
+ "umulh x4, x15, x7\n\t"
+ "adds x12, x12, x3\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc %x[a], xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x16, x6\n\t"
+ "umulh x4, x16, x6\n\t"
+ "adds x12, x12, x3\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc %x[a], %x[a], xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x14, x9\n\t"
+ "umulh x4, x14, x9\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x15, x8\n\t"
+ "umulh x4, x15, x8\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x16, x7\n\t"
+ "umulh x4, x16, x7\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x17, x6\n\t"
+ "umulh x4, x17, x6\n\t"
+ "adds x13, x13, x3\n\t"
+ "adcs %x[a], %x[a], x4\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x15, x9\n\t"
+ "umulh x4, x15, x9\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x16, x8\n\t"
+ "umulh x4, x16, x8\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x17, x7\n\t"
+ "umulh x4, x17, x7\n\t"
+ "adds %x[a], %x[a], x3\n\t"
+ "adcs x26, x26, x4\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x16, x9\n\t"
+ "umulh x4, x16, x9\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x17, x8\n\t"
+ "umulh x4, x17, x8\n\t"
+ "adds x26, x26, x3\n\t"
+ "adcs x27, x27, x4\n\t"
+ "adc x28, x28, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x17, x9\n\t"
+ "umulh x4, x17, x9\n\t"
+ "adds x27, x27, x3\n\t"
+ "adc x28, x28, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x28, x28, x27, #63\n\t"
+ "extr x27, x27, x26, #63\n\t"
+ "extr x26, x26, %x[a], #63\n\t"
+ "extr %x[a], %x[a], x13, #63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, %x[a]\n\t"
+ "umulh %x[a], x3, %x[a]\n\t"
+ "adds x10, x10, x4\n\t"
+ "mul x4, x3, x26\n\t"
+ "umulh x26, x3, x26\n\t"
+ "adcs x11, x11, x4\n\t"
+ "mul x4, x3, x27\n\t"
+ "umulh x27, x3, x27\n\t"
+ "adcs x12, x12, x4\n\t"
+ "mul x4, x3, x28\n\t"
+ "umulh x5, x3, x28\n\t"
+ "adcs x13, x13, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x11, x11, %x[a]\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adcs x13, x13, x27\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x13, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x13, asr 63\n\t"
+ "and x13, x13, #0x7fffffffffffffff\n\t"
+ "adds x10, x10, x5\n\t"
+ "adcs x11, x11, xzr\n\t"
+ "adcs x12, x12, xzr\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* Store */
+ "stp x10, x11, [x29, #48]\n\t"
+ "stp x12, x13, [x29, #64]\n\t"
+ "sub x25, x25, #1\n\t"
+ "cmp x25, #0\n\t"
+ "bge L_curve25519_bits_%=\n\t"
+ "mov x25, #63\n\t"
+ "sub x24, x24, #8\n\t"
+ "cmp x24, #0\n\t"
+ "bge L_curve25519_words_%=\n\t"
+ /* Invert */
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x1, x29, #0x50\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #4\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_1_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #9\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_2_%=\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x90\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #19\n\t"
+ "add x1, x29, #0x90\n\t"
+ "\n"
+ "L_curve25519_inv_3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_3_%=\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #10\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_4_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x1, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #49\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_5_%=\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x90\n\t"
+ "bl fe_sq\n\t"
+ "mov x24, #0x63\n\t"
+ "add x1, x29, #0x90\n\t"
+ "\n"
+ "L_curve25519_inv_6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_6_%=\n\t"
+ "add x0, x29, #0x70\n\t"
+ "add x2, x29, #0x70\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #50\n\t"
+ "add x1, x29, #0x70\n\t"
+ "\n"
+ "L_curve25519_inv_7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_7_%=\n\t"
+ "add x0, x29, #0x50\n\t"
+ "add x2, x29, #0x50\n\t"
+ "bl fe_mul\n\t"
+ "mov x24, #5\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_curve25519_inv_8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x24, x24, #1\n\t"
+ "cmp x24, #0\n\t"
+ "bne L_curve25519_inv_8_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "ldr %x[r], [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x6, x7, [%x[r]]\n\t"
+ "ldp x8, x9, [%x[r], #16]\n\t"
+ "ldp x10, x11, [x29, #16]\n\t"
+ "ldp x12, x13, [x29, #32]\n\t"
+ /* A[0] * B[0] */
+ "mul x14, x6, x10\n\t"
+ "umulh x15, x6, x10\n\t"
+ /* A[0] * B[1] */
+ "mul x3, x6, x11\n\t"
+ "umulh x16, x6, x11\n\t"
+ "adds x15, x15, x3\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x3, x7, x10\n\t"
+ "umulh x4, x7, x10\n\t"
+ "adds x15, x15, x3\n\t"
+ "adcs x16, x16, x4\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x3, x6, x12\n\t"
+ "umulh x4, x6, x12\n\t"
+ "adds x16, x16, x3\n\t"
+ "adc x17, x17, x4\n\t"
+ /* A[1] * B[1] */
+ "mul x3, x7, x11\n\t"
+ "umulh x4, x7, x11\n\t"
+ "adds x16, x16, x3\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x3, x8, x10\n\t"
+ "umulh x4, x8, x10\n\t"
+ "adds x16, x16, x3\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x3, x6, x13\n\t"
+ "umulh x4, x6, x13\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x3, x7, x12\n\t"
+ "umulh x4, x7, x12\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x3, x8, x11\n\t"
+ "umulh x4, x8, x11\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x3, x9, x10\n\t"
+ "umulh x4, x9, x10\n\t"
+ "adds x17, x17, x3\n\t"
+ "adcs x19, x19, x4\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x3, x7, x13\n\t"
+ "umulh x4, x7, x13\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x3, x8, x12\n\t"
+ "umulh x4, x8, x12\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x3, x9, x11\n\t"
+ "umulh x4, x9, x11\n\t"
+ "adds x19, x19, x3\n\t"
+ "adcs x20, x20, x4\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x3, x8, x13\n\t"
+ "umulh x4, x8, x13\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x3, x9, x12\n\t"
+ "umulh x4, x9, x12\n\t"
+ "adds x20, x20, x3\n\t"
+ "adcs x21, x21, x4\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x3, x9, x13\n\t"
+ "umulh x4, x9, x13\n\t"
+ "adds x21, x21, x3\n\t"
+ "adc x22, x22, x4\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x22, x22, x21, #63\n\t"
+ "extr x21, x21, x20, #63\n\t"
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x3, #19\n\t"
+ "mul x4, x3, x19\n\t"
+ "umulh x19, x3, x19\n\t"
+ "adds x14, x14, x4\n\t"
+ "mul x4, x3, x20\n\t"
+ "umulh x20, x3, x20\n\t"
+ "adcs x15, x15, x4\n\t"
+ "mul x4, x3, x21\n\t"
+ "umulh x21, x3, x21\n\t"
+ "adcs x16, x16, x4\n\t"
+ "mul x4, x3, x22\n\t"
+ "umulh x5, x3, x22\n\t"
+ "adcs x17, x17, x4\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x15, x15, x19\n\t"
+ "adcs x16, x16, x20\n\t"
+ "adcs x17, x17, x21\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* Overflow */
+ "extr x5, x5, x17, #63\n\t"
+ "mul x5, x5, x3\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x5, x3, x17, asr 63\n\t"
+ "and x17, x17, #0x7fffffffffffffff\n\t"
+ "adds x14, x14, x5\n\t"
+ "adcs x15, x15, xzr\n\t"
+ "adcs x16, x16, xzr\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* Store */
+ "stp x14, x15, [%x[r]]\n\t"
+ "stp x16, x17, [%x[r], #16]\n\t"
+ "mov x0, xzr\n\t"
+ "ldp x29, x30, [sp], #0xc0\n\t"
+ : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
+ :
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ return (uint32_t)(size_t)r;
+}
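+
+#if 0
+/* Illustrative reference only, not compiled: a plain C sketch of the
+ * "Reduce" step that follows every 4x4 limb multiply in this file,
+ * assuming a compiler with unsigned __int128 (GCC/Clang on AArch64).
+ * The 512-bit product l[0..7] is reduced modulo p = 2^255 - 19 by
+ * splitting at bit 255 and folding the high part back in, using
+ * 2^255 == 19 (mod p). The helper name is hypothetical. */
+static void fe_reduce_sketch(unsigned long long r[4],
+                             const unsigned long long l[8])
+{
+    unsigned long long h[4], o;
+    unsigned __int128 t;
+    int i;
+
+    /* h = bits 255..511 of the product (the extr #63 sequence). */
+    h[0] = (l[4] << 1) | (l[3] >> 63);
+    h[1] = (l[5] << 1) | (l[4] >> 63);
+    h[2] = (l[6] << 1) | (l[5] >> 63);
+    h[3] = (l[7] << 1) | (l[6] >> 63);
+    /* r = bits 0..254. */
+    r[0] = l[0];
+    r[1] = l[1];
+    r[2] = l[2];
+    r[3] = l[3] & 0x7fffffffffffffffULL;
+    /* r += 19 * h ("Multiply top half by 19"). */
+    t = 0;
+    for (i = 0; i < 4; i++) {
+        t += (unsigned __int128)19 * h[i] + r[i];
+        r[i] = (unsigned long long)t;
+        t >>= 64;
+    }
+    /* "Overflow": fold the carry and bit 255 in once more, times 19. */
+    o = ((unsigned long long)t << 1) | (r[3] >> 63);
+    r[3] &= 0x7fffffffffffffffULL;
+    t = (unsigned __int128)19 * o;
+    for (i = 0; i < 4; i++) {
+        t += r[i];
+        r[i] = (unsigned long long)t;
+        t >>= 64;
+    }
+    /* "Reduce if top bit set": one conditional fold. As in the assembly,
+     * the result is congruent mod p but only loosely reduced. */
+    o = 19 & (unsigned long long)(0 - (r[3] >> 63));
+    r[3] &= 0x7fffffffffffffffULL;
+    t = o;
+    for (i = 0; i < 4; i++) {
+        t += r[i];
+        r[i] = (unsigned long long)t;
+        t >>= 64;
+    }
+}
+#endif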
+
+void fe_pow22523(fe r, const fe a)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-128]!\n\t"
+ "add x29, sp, #0\n\t"
+ /* pow22523 */
+ "str %x[r], [x29, #112]\n\t"
+ "str %x[a], [x29, #120]\n\t"
+ "add x0, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "bl fe_sq\n\t"
+ "ldr x1, [x29, #120]\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #16\n\t"
+ "add x1, x29, #16\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "bl fe_sq\n\t"
+ "add x1, x29, #48\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #4\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_1_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_1_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #9\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_2_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_2_%=\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #19\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_pow22523_3_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_3_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #10\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_4_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_4_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #48\n\t"
+ "add x1, x29, #16\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #49\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_5_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_5_%=\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "add x0, x29, #0x50\n\t"
+ "bl fe_sq\n\t"
+ "mov x21, #0x63\n\t"
+ "add x1, x29, #0x50\n\t"
+ "\n"
+ "L_fe_pow22523_6_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_6_%=\n\t"
+ "add x0, x29, #48\n\t"
+ "add x2, x29, #48\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #50\n\t"
+ "add x1, x29, #48\n\t"
+ "\n"
+ "L_fe_pow22523_7_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_7_%=\n\t"
+ "add x0, x29, #16\n\t"
+ "add x2, x29, #16\n\t"
+ "bl fe_mul\n\t"
+ "mov x21, #2\n\t"
+ "add x1, x29, #16\n\t"
+ "\n"
+ "L_fe_pow22523_8_%=: \n\t"
+ "bl fe_sq\n\t"
+ "sub x21, x21, #1\n\t"
+ "cmp x21, #0\n\t"
+ "bne L_fe_pow22523_8_%=\n\t"
+ "ldr x0, [x29, #112]\n\t"
+ "ldr x2, [x29, #120]\n\t"
+ "bl fe_mul\n\t"
+ "ldp x29, x30, [sp], #0x80\n\t"
+ : [r] "+r" (r), [a] "+r" (a)
+ :
+ : "memory", "x21"
+ );
+}
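+
+#if 0
+/* Illustrative reference only, not compiled: the computation above written
+ * with the fe_sq/fe_mul helpers that the assembly calls via bl. It is the
+ * standard ref10 addition chain for r = a^((p-5)/8) = a^(2^252 - 3),
+ * p = 2^255 - 19, which Ed25519 uses for the combined inverse square root
+ * during point decompression. The helper name is hypothetical. */
+static void fe_pow22523_sketch(fe r, const fe a)
+{
+    fe t0, t1, t2;
+    int i;
+
+    fe_sq(t0, a);                                        /* a^2          */
+    fe_sq(t1, t0);
+    fe_sq(t1, t1);                                       /* a^8          */
+    fe_mul(t1, a, t1);                                   /* a^9          */
+    fe_mul(t0, t0, t1);                                  /* a^11         */
+    fe_sq(t0, t0);                                       /* a^22         */
+    fe_mul(t0, t1, t0);                                  /* a^(2^5-1)    */
+    fe_sq(t1, t0);  for (i = 1; i <   5; i++) fe_sq(t1, t1);
+    fe_mul(t0, t1, t0);                                  /* a^(2^10-1)   */
+    fe_sq(t1, t0);  for (i = 1; i <  10; i++) fe_sq(t1, t1);
+    fe_mul(t1, t1, t0);                                  /* a^(2^20-1)   */
+    fe_sq(t2, t1);  for (i = 1; i <  20; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                                  /* a^(2^40-1)   */
+    fe_sq(t1, t1);  for (i = 1; i <  10; i++) fe_sq(t1, t1);
+    fe_mul(t0, t1, t0);                                  /* a^(2^50-1)   */
+    fe_sq(t1, t0);  for (i = 1; i <  50; i++) fe_sq(t1, t1);
+    fe_mul(t1, t1, t0);                                  /* a^(2^100-1)  */
+    fe_sq(t2, t1);  for (i = 1; i < 100; i++) fe_sq(t2, t2);
+    fe_mul(t1, t2, t1);                                  /* a^(2^200-1)  */
+    fe_sq(t1, t1);  for (i = 1; i <  50; i++) fe_sq(t1, t1);
+    fe_mul(t0, t1, t0);                                  /* a^(2^250-1)  */
+    fe_sq(t0, t0);
+    fe_sq(t0, t0);                                       /* a^(2^252-4)  */
+    fe_mul(r, t0, a);                                    /* a^(2^252-3)  */
+}
+#endif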
+
+void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-64]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[ry], [x29, #16]\n\t"
+ "str %x[rz], [x29, #24]\n\t"
+ "str %x[px], [x29, #32]\n\t"
+ "str %x[py], [x29, #40]\n\t"
+ "str %x[pz], [x29, #48]\n\t"
+ "str %x[pt], [x29, #56]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x11, x17\n\t"
+ "umulh x21, x11, x17\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x12, x16\n\t"
+ "umulh x21, x12, x16\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x13, x15\n\t"
+ "umulh x21, x13, x15\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x11, x19\n\t"
+ "umulh x21, x11, x19\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x12, x17\n\t"
+ "umulh x21, x12, x17\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x13, x16\n\t"
+ "umulh x21, x13, x16\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x14, x15\n\t"
+ "umulh x21, x14, x15\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x12, x19\n\t"
+ "umulh x21, x12, x19\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x13, x17\n\t"
+ "umulh x21, x13, x17\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x14, x16\n\t"
+ "umulh x21, x14, x16\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x13, x19\n\t"
+ "umulh x21, x13, x19\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x14, x17\n\t"
+ "umulh x21, x14, x17\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x14, x19\n\t"
+ "umulh x21, x14, x19\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ "ldr x2, [x29, #48]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x11, x17\n\t"
+ "umulh x21, x11, x17\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x12, x16\n\t"
+ "umulh x21, x12, x16\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x13, x15\n\t"
+ "umulh x21, x13, x15\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x11, x19\n\t"
+ "umulh x21, x11, x19\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x12, x17\n\t"
+ "umulh x21, x12, x17\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x13, x16\n\t"
+ "umulh x21, x13, x16\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x14, x15\n\t"
+ "umulh x21, x14, x15\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x12, x19\n\t"
+ "umulh x21, x12, x19\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x13, x17\n\t"
+ "umulh x21, x13, x17\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x14, x16\n\t"
+ "umulh x21, x14, x16\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x13, x19\n\t"
+ "umulh x21, x13, x19\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x14, x17\n\t"
+ "umulh x21, x14, x17\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x14, x19\n\t"
+ "umulh x21, x14, x19\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x2]\n\t"
+ "ldp x13, x14, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x15, x11\n\t"
+ "umulh x4, x15, x11\n\t"
+ /* A[0] * B[1] */
+ "mul x20, x15, x12\n\t"
+ "umulh x5, x15, x12\n\t"
+ "adds x4, x4, x20\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x20, x16, x11\n\t"
+ "umulh x21, x16, x11\n\t"
+ "adds x4, x4, x20\n\t"
+ "adcs x5, x5, x21\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x20, x15, x13\n\t"
+ "umulh x21, x15, x13\n\t"
+ "adds x5, x5, x20\n\t"
+ "adc x6, x6, x21\n\t"
+ /* A[1] * B[1] */
+ "mul x20, x16, x12\n\t"
+ "umulh x21, x16, x12\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x20, x17, x11\n\t"
+ "umulh x21, x17, x11\n\t"
+ "adds x5, x5, x20\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x20, x15, x14\n\t"
+ "umulh x21, x15, x14\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x20, x16, x13\n\t"
+ "umulh x21, x16, x13\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x20, x17, x12\n\t"
+ "umulh x21, x17, x12\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x20, x19, x11\n\t"
+ "umulh x21, x19, x11\n\t"
+ "adds x6, x6, x20\n\t"
+ "adcs x7, x7, x21\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x20, x16, x14\n\t"
+ "umulh x21, x16, x14\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x20, x17, x13\n\t"
+ "umulh x21, x17, x13\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x20, x19, x12\n\t"
+ "umulh x21, x19, x12\n\t"
+ "adds x7, x7, x20\n\t"
+ "adcs x8, x8, x21\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x20, x17, x14\n\t"
+ "umulh x21, x17, x14\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x20, x19, x13\n\t"
+ "umulh x21, x19, x13\n\t"
+ "adds x8, x8, x20\n\t"
+ "adcs x9, x9, x21\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x20, x19, x14\n\t"
+ "umulh x21, x19, x14\n\t"
+ "adds x9, x9, x20\n\t"
+ "adc x10, x10, x21\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x20, #19\n\t"
+ "mul x21, x20, x7\n\t"
+ "umulh x7, x20, x7\n\t"
+ "adds x3, x3, x21\n\t"
+ "mul x21, x20, x8\n\t"
+ "umulh x8, x20, x8\n\t"
+ "adcs x4, x4, x21\n\t"
+ "mul x21, x20, x9\n\t"
+ "umulh x9, x20, x9\n\t"
+ "adcs x5, x5, x21\n\t"
+ "mul x21, x20, x10\n\t"
+ "umulh x22, x20, x10\n\t"
+ "adcs x6, x6, x21\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x22, x22, xzr\n\t"
+ /* Overflow */
+ "extr x22, x22, x6, #63\n\t"
+ "mul x22, x22, x20\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x22, x20, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x22\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x40\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22"
+ );
+}
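+
+#if 0
+/* Illustrative reference only, not compiled: what the three inlined
+ * multiplies above compute, written with fe_mul. This is ref10's
+ * ge_p1p1_to_p2, converting a completed (p1p1) point (px:py:pz:pt) to
+ * projective (p2) coordinates. The helper name is hypothetical. */
+static void fe_ge_to_p2_sketch(fe rx, fe ry, fe rz,
+                               const fe px, const fe py,
+                               const fe pz, const fe pt)
+{
+    fe_mul(rx, px, pt);    /* X3 = X * T */
+    fe_mul(ry, py, pz);    /* Y3 = Y * Z */
+    fe_mul(rz, pz, pt);    /* Z3 = Z * T */
+}
+#endif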
+
+void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[ry], [x29, #16]\n\t"
+ "str %x[rz], [x29, #24]\n\t"
+ "str %x[rt], [x29, #32]\n\t"
+ "str %x[px], [x29, #40]\n\t"
+ "str %x[py], [x29, #48]\n\t"
+ "str %x[pz], [x29, #56]\n\t"
+ "str %x[pt], [x29, #64]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ "ldr x2, [x29, #64]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x1]\n\t"
+ "ldp x13, x14, [x1, #16]\n\t"
+ "ldp x15, x16, [x2]\n\t"
+ "ldp x17, x19, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x15\n\t"
+ "umulh x25, x12, x15\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x17\n\t"
+ "umulh x25, x11, x17\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x16\n\t"
+ "umulh x25, x12, x16\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x15\n\t"
+ "umulh x25, x13, x15\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x19\n\t"
+ "umulh x25, x11, x19\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x17\n\t"
+ "umulh x25, x12, x17\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x16\n\t"
+ "umulh x25, x13, x16\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x15\n\t"
+ "umulh x25, x14, x15\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x19\n\t"
+ "umulh x25, x12, x19\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x17\n\t"
+ "umulh x25, x13, x17\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x16\n\t"
+ "umulh x25, x14, x16\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x19\n\t"
+ "umulh x25, x13, x19\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x17\n\t"
+ "umulh x25, x14, x17\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x19\n\t"
+ "umulh x25, x14, x19\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #48]\n\t"
+ /* Multiply */
+ "ldp x20, x21, [x2]\n\t"
+ "ldp x22, x23, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x11, x20\n\t"
+ "umulh x4, x11, x20\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x21\n\t"
+ "umulh x5, x11, x21\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x20\n\t"
+ "umulh x25, x12, x20\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x22\n\t"
+ "umulh x25, x11, x22\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x21\n\t"
+ "umulh x25, x12, x21\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x20\n\t"
+ "umulh x25, x13, x20\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x23\n\t"
+ "umulh x25, x11, x23\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x22\n\t"
+ "umulh x25, x12, x22\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x21\n\t"
+ "umulh x25, x13, x21\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x20\n\t"
+ "umulh x25, x14, x20\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x23\n\t"
+ "umulh x25, x12, x23\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x22\n\t"
+ "umulh x25, x13, x22\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x21\n\t"
+ "umulh x25, x14, x21\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x23\n\t"
+ "umulh x25, x13, x23\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x22\n\t"
+ "umulh x25, x14, x22\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x23\n\t"
+ "umulh x25, x14, x23\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ /* Multiply */
+ "ldp x11, x12, [x2]\n\t"
+ "ldp x13, x14, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x3, x20, x11\n\t"
+ "umulh x4, x20, x11\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x20, x12\n\t"
+ "umulh x5, x20, x12\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x21, x11\n\t"
+ "umulh x25, x21, x11\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x20, x13\n\t"
+ "umulh x25, x20, x13\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x21, x12\n\t"
+ "umulh x25, x21, x12\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x22, x11\n\t"
+ "umulh x25, x22, x11\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x20, x14\n\t"
+ "umulh x25, x20, x14\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x21, x13\n\t"
+ "umulh x25, x21, x13\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x22, x12\n\t"
+ "umulh x25, x22, x12\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x23, x11\n\t"
+ "umulh x25, x23, x11\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x21, x14\n\t"
+ "umulh x25, x21, x14\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x22, x13\n\t"
+ "umulh x25, x22, x13\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x23, x12\n\t"
+ "umulh x25, x23, x12\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x22, x14\n\t"
+ "umulh x25, x22, x14\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x23, x13\n\t"
+ "umulh x25, x23, x13\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x23, x14\n\t"
+ "umulh x25, x23, x14\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ /* Multiply */
+ /* A[0] * B[0] */
+ "mul x3, x11, x15\n\t"
+ "umulh x4, x11, x15\n\t"
+ /* A[0] * B[1] */
+ "mul x24, x11, x16\n\t"
+ "umulh x5, x11, x16\n\t"
+ "adds x4, x4, x24\n\t"
+ "adc x5, x5, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x24, x12, x15\n\t"
+ "umulh x25, x12, x15\n\t"
+ "adds x4, x4, x24\n\t"
+ "adcs x5, x5, x25\n\t"
+ "adc x6, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x24, x11, x17\n\t"
+ "umulh x25, x11, x17\n\t"
+ "adds x5, x5, x24\n\t"
+ "adc x6, x6, x25\n\t"
+ /* A[1] * B[1] */
+ "mul x24, x12, x16\n\t"
+ "umulh x25, x12, x16\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x24, x13, x15\n\t"
+ "umulh x25, x13, x15\n\t"
+ "adds x5, x5, x24\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x24, x11, x19\n\t"
+ "umulh x25, x11, x19\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x24, x12, x17\n\t"
+ "umulh x25, x12, x17\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x24, x13, x16\n\t"
+ "umulh x25, x13, x16\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x24, x14, x15\n\t"
+ "umulh x25, x14, x15\n\t"
+ "adds x6, x6, x24\n\t"
+ "adcs x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x24, x12, x19\n\t"
+ "umulh x25, x12, x19\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x24, x13, x17\n\t"
+ "umulh x25, x13, x17\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x24, x14, x16\n\t"
+ "umulh x25, x14, x16\n\t"
+ "adds x7, x7, x24\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x24, x13, x19\n\t"
+ "umulh x25, x13, x19\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x24, x14, x17\n\t"
+ "umulh x25, x14, x17\n\t"
+ "adds x8, x8, x24\n\t"
+ "adcs x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x24, x14, x19\n\t"
+ "umulh x25, x14, x19\n\t"
+ "adds x9, x9, x24\n\t"
+ "adc x10, x10, x25\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x24, #19\n\t"
+ "mul x25, x24, x7\n\t"
+ "umulh x7, x24, x7\n\t"
+ "adds x3, x3, x25\n\t"
+ "mul x25, x24, x8\n\t"
+ "umulh x8, x24, x8\n\t"
+ "adcs x4, x4, x25\n\t"
+ "mul x25, x24, x9\n\t"
+ "umulh x9, x24, x9\n\t"
+ "adcs x5, x5, x25\n\t"
+ "mul x25, x24, x10\n\t"
+ "umulh x26, x24, x10\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x7\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adc x26, x26, xzr\n\t"
+ /* Overflow */
+ "extr x26, x26, x6, #63\n\t"
+ "mul x26, x26, x24\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x26, x24, x6, asr 63\n\t"
+ "and x6, x6, #0x7fffffffffffffff\n\t"
+ "adds x3, x3, x26\n\t"
+ "adcs x4, x4, xzr\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* Store */
+ "stp x3, x4, [x0]\n\t"
+ "stp x5, x6, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ );
+}
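+
+#if 0
+/* Illustrative reference only, not compiled: the four inlined multiplies
+ * above as fe_mul calls. This is ref10's ge_p1p1_to_p3, converting a
+ * completed (p1p1) point to extended (p3) coordinates; the order below
+ * mirrors the assembly, which computes rt while the px limbs are still
+ * in registers. The helper name is hypothetical. */
+static void fe_ge_to_p3_sketch(fe rx, fe ry, fe rz, fe rt,
+                               const fe px, const fe py,
+                               const fe pz, const fe pt)
+{
+    fe_mul(rx, px, pt);    /* X3 = X * T */
+    fe_mul(rt, px, py);    /* T3 = X * Y */
+    fe_mul(ry, py, pz);    /* Y3 = Y * Z */
+    fe_mul(rz, pz, pt);    /* Z3 = Z * T */
+}
+#endif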
+
+void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "ldr x1, [x29, #48]\n\t"
+ /* Square */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x5, x12, x13\n\t"
+ "umulh x6, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x7, x12, x14\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x8, x12, x15\n\t"
+ "adds x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x8, x8, x25\n\t"
+ "adc x9, x9, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x10, x14, x15\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* Double */
+ "adds x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x4, x12, x12\n\t"
+ "umulh x27, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x5, x5, x27\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x7, x7, x27\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x9, x9, x27\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "stp x4, x5, [x0]\n\t"
+ "stp x6, x7, [x0, #16]\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #56]\n\t"
+ /* Square */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x9, x21, x22\n\t"
+ "umulh x10, x21, x22\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x21, x23\n\t"
+ "umulh x11, x21, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x21, x24\n\t"
+ "umulh x16, x21, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x22, x23\n\t"
+ "umulh x26, x22, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x22, x24\n\t"
+ "umulh x26, x22, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adc x17, x17, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x23, x24\n\t"
+ "umulh x19, x23, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* Double */
+ "adds x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adcs x11, x11, x11\n\t"
+ "adcs x16, x16, x16\n\t"
+ "adcs x17, x17, x17\n\t"
+ "adcs x19, x19, x19\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x8, x21, x21\n\t"
+ "umulh x27, x21, x21\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x22, x22\n\t"
+ "umulh x26, x22, x22\n\t"
+ "adds x9, x9, x27\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x23, x23\n\t"
+ "umulh x26, x23, x23\n\t"
+ "adds x11, x11, x27\n\t"
+ "adcs x16, x16, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x24, x24\n\t"
+ "umulh x26, x24, x24\n\t"
+ "adds x17, x17, x27\n\t"
+ "adcs x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x16\n\t"
+ "adcs x10, x10, x17\n\t"
+ "adcs x11, x11, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ /* Add */
+ "adds x12, x12, x21\n\t"
+ "adcs x13, x13, x22\n\t"
+ "adcs x14, x14, x23\n\t"
+ "adc x15, x15, x24\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ /* Square */
+ /* A[0] * A[1] */
+ "mul x17, x12, x13\n\t"
+ "umulh x19, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x20, x12, x14\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x21, x12, x15\n\t"
+ "adds x20, x20, x25\n\t"
+ "adc x21, x21, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x20, x20, x25\n\t"
+ "adcs x21, x21, x26\n\t"
+ "adc x22, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x21, x21, x25\n\t"
+ "adc x22, x22, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x23, x14, x15\n\t"
+ "adds x22, x22, x25\n\t"
+ "adc x23, x23, xzr\n\t"
+ /* Double */
+ "adds x17, x17, x17\n\t"
+ "adcs x19, x19, x19\n\t"
+ "adcs x20, x20, x20\n\t"
+ "adcs x21, x21, x21\n\t"
+ "adcs x22, x22, x22\n\t"
+ "adcs x23, x23, x23\n\t"
+ "adc x24, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x16, x12, x12\n\t"
+ "umulh x27, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x17, x17, x27\n\t"
+ "adcs x19, x19, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x20, x20, x27\n\t"
+ "adcs x21, x21, x25\n\t"
+ "adc x27, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x22, x22, x27\n\t"
+ "adcs x23, x23, x25\n\t"
+ "adc x24, x24, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x24, x24, x23, #63\n\t"
+ "extr x23, x23, x22, #63\n\t"
+ "extr x22, x22, x21, #63\n\t"
+ "extr x21, x21, x20, #63\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x21\n\t"
+ "umulh x21, x25, x21\n\t"
+ "adds x16, x16, x26\n\t"
+ "mul x26, x25, x22\n\t"
+ "umulh x22, x25, x22\n\t"
+ "adcs x17, x17, x26\n\t"
+ "mul x26, x25, x23\n\t"
+ "umulh x23, x25, x23\n\t"
+ "adcs x19, x19, x26\n\t"
+ "mul x26, x25, x24\n\t"
+ "umulh x27, x25, x24\n\t"
+ "adcs x20, x20, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x17, x17, x21\n\t"
+ "adcs x19, x19, x22\n\t"
+ "adcs x20, x20, x23\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x20, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ "adds x16, x16, x27\n\t"
+ "adcs x17, x17, xzr\n\t"
+ "adcs x19, x19, xzr\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x20, asr 63\n\t"
+ "and x20, x20, #0x7fffffffffffffff\n\t"
+ "adds x16, x16, x27\n\t"
+ "adcs x17, x17, xzr\n\t"
+ "adcs x19, x19, xzr\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* Store */
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x21, x8, x4\n\t"
+ "sbcs x22, x9, x5\n\t"
+ "sbcs x23, x10, x6\n\t"
+ "sbcs x24, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x21, x21, x25\n\t"
+ "adcs x22, x22, x28\n\t"
+ "adcs x23, x23, x28\n\t"
+ "adc x24, x24, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x21, x22, [x1]\n\t"
+ "stp x23, x24, [x1, #16]\n\t"
+ "ldr x0, [x29, #16]\n\t"
+ /* Sub */
+ "subs x16, x16, x12\n\t"
+ "sbcs x17, x17, x13\n\t"
+ "sbcs x19, x19, x14\n\t"
+ "sbcs x20, x20, x15\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Square * 2 */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ /* A[0] * A[1] */
+ "mul x5, x12, x13\n\t"
+ "umulh x6, x12, x13\n\t"
+ /* A[0] * A[2] */
+ "mul x25, x12, x14\n\t"
+ "umulh x7, x12, x14\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* A[0] * A[3] */
+ "mul x25, x12, x15\n\t"
+ "umulh x8, x12, x15\n\t"
+ "adds x7, x7, x25\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[1] * A[2] */
+ "mul x25, x13, x14\n\t"
+ "umulh x26, x13, x14\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * A[3] */
+ "mul x25, x13, x15\n\t"
+ "umulh x26, x13, x15\n\t"
+ "adds x8, x8, x25\n\t"
+ "adc x9, x9, x26\n\t"
+ /* A[2] * A[3] */
+ "mul x25, x14, x15\n\t"
+ "umulh x10, x14, x15\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* Double */
+ "adds x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adcs x7, x7, x7\n\t"
+ "adcs x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * A[0] */
+ "mul x4, x12, x12\n\t"
+ "umulh x28, x12, x12\n\t"
+ /* A[1] * A[1] */
+ "mul x25, x13, x13\n\t"
+ "umulh x26, x13, x13\n\t"
+ "adds x5, x5, x28\n\t"
+ "adcs x6, x6, x25\n\t"
+ "adc x28, x26, xzr\n\t"
+ /* A[2] * A[2] */
+ "mul x25, x14, x14\n\t"
+ "umulh x26, x14, x14\n\t"
+ "adds x7, x7, x28\n\t"
+ "adcs x8, x8, x25\n\t"
+ "adc x28, x26, xzr\n\t"
+ /* A[3] * A[3] */
+ "mul x25, x15, x15\n\t"
+ "umulh x26, x15, x15\n\t"
+ "adds x9, x9, x28\n\t"
+ "adcs x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Double and Reduce */
+ "mov x25, #0x169\n\t"
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "lsr x28, x11, #61\n\t"
+ "extr x11, x11, x10, #62\n\t"
+ "extr x10, x10, x9, #62\n\t"
+ "extr x9, x9, x8, #62\n\t"
+ "extr x8, x8, x7, #62\n\t"
+ "extr x7, x7, x6, #63\n\t"
+ "extr x6, x6, x5, #63\n\t"
+ "extr x5, x5, x4, #63\n\t"
+ "lsl x4, x4, #1\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Two left, only one right */
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top bits by 19*19 */
+ "mul x28, x28, x25\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x4, x4, x28\n\t"
+ "adcs x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store is deferred: the doubled square stays in x4..x7 for the Sub below */
+ "ldr x0, [x29, #40]\n\t"
+ /* Sub */
+ "subs x4, x4, x21\n\t"
+ "sbcs x5, x5, x22\n\t"
+ "sbcs x6, x6, x23\n\t"
+ "sbcs x7, x7, x24\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x4, x4, x25\n\t"
+ "adcs x5, x5, x28\n\t"
+ "adcs x6, x6, x28\n\t"
+ "adc x7, x7, x26\n\t"
+ "stp x4, x5, [x0]\n\t"
+ "stp x6, x7, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
+ :
+ : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
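+
+#if 0
+/* Illustrative reference only, not compiled: the doubling above expressed
+ * with field-op helpers, following ref10's ge_p2_dbl. fe_add/fe_sub/fe_sq
+ * correspond to the inlined Add/Sub/Square blocks; fe_sq2(r, a) stands
+ * for the fused "Square * 2" block (r = 2*a^2). Input is a projective
+ * point (px:py:pz), output a completed (p1p1) point. Names are
+ * hypothetical. */
+static void fe_ge_dbl_sketch(fe rx, fe ry, fe rz, fe rt,
+                             const fe px, const fe py, const fe pz)
+{
+    fe t0;
+
+    fe_sq(rx, px);         /* XX = X1^2 */
+    fe_sq(rz, py);         /* YY = Y1^2 */
+    fe_sq2(rt, pz);        /* B = 2*Z1^2 */
+    fe_add(t0, px, py);
+    fe_sq(t0, t0);         /* (X1+Y1)^2 */
+    fe_add(ry, rz, rx);    /* Y3 = YY + XX */
+    fe_sub(rz, rz, rx);    /* Z3 = YY - XX */
+    fe_sub(rx, t0, ry);    /* X3 = (X1+Y1)^2 - Y3 */
+    fe_sub(rt, rt, rz);    /* T3 = B - Z3 */
+}
+#endif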
+
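+/* Mixed addition (ref10's ge_madd): add a precomputed point stored as
+ * (y+x, y-x, 2*d*x*y) to an extended (p3) point (px:py:pz:pt), giving a
+ * completed (p1p1) result. The loads from [x29, #160], [x29, #168] and
+ * [x29, #176] below fetch the stack-passed qxy2d, qyplusx and qyminusx
+ * pointers respectively. */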
+void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #168]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store is deferred: the product stays in x12..x15 for the Add/Sub below */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #160]\n\t"
+ "ldr x3, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x3]\n\t"
+ "ldp x23, x24, [x3, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x16, x21\n\t"
+ "umulh x5, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x6, x16, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Double */
+ "ldp x8, x9, [x1]\n\t"
+ "ldp x10, x11, [x1, #16]\n\t"
+ "adds x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, x11, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x8, x4\n\t"
+ "sbcs x17, x9, x5\n\t"
+ "sbcs x19, x10, x6\n\t"
+ "sbcs x20, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
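+ /* The q arguments are stack-passed and read inside the asm at fixed
+  * offsets from x29; the casts below only keep the compiler from
+  * warning about unused parameters. */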
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
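+/* Mixed point subtraction (Ed25519 extended coordinates): r = p - q, where
+ * q is a precomputed point given as qyplusx = y+x, qyminusx = y-x and
+ * qxy2d = 2*d*x*y (ref10-style naming).  All arithmetic is mod 2^255-19:
+ *   A  = (py+px)*qyminusx    B  = (py-px)*qyplusx
+ *   C  = qxy2d*pt            D  = 2*pz
+ *   rx = A-B   ry = A+B   rz = D-C   rt = D+C
+ */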
+void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #160]\n\t"
+ "ldr x3, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x3]\n\t"
+ "ldp x23, x24, [x3, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x16, x21\n\t"
+ "umulh x5, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x6, x16, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ /* Double */
+ "ldp x8, x9, [x1]\n\t"
+ "ldp x10, x11, [x1, #16]\n\t"
+ "adds x8, x8, x8\n\t"
+ "adcs x9, x9, x9\n\t"
+ "adcs x10, x10, x10\n\t"
+ "adc x11, x11, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x8, x4\n\t"
+ "adcs x13, x9, x5\n\t"
+ "adcs x14, x10, x6\n\t"
+ "adc x15, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x8, x4\n\t"
+ "sbcs x17, x9, x5\n\t"
+ "sbcs x19, x10, x6\n\t"
+ "sbcs x20, x11, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x1]\n\t"
+ "stp x14, x15, [x1, #16]\n\t"
+ "stp x16, x17, [x0]\n\t"
+ "stp x19, x20, [x0, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ (void)qxy2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
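+/* Point addition with a cached point (Ed25519): r = p + q, where q is given
+ * as qz = z, qt2d = 2*d*t, qyplusx = y+x and qyminusx = y-x.  Mod 2^255-19:
+ *   A  = (py+px)*qyplusx     B  = (py-px)*qyminusx
+ *   C  = qt2d*pt             D  = 2*pz*qz
+ *   rx = A-B   ry = A+B   rz = D+C   rt = D-C
+ */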
+void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #48]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ "ldr x2, [x29, #160]\n\t"
+ /* Multiply */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ "ldp x16, x17, [x2]\n\t"
+ "ldp x19, x20, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x12, x16\n\t"
+ "umulh x5, x12, x16\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x12, x17\n\t"
+ "umulh x6, x12, x17\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x13, x16\n\t"
+ "umulh x26, x13, x16\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x12, x19\n\t"
+ "umulh x26, x12, x19\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x13, x17\n\t"
+ "umulh x26, x13, x17\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x14, x16\n\t"
+ "umulh x26, x14, x16\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x12, x20\n\t"
+ "umulh x26, x12, x20\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x13, x19\n\t"
+ "umulh x26, x13, x19\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x14, x17\n\t"
+ "umulh x26, x14, x17\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x15, x16\n\t"
+ "umulh x26, x15, x16\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x13, x20\n\t"
+ "umulh x26, x13, x20\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x14, x19\n\t"
+ "umulh x26, x14, x19\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x15, x17\n\t"
+ "umulh x26, x15, x17\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x14, x20\n\t"
+ "umulh x26, x14, x20\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x15, x19\n\t"
+ "umulh x26, x15, x19\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x15, x20\n\t"
+ "umulh x26, x15, x20\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #48]\n\t"
+ /* Double */
+ "adds x4, x4, x4\n\t"
+ "adcs x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adc x7, x7, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ "ldr x2, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x8, x16, x21\n\t"
+ "umulh x9, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x10, x16, x22\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x12\n\t"
+ "umulh x12, x25, x12\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x13\n\t"
+ "umulh x13, x25, x13\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x14\n\t"
+ "umulh x14, x25, x14\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x15\n\t"
+ "umulh x27, x25, x15\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x12\n\t"
+ "adcs x10, x10, x13\n\t"
+ "adcs x11, x11, x14\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x1, [x29, #40]\n\t"
+ /* Add */
+ "adds x12, x4, x8\n\t"
+ "adcs x13, x5, x9\n\t"
+ "adcs x14, x6, x10\n\t"
+ "adc x15, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x4, x8\n\t"
+ "sbcs x17, x5, x9\n\t"
+ "sbcs x19, x6, x10\n\t"
+ "sbcs x20, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
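+/* Point subtraction with a cached point (Ed25519): r = p - q.  Same operand
+ * form as fe_ge_add, with the qyplusx/qyminusx factors and the sign of C
+ * swapped.  Mod 2^255-19:
+ *   A  = (py+px)*qyminusx    B  = (py-px)*qyplusx
+ *   C  = qt2d*pt             D  = 2*pz*qz
+ *   rx = A-B   ry = A+B   rz = D-C   rt = D+C
+ */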
+void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-80]!\n\t"
+ "add x29, sp, #0\n\t"
+ "str %x[rx], [x29, #16]\n\t"
+ "str %x[ry], [x29, #24]\n\t"
+ "str %x[rz], [x29, #32]\n\t"
+ "str %x[rt], [x29, #40]\n\t"
+ "str %x[px], [x29, #48]\n\t"
+ "str %x[py], [x29, #56]\n\t"
+ "str %x[pz], [x29, #64]\n\t"
+ "str %x[pt], [x29, #72]\n\t"
+ "ldr x2, [x29, #56]\n\t"
+ "ldr x3, [x29, #48]\n\t"
+ /* Add */
+ "ldp x12, x13, [x2]\n\t"
+ "ldp x14, x15, [x2, #16]\n\t"
+ "ldp x16, x17, [x3]\n\t"
+ "ldp x19, x20, [x3, #16]\n\t"
+ "adds x4, x12, x16\n\t"
+ "adcs x5, x13, x17\n\t"
+ "adcs x6, x14, x19\n\t"
+ "adc x7, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ /* Sub */
+ "subs x8, x12, x16\n\t"
+ "sbcs x9, x13, x17\n\t"
+ "sbcs x10, x14, x19\n\t"
+ "sbcs x11, x15, x20\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x28\n\t"
+ "adcs x10, x10, x28\n\t"
+ "adc x11, x11, x26\n\t"
+ "ldr x0, [x29, #32]\n\t"
+ "ldr x2, [x29, #184]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x12, x4, x21\n\t"
+ "umulh x13, x4, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x4, x22\n\t"
+ "umulh x14, x4, x22\n\t"
+ "adds x13, x13, x25\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x5, x21\n\t"
+ "umulh x26, x5, x21\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x4, x23\n\t"
+ "umulh x26, x4, x23\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x5, x22\n\t"
+ "umulh x26, x5, x22\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x6, x21\n\t"
+ "umulh x26, x6, x21\n\t"
+ "adds x14, x14, x25\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x4, x24\n\t"
+ "umulh x26, x4, x24\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x5, x23\n\t"
+ "umulh x26, x5, x23\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x6, x22\n\t"
+ "umulh x26, x6, x22\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x7, x21\n\t"
+ "umulh x26, x7, x21\n\t"
+ "adds x15, x15, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x5, x24\n\t"
+ "umulh x26, x5, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x6, x23\n\t"
+ "umulh x26, x6, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x7, x22\n\t"
+ "umulh x26, x7, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x6, x24\n\t"
+ "umulh x26, x6, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x7, x23\n\t"
+ "umulh x26, x7, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x7, x24\n\t"
+ "umulh x26, x7, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x15, #63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x12, x12, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x13, x13, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x14, x14, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x15, x15, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x13, x13, x16\n\t"
+ "adcs x14, x14, x17\n\t"
+ "adcs x15, x15, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x15, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x15, asr 63\n\t"
+ "and x15, x15, #0x7fffffffffffffff\n\t"
+ "adds x12, x12, x27\n\t"
+ "adcs x13, x13, xzr\n\t"
+ "adcs x14, x14, xzr\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #176]\n\t"
+ /* Multiply */
+ "ldp x21, x22, [x1]\n\t"
+ "ldp x23, x24, [x1, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x8, x21\n\t"
+ "umulh x5, x8, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x8, x22\n\t"
+ "umulh x6, x8, x22\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x9, x21\n\t"
+ "umulh x26, x9, x21\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x8, x23\n\t"
+ "umulh x26, x8, x23\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x9, x22\n\t"
+ "umulh x26, x9, x22\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x10, x21\n\t"
+ "umulh x26, x10, x21\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x16, x16, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x8, x24\n\t"
+ "umulh x26, x8, x24\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x9, x23\n\t"
+ "umulh x26, x9, x23\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x10, x22\n\t"
+ "umulh x26, x10, x22\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x11, x21\n\t"
+ "umulh x26, x11, x21\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x16, x16, x26\n\t"
+ "adc x17, x17, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x9, x24\n\t"
+ "umulh x26, x9, x24\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x10, x23\n\t"
+ "umulh x26, x10, x23\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x11, x22\n\t"
+ "umulh x26, x11, x22\n\t"
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x26\n\t"
+ "adc x19, x19, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x10, x24\n\t"
+ "umulh x26, x10, x24\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x11, x23\n\t"
+ "umulh x26, x11, x23\n\t"
+ "adds x17, x17, x25\n\t"
+ "adcs x19, x19, x26\n\t"
+ "adc x20, x20, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x11, x24\n\t"
+ "umulh x26, x11, x24\n\t"
+ "adds x19, x19, x25\n\t"
+ "adc x20, x20, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x20, x20, x19, #63\n\t"
+ "extr x19, x19, x17, #63\n\t"
+ "extr x17, x17, x16, #63\n\t"
+ "extr x16, x16, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x16\n\t"
+ "umulh x16, x25, x16\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x17\n\t"
+ "umulh x17, x25, x17\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x19\n\t"
+ "umulh x19, x25, x19\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x20\n\t"
+ "umulh x27, x25, x20\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x16\n\t"
+ "adcs x6, x6, x17\n\t"
+ "adcs x7, x7, x19\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #24]\n\t"
+ "ldr x1, [x29, #16]\n\t"
+ /* Add */
+ "adds x8, x12, x4\n\t"
+ "adcs x9, x13, x5\n\t"
+ "adcs x10, x14, x6\n\t"
+ "adc x11, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x11, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x8, x8, x25\n\t"
+ "sbcs x9, x9, x28\n\t"
+ "sbcs x10, x10, x28\n\t"
+ "sbc x11, x11, x26\n\t"
+ /* Sub */
+ "subs x16, x12, x4\n\t"
+ "sbcs x17, x13, x5\n\t"
+ "sbcs x19, x14, x6\n\t"
+ "sbcs x20, x15, x7\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x8, x9, [x0]\n\t"
+ "stp x10, x11, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldr x0, [x29, #48]\n\t"
+ "ldr x1, [x29, #64]\n\t"
+ "ldr x2, [x29, #160]\n\t"
+ /* Multiply */
+ "ldp x12, x13, [x1]\n\t"
+ "ldp x14, x15, [x1, #16]\n\t"
+ "ldp x16, x17, [x2]\n\t"
+ "ldp x19, x20, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x4, x12, x16\n\t"
+ "umulh x5, x12, x16\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x12, x17\n\t"
+ "umulh x6, x12, x17\n\t"
+ "adds x5, x5, x25\n\t"
+ "adc x6, x6, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x13, x16\n\t"
+ "umulh x26, x13, x16\n\t"
+ "adds x5, x5, x25\n\t"
+ "adcs x6, x6, x26\n\t"
+ "adc x7, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x12, x19\n\t"
+ "umulh x26, x12, x19\n\t"
+ "adds x6, x6, x25\n\t"
+ "adc x7, x7, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x13, x17\n\t"
+ "umulh x26, x13, x17\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x14, x16\n\t"
+ "umulh x26, x14, x16\n\t"
+ "adds x6, x6, x25\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x8, x8, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x12, x20\n\t"
+ "umulh x26, x12, x20\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x13, x19\n\t"
+ "umulh x26, x13, x19\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x14, x17\n\t"
+ "umulh x26, x14, x17\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x15, x16\n\t"
+ "umulh x26, x15, x16\n\t"
+ "adds x7, x7, x25\n\t"
+ "adcs x8, x8, x26\n\t"
+ "adc x9, x9, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x13, x20\n\t"
+ "umulh x26, x13, x20\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x14, x19\n\t"
+ "umulh x26, x14, x19\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x15, x17\n\t"
+ "umulh x26, x15, x17\n\t"
+ "adds x8, x8, x25\n\t"
+ "adcs x9, x9, x26\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x14, x20\n\t"
+ "umulh x26, x14, x20\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x15, x19\n\t"
+ "umulh x26, x15, x19\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x15, x20\n\t"
+ "umulh x26, x15, x20\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x11, x11, x10, #63\n\t"
+ "extr x10, x10, x9, #63\n\t"
+ "extr x9, x9, x8, #63\n\t"
+ "extr x8, x8, x7, #63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x8\n\t"
+ "umulh x8, x25, x8\n\t"
+ "adds x4, x4, x26\n\t"
+ "mul x26, x25, x9\n\t"
+ "umulh x9, x25, x9\n\t"
+ "adcs x5, x5, x26\n\t"
+ "mul x26, x25, x10\n\t"
+ "umulh x10, x25, x10\n\t"
+ "adcs x6, x6, x26\n\t"
+ "mul x26, x25, x11\n\t"
+ "umulh x27, x25, x11\n\t"
+ "adcs x7, x7, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x5, x5, x8\n\t"
+ "adcs x6, x6, x9\n\t"
+ "adcs x7, x7, x10\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x7, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x7, asr 63\n\t"
+ "and x7, x7, #0x7fffffffffffffff\n\t"
+ "adds x4, x4, x27\n\t"
+ "adcs x5, x5, xzr\n\t"
+ "adcs x6, x6, xzr\n\t"
+ "adc x7, x7, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #48]\n\t"
+ /* Double */
+ "adds x4, x4, x4\n\t"
+ "adcs x5, x5, x5\n\t"
+ "adcs x6, x6, x6\n\t"
+ "adc x7, x7, x7\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x7, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x4, x4, x25\n\t"
+ "sbcs x5, x5, x28\n\t"
+ "sbcs x6, x6, x28\n\t"
+ "sbc x7, x7, x26\n\t"
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #168]\n\t"
+ "ldr x2, [x29, #72]\n\t"
+ /* Multiply */
+ "ldp x16, x17, [x1]\n\t"
+ "ldp x19, x20, [x1, #16]\n\t"
+ "ldp x21, x22, [x2]\n\t"
+ "ldp x23, x24, [x2, #16]\n\t"
+ /* A[0] * B[0] */
+ "mul x8, x16, x21\n\t"
+ "umulh x9, x16, x21\n\t"
+ /* A[0] * B[1] */
+ "mul x25, x16, x22\n\t"
+ "umulh x10, x16, x22\n\t"
+ "adds x9, x9, x25\n\t"
+ "adc x10, x10, xzr\n\t"
+ /* A[1] * B[0] */
+ "mul x25, x17, x21\n\t"
+ "umulh x26, x17, x21\n\t"
+ "adds x9, x9, x25\n\t"
+ "adcs x10, x10, x26\n\t"
+ "adc x11, xzr, xzr\n\t"
+ /* A[0] * B[2] */
+ "mul x25, x16, x23\n\t"
+ "umulh x26, x16, x23\n\t"
+ "adds x10, x10, x25\n\t"
+ "adc x11, x11, x26\n\t"
+ /* A[1] * B[1] */
+ "mul x25, x17, x22\n\t"
+ "umulh x26, x17, x22\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, xzr, xzr\n\t"
+ /* A[2] * B[0] */
+ "mul x25, x19, x21\n\t"
+ "umulh x26, x19, x21\n\t"
+ "adds x10, x10, x25\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x12, x12, xzr\n\t"
+ /* A[0] * B[3] */
+ "mul x25, x16, x24\n\t"
+ "umulh x26, x16, x24\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, xzr, xzr\n\t"
+ /* A[1] * B[2] */
+ "mul x25, x17, x23\n\t"
+ "umulh x26, x17, x23\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[2] * B[1] */
+ "mul x25, x19, x22\n\t"
+ "umulh x26, x19, x22\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[3] * B[0] */
+ "mul x25, x20, x21\n\t"
+ "umulh x26, x20, x21\n\t"
+ "adds x11, x11, x25\n\t"
+ "adcs x12, x12, x26\n\t"
+ "adc x13, x13, xzr\n\t"
+ /* A[1] * B[3] */
+ "mul x25, x17, x24\n\t"
+ "umulh x26, x17, x24\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, xzr, xzr\n\t"
+ /* A[2] * B[2] */
+ "mul x25, x19, x23\n\t"
+ "umulh x26, x19, x23\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[3] * B[1] */
+ "mul x25, x20, x22\n\t"
+ "umulh x26, x20, x22\n\t"
+ "adds x12, x12, x25\n\t"
+ "adcs x13, x13, x26\n\t"
+ "adc x14, x14, xzr\n\t"
+ /* A[2] * B[3] */
+ "mul x25, x19, x24\n\t"
+ "umulh x26, x19, x24\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, xzr, xzr\n\t"
+ /* A[3] * B[2] */
+ "mul x25, x20, x23\n\t"
+ "umulh x26, x20, x23\n\t"
+ "adds x13, x13, x25\n\t"
+ "adcs x14, x14, x26\n\t"
+ "adc x15, x15, xzr\n\t"
+ /* A[3] * B[3] */
+ "mul x25, x20, x24\n\t"
+ "umulh x26, x20, x24\n\t"
+ "adds x14, x14, x25\n\t"
+ "adc x15, x15, x26\n\t"
+ /* Reduce */
+ /* Move top half into t4-t7 and remove top bit from t3 */
+ "extr x15, x15, x14, #63\n\t"
+ "extr x14, x14, x13, #63\n\t"
+ "extr x13, x13, x12, #63\n\t"
+ "extr x12, x12, x11, #63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ /* Multiply top half by 19 */
+ "mov x25, #19\n\t"
+ "mul x26, x25, x12\n\t"
+ "umulh x12, x25, x12\n\t"
+ "adds x8, x8, x26\n\t"
+ "mul x26, x25, x13\n\t"
+ "umulh x13, x25, x13\n\t"
+ "adcs x9, x9, x26\n\t"
+ "mul x26, x25, x14\n\t"
+ "umulh x14, x25, x14\n\t"
+ "adcs x10, x10, x26\n\t"
+ "mul x26, x25, x15\n\t"
+ "umulh x27, x25, x15\n\t"
+ "adcs x11, x11, x26\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Add remaining product results in */
+ "adds x9, x9, x12\n\t"
+ "adcs x10, x10, x13\n\t"
+ "adcs x11, x11, x14\n\t"
+ "adc x27, x27, xzr\n\t"
+ /* Overflow */
+ "extr x27, x27, x11, #63\n\t"
+ "mul x27, x27, x25\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Reduce if top bit set */
+ "and x27, x25, x11, asr 63\n\t"
+ "and x11, x11, #0x7fffffffffffffff\n\t"
+ "adds x8, x8, x27\n\t"
+ "adcs x9, x9, xzr\n\t"
+ "adcs x10, x10, xzr\n\t"
+ "adc x11, x11, xzr\n\t"
+ /* Store */
+ "ldr x0, [x29, #40]\n\t"
+ "ldr x1, [x29, #32]\n\t"
+ /* Add */
+ "adds x12, x4, x8\n\t"
+ "adcs x13, x5, x9\n\t"
+ "adcs x14, x6, x10\n\t"
+ "adc x15, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "asr x28, x15, #63\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Sub modulus (if overflow) */
+ "subs x12, x12, x25\n\t"
+ "sbcs x13, x13, x28\n\t"
+ "sbcs x14, x14, x28\n\t"
+ "sbc x15, x15, x26\n\t"
+ /* Sub */
+ "subs x16, x4, x8\n\t"
+ "sbcs x17, x5, x9\n\t"
+ "sbcs x19, x6, x10\n\t"
+ "sbcs x20, x7, x11\n\t"
+ "mov x25, #-19\n\t"
+ "csetm x28, cc\n\t"
+ /* Mask the modulus */
+ "and x25, x28, x25\n\t"
+ "and x26, x28, #0x7fffffffffffffff\n\t"
+ /* Add modulus (if underflow) */
+ "adds x16, x16, x25\n\t"
+ "adcs x17, x17, x28\n\t"
+ "adcs x19, x19, x28\n\t"
+ "adc x20, x20, x26\n\t"
+ "stp x12, x13, [x0]\n\t"
+ "stp x14, x15, [x0, #16]\n\t"
+ "stp x16, x17, [x1]\n\t"
+ "stp x19, x20, [x1, #16]\n\t"
+ "ldp x29, x30, [sp], #0x50\n\t"
+ : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
+ :
+ : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+ (void)qz;
+ (void)qt2d;
+ (void)qyplusx;
+ (void)qyminusx;
+}
+
+#endif /* WOLFSSL_ARMASM */
+#endif /* __aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c
new file mode 100644
index 0000000..3df07f7
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-poly1305.c
@@ -0,0 +1,1166 @@
+/* armv8-poly1305.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/*
+ * Based on the public-domain implementations by Andrew Moon
+ * and Daniel J. Bernstein
+ */
+
+
+#ifdef __aarch64__
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#ifdef HAVE_POLY1305
+#include <wolfssl/wolfcrypt/poly1305.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/cpuid.h>
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+#ifdef CHACHA_AEAD_TEST
+ #include <stdio.h>
+#endif
+
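+/* Scalar routine operating on 16-byte blocks. h and r are kept in
+ * radix 2^26 (five 26-bit limbs per 130-bit value) so that limb products
+ * and carries fit comfortably in 64-bit registers. As a rough C sketch of
+ * the block load performed by the LDP/EXTR/LSR sequence below, where
+ * t0/t1 are the two little-endian 64-bit words of a block and hibit is
+ * 1 << 24 for full blocks (illustrative only):
+ *
+ *     h0 += ( t0                     ) & 0x3ffffff;
+ *     h1 += ( t0 >> 26               ) & 0x3ffffff;
+ *     h2 += ((t0 >> 52) | (t1 << 12) ) & 0x3ffffff;
+ *     h3 += ( t1 >> 14               ) & 0x3ffffff;
+ *     h4 += ( t1 >> 40               ) | hibit;
+ */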
+static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
+ size_t bytes)
+{
+    word64 fin = (word64)ctx->finished;
+
+    __asm__ __volatile__ (
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
+ "BLO L_poly1305_16_64_done_%= \n\t"
+ /* Load r and h */
+ "LDP x21, x23, %[ctx_r] \n\t"
+ "LDR w25, %[ctx_r_4] \n\t"
+ "LDP x2, x4, %[ctx_h] \n\t"
+ "LDR w6, %[ctx_h_4] \n\t"
+ "LSR x22, x21, #32 \n\t"
+ "LSR x24, x23, #32 \n\t"
+ "LSR x3, x2, #32 \n\t"
+ "LSR x5, x4, #32 \n\t"
+ "AND x21, x21, #0x3ffffff \n\t"
+ "AND x23, x23, #0x3ffffff \n\t"
+ "AND x2, x2, #0x3ffffff \n\t"
+ "AND x4, x4, #0x3ffffff \n\t"
+ /* s1 = r1 * 5; */
+ /* s2 = r2 * 5; */
+ /* s3 = r3 * 5; */
+ /* s4 = r4 * 5; */
+ "MOV x15, #5 \n\t"
+ "CMP %[finished], #0 \n\t"
+ "MUL w7, w22, w15 \n\t"
+ "CSET %[finished], EQ \n\t"
+ "MUL w8, w23, w15 \n\t"
+ "LSL %[finished], %[finished], #24 \n\t"
+ "MUL w9, w24, w15 \n\t"
+ "MOV x14, #0x3ffffff \n\t"
+ "MUL w10, w25, w15 \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_16_64_loop_%=: \n\t"
+ /* t0 = U8TO64(&m[0]); */
+ /* t1 = U8TO64(&m[8]); */
+ "LDP x16, x17, [%[m]], #16 \n\t"
+ /* h0 += (U8TO32(m + 0)) & 0x3ffffff; */
+ "AND x26, x16, #0x3ffffff \n\t"
+ "ADD x2, x2, x26 \n\t"
+ /* h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff; */
+ "AND x26, x14, x16, LSR #26 \n\t"
+ "ADD x3, x3, x26 \n\t"
+ /* h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff; */
+ "EXTR x26, x17, x16, #52 \n\t"
+ "AND x26, x26, #0x3ffffff \n\t"
+ "ADD x4, x4, x26 \n\t"
+ /* h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff; */
+ "AND x26, x14, x17, LSR #14 \n\t"
+ "ADD x5, x5, x26 \n\t"
+ /* h4 += (U8TO32(m + 12) >> 8) | hibit; */
+ "ORR x17, %[finished], x17, LSR #40 \n\t"
+ "ADD x6, x6, x17 \n\t"
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x16, x2, x21 \n\t"
+ "MUL x17, x2, x22 \n\t"
+ "MUL x26, x2, x23 \n\t"
+ "MUL x19, x2, x24 \n\t"
+ "MUL x20, x2, x25 \n\t"
+ "MADD x16, x3, x10, x16 \n\t"
+ "MADD x17, x3, x21, x17 \n\t"
+ "MADD x26, x3, x22, x26 \n\t"
+ "MADD x19, x3, x23, x19 \n\t"
+ "MADD x20, x3, x24, x20 \n\t"
+ "MADD x16, x4, x9, x16 \n\t"
+ "MADD x17, x4, x10, x17 \n\t"
+ "MADD x26, x4, x21, x26 \n\t"
+ "MADD x19, x4, x22, x19 \n\t"
+ "MADD x20, x4, x23, x20 \n\t"
+ "MADD x16, x5, x8, x16 \n\t"
+ "MADD x17, x5, x9, x17 \n\t"
+ "MADD x26, x5, x10, x26 \n\t"
+ "MADD x19, x5, x21, x19 \n\t"
+ "MADD x20, x5, x22, x20 \n\t"
+ "MADD x16, x6, x7, x16 \n\t"
+ "MADD x17, x6, x8, x17 \n\t"
+ "MADD x26, x6, x9, x26 \n\t"
+ "MADD x19, x6, x10, x19 \n\t"
+ "MADD x20, x6, x21, x20 \n\t"
+ /* d1 = d1 + d0 >> 26 */
+ /* d2 = d2 + d1 >> 26 */
+ /* d3 = d3 + d2 >> 26 */
+ /* d4 = d4 + d3 >> 26 */
+ /* h0 = d0 & 0x3ffffff */
+ /* h1 = d1 & 0x3ffffff */
+ /* h2 = d2 & 0x3ffffff */
+ /* h0 = h0 + (d4 >> 26) * 5 */
+ /* h1 = h1 + h0 >> 26 */
+ /* h3 = d3 & 0x3ffffff */
+ /* h4 = d4 & 0x3ffffff */
+ /* h0 = h0 & 0x3ffffff */
+ "ADD x17, x17, x16, LSR #26 \n\t"
+ "ADD x20, x20, x19, LSR #26 \n\t"
+ "AND x16, x16, #0x3ffffff \n\t"
+ "LSR x2, x20, #26 \n\t"
+ "AND x19, x19, #0x3ffffff \n\t"
+ "MADD x16, x2, x15, x16 \n\t"
+ "ADD x26, x26, x17, LSR #26 \n\t"
+ "AND x17, x17, #0x3ffffff \n\t"
+ "AND x20, x20, #0x3ffffff \n\t"
+ "ADD x19, x19, x26, LSR #26 \n\t"
+ "AND x4, x26, #0x3ffffff \n\t"
+ "ADD x3, x17, x16, LSR #26 \n\t"
+ "AND x2, x16, #0x3ffffff \n\t"
+ "ADD x6, x20, x19, LSR #26 \n\t"
+ "AND x5, x19, #0x3ffffff \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
+ "BHS L_poly1305_16_64_loop_%= \n\t"
+ /* Store h */
+ "ORR x2, x2, x3, LSL #32 \n\t"
+ "ORR x4, x4, x5, LSL #32 \n\t"
+ "STP x2, x4, %[ctx_h] \n\t"
+ "STR w6, %[ctx_h_4] \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_16_64_done_%=: \n\t"
+        : [ctx_h] "+m" (ctx->h[0]),
+          [ctx_h_4] "+m" (ctx->h[4]),
+          [bytes] "+r" (bytes),
+          [m] "+r" (m),
+          /* written by the CSET/LSL above, so it must be an output */
+          [finished] "+r" (fin)
+        : [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
+          [ctx_r] "m" (ctx->r[0]),
+          [ctx_r_4] "m" (ctx->r[4])
+ : "memory", "cc",
+ "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w15",
+ "w21", "w22", "w23", "w24", "w25", "x2", "x3", "x4", "x5", "x6",
+ "x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x19", "x20",
+ "x21", "x22", "x23", "x24", "x25", "x26"
+ );
+}
+
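+/* Multi-block NEON path. One step of Horner's rule over four blocks
+ * m1..m4 expands to
+ *
+ *     h' = ((((h + m1)*r + m2)*r + m3)*r + m4)*r
+ *        = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r
+ *
+ * so two independent two-lane streams can be multiplied by r^4 (or r^2
+ * in the two-block loop) and recombined with a final multiply by
+ * [r^2, r]. The general-purpose registers carry ctx->h through the same
+ * powers in parallel, and the two sums are merged before the final
+ * reduction. */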
+void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
+ size_t bytes)
+{
+ __asm__ __volatile__ (
+        /* If fewer than 4 blocks to process then use the scalar method */
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+ "BLO L_poly1305_64_done_%= \n\t"
+ "MOV x9, #0x3ffffff \n\t"
+ /* Load h */
+ "LDP x20, x22, [%[h]] \n\t"
+ "MOV v27.D[0], x9 \n\t"
+ "LDR w24, [%[h], #16] \n\t"
+ "MOV v27.D[1], x9 \n\t"
+ "LSR x21, x20, #32 \n\t"
+ "DUP v29.4S, v27.S[0] \n\t"
+ "LSR x23, x22, #32 \n\t"
+ "MOV x9, #5 \n\t"
+ "AND x20, x20, #0x3ffffff \n\t"
+ "MOV v28.D[0], x9 \n\t"
+ "AND x22, x22, #0x3ffffff \n\t"
+ /* Zero accumulator registers */
+ "MOVI v15.2D, #0x0 \n\t"
+ "MOVI v16.2D, #0x0 \n\t"
+ "MOVI v17.2D, #0x0 \n\t"
+ "MOVI v18.2D, #0x0 \n\t"
+ "MOVI v19.2D, #0x0 \n\t"
+ /* Set hibit */
+ "CMP %[finished], #0 \n\t"
+ "CSET x9, EQ \n\t"
+ "LSL x9, x9, #24 \n\t"
+ "MOV v26.D[0], x9 \n\t"
+ "MOV v26.D[1], x9 \n\t"
+ "DUP v30.4S, v26.S[0] \n\t"
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
+ "BLO L_poly1305_64_start_block_size_64_%= \n\t"
+ /* Load r^2 to NEON v0, v1, v2, v3, v4 */
+ "LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t"
+ "LD1 { v4.S }[2], [%[r_2]] \n\t"
+ "SUB %[r_2], %[r_2], #16 \n\t"
+ /* Load r^4 to NEON v0, v1, v2, v3, v4 */
+ "LD4 { v0.S-v3.S }[0], [%[r_4]], #16 \n\t"
+ "LD1 { v4.S }[0], [%[r_4]] \n\t"
+ "SUB %[r_4], %[r_4], #16 \n\t"
+ "MOV v0.S[1], v0.S[0] \n\t"
+ "MOV v0.S[3], v0.S[2] \n\t"
+ "MOV v1.S[1], v1.S[0] \n\t"
+ "MOV v1.S[3], v1.S[2] \n\t"
+ "MOV v2.S[1], v2.S[0] \n\t"
+ "MOV v2.S[3], v2.S[2] \n\t"
+ "MOV v3.S[1], v3.S[0] \n\t"
+ "MOV v3.S[3], v3.S[2] \n\t"
+ "MOV v4.S[1], v4.S[0] \n\t"
+ "MOV v4.S[3], v4.S[2] \n\t"
+ /* Store [r^4, r^2] * 5 */
+ "MUL v5.4S, v0.4S, v28.S[0] \n\t"
+ "MUL v6.4S, v1.4S, v28.S[0] \n\t"
+ "MUL v7.4S, v2.4S, v28.S[0] \n\t"
+ "MUL v8.4S, v3.4S, v28.S[0] \n\t"
+ "MUL v9.4S, v4.4S, v28.S[0] \n\t"
+ /* Copy r^4 to ARM */
+ "MOV w25, v0.S[0] \n\t"
+ "MOV w26, v1.S[0] \n\t"
+ "MOV w27, v2.S[0] \n\t"
+ "MOV w28, v3.S[0] \n\t"
+ "MOV w30, v4.S[0] \n\t"
+ /* Copy 5*r^4 to ARM */
+ "MOV w15, v5.S[0] \n\t"
+ "MOV w16, v6.S[0] \n\t"
+ "MOV w17, v7.S[0] \n\t"
+ "MOV w8, v8.S[0] \n\t"
+ "MOV w19, v9.S[0] \n\t"
+ /* Load m */
+ /* Load four message blocks to NEON v10, v11, v12, v13, v14 */
+ "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+ "USHR v14.4S, v13.4S, #8 \n\t"
+ "ORR v14.16B, v14.16B, v30.16B \n\t"
+ "SHL v13.4S, v13.4S, #18 \n\t"
+ "SRI v13.4S, v12.4S, #14 \n\t"
+ "SHL v12.4S, v12.4S, #12 \n\t"
+ "SRI v12.4S, v11.4S, #20 \n\t"
+ "SHL v11.4S, v11.4S, #6 \n\t"
+ "SRI v11.4S, v10.4S, #26 \n\t"
+ "AND v10.16B, v10.16B, v29.16B \n\t"
+ "AND v11.16B, v11.16B, v29.16B \n\t"
+ "AND v12.16B, v12.16B, v29.16B \n\t"
+ "AND v13.16B, v13.16B, v29.16B \n\t"
+ "AND v14.16B, v14.16B, v29.16B \n\t"
+ /* Four message blocks loaded */
+ /* Add messages to accumulator */
+ "ADD v15.2S, v15.2S, v10.2S \n\t"
+ "ADD v16.2S, v16.2S, v11.2S \n\t"
+ "ADD v17.2S, v17.2S, v12.2S \n\t"
+ "ADD v18.2S, v18.2S, v13.2S \n\t"
+ "ADD v19.2S, v19.2S, v14.2S \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_loop_128_%=: \n\t"
+ /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
+ /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
+ /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
+ /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */
+ /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */
+ "UMULL v21.2D, v15.2S, v0.2S \n\t"
+ /* Compute h*r^2 */
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x9, x20, x25 \n\t"
+ "UMULL v22.2D, v15.2S, v1.2S \n\t"
+ "MUL x10, x20, x26 \n\t"
+ "UMULL v23.2D, v15.2S, v2.2S \n\t"
+ "MUL x11, x20, x27 \n\t"
+ "UMULL v24.2D, v15.2S, v3.2S \n\t"
+ "MUL x12, x20, x28 \n\t"
+ "UMULL v25.2D, v15.2S, v4.2S \n\t"
+ "MUL x13, x20, x30 \n\t"
+ "UMLAL v21.2D, v16.2S, v9.2S \n\t"
+ "MADD x9, x21, x19, x9 \n\t"
+ "UMLAL v22.2D, v16.2S, v0.2S \n\t"
+ "MADD x10, x21, x25, x10 \n\t"
+ "UMLAL v23.2D, v16.2S, v1.2S \n\t"
+ "MADD x11, x21, x26, x11 \n\t"
+ "UMLAL v24.2D, v16.2S, v2.2S \n\t"
+ "MADD x12, x21, x27, x12 \n\t"
+ "UMLAL v25.2D, v16.2S, v3.2S \n\t"
+ "MADD x13, x21, x28, x13 \n\t"
+ "UMLAL v21.2D, v17.2S, v8.2S \n\t"
+ "MADD x9, x22, x8, x9 \n\t"
+ "UMLAL v22.2D, v17.2S, v9.2S \n\t"
+ "MADD x10, x22, x19, x10 \n\t"
+ "UMLAL v23.2D, v17.2S, v0.2S \n\t"
+ "MADD x11, x22, x25, x11 \n\t"
+ "UMLAL v24.2D, v17.2S, v1.2S \n\t"
+ "MADD x12, x22, x26, x12 \n\t"
+ "UMLAL v25.2D, v17.2S, v2.2S \n\t"
+ "MADD x13, x22, x27, x13 \n\t"
+ "UMLAL v21.2D, v18.2S, v7.2S \n\t"
+ "MADD x9, x23, x17, x9 \n\t"
+ "UMLAL v22.2D, v18.2S, v8.2S \n\t"
+ "MADD x10, x23, x8, x10 \n\t"
+ "UMLAL v23.2D, v18.2S, v9.2S \n\t"
+ "MADD x11, x23, x19, x11 \n\t"
+ "UMLAL v24.2D, v18.2S, v0.2S \n\t"
+ "MADD x12, x23, x25, x12 \n\t"
+ "UMLAL v25.2D, v18.2S, v1.2S \n\t"
+ "MADD x13, x23, x26, x13 \n\t"
+ "UMLAL v21.2D, v19.2S, v6.2S \n\t"
+ "MADD x9, x24, x16, x9 \n\t"
+ "UMLAL v22.2D, v19.2S, v7.2S \n\t"
+ "MADD x10, x24, x17, x10 \n\t"
+ "UMLAL v23.2D, v19.2S, v8.2S \n\t"
+ "MADD x11, x24, x8, x11 \n\t"
+ "UMLAL v24.2D, v19.2S, v9.2S \n\t"
+ "MADD x12, x24, x19, x12 \n\t"
+ "UMLAL v25.2D, v19.2S, v0.2S \n\t"
+ "MADD x13, x24, x25, x13 \n\t"
+ /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
+ /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
+ /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
+ /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */
+ /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */
+ "UMLAL2 v21.2D, v10.4S, v0.4S \n\t"
+ /* Reduce h % P */
+ "MOV x14, #5 \n\t"
+ "UMLAL2 v22.2D, v10.4S, v1.4S \n\t"
+ "ADD x10, x10, x9, LSR #26 \n\t"
+ "UMLAL2 v23.2D, v10.4S, v2.4S \n\t"
+ "ADD x13, x13, x12, LSR #26 \n\t"
+ "UMLAL2 v24.2D, v10.4S, v3.4S \n\t"
+ "AND x9, x9, #0x3ffffff \n\t"
+ "UMLAL2 v25.2D, v10.4S, v4.4S \n\t"
+ "LSR x20, x13, #26 \n\t"
+ "UMLAL2 v21.2D, v11.4S, v9.4S \n\t"
+ "AND x12, x12, #0x3ffffff \n\t"
+ "UMLAL2 v22.2D, v11.4S, v0.4S \n\t"
+ "MADD x9, x20, x14, x9 \n\t"
+ "UMLAL2 v23.2D, v11.4S, v1.4S \n\t"
+ "ADD x11, x11, x10, LSR #26 \n\t"
+ "UMLAL2 v24.2D, v11.4S, v2.4S \n\t"
+ "AND x10, x10, #0x3ffffff \n\t"
+ "UMLAL2 v25.2D, v11.4S, v3.4S \n\t"
+ "AND x13, x13, #0x3ffffff \n\t"
+ "UMLAL2 v21.2D, v12.4S, v8.4S \n\t"
+ "ADD x12, x12, x11, LSR #26 \n\t"
+ "UMLAL2 v22.2D, v12.4S, v9.4S \n\t"
+ "AND x22, x11, #0x3ffffff \n\t"
+ "UMLAL2 v23.2D, v12.4S, v0.4S \n\t"
+ "ADD x21, x10, x9, LSR #26 \n\t"
+ "UMLAL2 v24.2D, v12.4S, v1.4S \n\t"
+ "AND x20, x9, #0x3ffffff \n\t"
+ "UMLAL2 v25.2D, v12.4S, v2.4S \n\t"
+ "ADD x24, x13, x12, LSR #26 \n\t"
+ "UMLAL2 v21.2D, v13.4S, v7.4S \n\t"
+ "AND x23, x12, #0x3ffffff \n\t"
+ "UMLAL2 v22.2D, v13.4S, v8.4S \n\t"
+ "UMLAL2 v23.2D, v13.4S, v9.4S \n\t"
+ "UMLAL2 v24.2D, v13.4S, v0.4S \n\t"
+ "UMLAL2 v25.2D, v13.4S, v1.4S \n\t"
+ "UMLAL2 v21.2D, v14.4S, v6.4S \n\t"
+ "UMLAL2 v22.2D, v14.4S, v7.4S \n\t"
+ "UMLAL2 v23.2D, v14.4S, v8.4S \n\t"
+ "UMLAL2 v24.2D, v14.4S, v9.4S \n\t"
+ "UMLAL2 v25.2D, v14.4S, v0.4S \n\t"
+        /* If fewer than six message blocks are left then leave the loop */
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
+ "BLS L_poly1305_64_loop_128_final_%= \n\t"
+ /* Load m */
+ /* Load four message blocks to NEON v10, v11, v12, v13, v14 */
+ "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
+ "USHR v14.4S, v13.4S, #8 \n\t"
+ "ORR v14.16B, v14.16B, v30.16B \n\t"
+ "SHL v13.4S, v13.4S, #18 \n\t"
+ "SRI v13.4S, v12.4S, #14 \n\t"
+ "SHL v12.4S, v12.4S, #12 \n\t"
+ "SRI v12.4S, v11.4S, #20 \n\t"
+ "SHL v11.4S, v11.4S, #6 \n\t"
+ "SRI v11.4S, v10.4S, #26 \n\t"
+ "AND v10.16B, v10.16B, v29.16B \n\t"
+ "AND v11.16B, v11.16B, v29.16B \n\t"
+ "AND v12.16B, v12.16B, v29.16B \n\t"
+ "AND v13.16B, v13.16B, v29.16B \n\t"
+ "AND v14.16B, v14.16B, v29.16B \n\t"
+ /* Four message blocks loaded */
+ /* Add new message block to accumulator */
+ "UADDW v21.2D, v21.2D, v10.2S \n\t"
+ "UADDW v22.2D, v22.2D, v11.2S \n\t"
+ "UADDW v23.2D, v23.2D, v12.2S \n\t"
+ "UADDW v24.2D, v24.2D, v13.2S \n\t"
+ "UADDW v25.2D, v25.2D, v14.2S \n\t"
+ /* Reduce radix 26 NEON */
+ /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
+ /* with h3 -> h4 -> h0 -> h1 */
+ "USRA v22.2D, v21.2D, #26 \n\t"
+ "AND v21.16B, v21.16B, v27.16B \n\t"
+ "USRA v25.2D, v24.2D, #26 \n\t"
+ "AND v24.16B, v24.16B, v27.16B \n\t"
+ "USHR v15.2D, v25.2D, #26 \n\t"
+ "USRA v23.2D, v22.2D, #26 \n\t"
+ /* Simulate multiplying by 5 using adding and shifting */
+ "SHL v18.2D, v15.2D, #2 \n\t"
+ "AND v16.16B, v22.16B, v27.16B \n\t"
+ "ADD v18.2D, v18.2D, v15.2D \n\t"
+ "AND v19.16B, v25.16B, v27.16B \n\t"
+ "ADD v21.2D, v21.2D, v18.2D \n\t"
+ "USRA v24.2D, v23.2D, #26 \n\t"
+ "AND v17.16B, v23.16B, v27.16B \n\t"
+ "USRA v16.2D, v21.2D, #26 \n\t"
+ "AND v15.16B, v21.16B, v27.16B \n\t"
+ "USRA v19.2D, v24.2D, #26 \n\t"
+ "AND v18.16B, v24.16B, v27.16B \n\t"
+ /* Copy values to lower halves of result registers */
+ "MOV v15.S[1], v15.S[2] \n\t"
+ "MOV v16.S[1], v16.S[2] \n\t"
+ "MOV v17.S[1], v17.S[2] \n\t"
+ "MOV v18.S[1], v18.S[2] \n\t"
+ "MOV v19.S[1], v19.S[2] \n\t"
+ "B L_poly1305_64_loop_128_%= \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_loop_128_final_%=: \n\t"
+ /* Load m */
+ /* Load two message blocks to NEON v10, v11, v12, v13, v14 */
+ "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
+ /* Copy r^2 to lower half of registers */
+ "MOV v0.D[0], v0.D[1] \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+ "MOV v5.D[0], v5.D[1] \n\t"
+ "USHR v14.2D, v11.2D, #40 \n\t"
+ "MOV v1.D[0], v1.D[1] \n\t"
+ "ORR v14.16B, v14.16B, v26.16B \n\t"
+ "MOV v6.D[0], v6.D[1] \n\t"
+ "USHR v13.2D, v11.2D, #14 \n\t"
+ "MOV v2.D[0], v2.D[1] \n\t"
+ "AND v13.16B, v13.16B, v27.16B \n\t"
+ "MOV v7.D[0], v7.D[1] \n\t"
+ "SHL v12.2D, v11.2D, #12 \n\t"
+ "MOV v3.D[0], v3.D[1] \n\t"
+ "SRI v12.2D, v10.2D, #52 \n\t"
+ "MOV v8.D[0], v8.D[1] \n\t"
+ "AND v12.16B, v12.16B, v27.16B \n\t"
+ "MOV v4.D[0], v4.D[1] \n\t"
+ "USHR v11.2D, v10.2D, #26 \n\t"
+ "MOV v9.D[0], v9.D[1] \n\t"
+ "AND v11.16B, v11.16B, v27.16B \n\t"
+ /* Copy r^2 to ARM */
+ "MOV w25, v0.S[2] \n\t"
+ "AND v10.16B, v10.16B, v27.16B \n\t"
+ "MOV w26, v1.S[2] \n\t"
+ /* Two message blocks loaded */
+ /* Add last messages */
+ "ADD v21.2D, v21.2D, v10.2D \n\t"
+ "MOV w27, v2.S[2] \n\t"
+ "ADD v22.2D, v22.2D, v11.2D \n\t"
+ "MOV w28, v3.S[2] \n\t"
+ "ADD v23.2D, v23.2D, v12.2D \n\t"
+ "MOV w30, v4.S[2] \n\t"
+ "ADD v24.2D, v24.2D, v13.2D \n\t"
+ /* Copy 5*r^2 to ARM */
+ "MOV w15, v5.S[2] \n\t"
+ "ADD v25.2D, v25.2D, v14.2D \n\t"
+ "MOV w16, v6.S[2] \n\t"
+ /* Reduce message to be ready for next multiplication */
+ /* Reduce radix 26 NEON */
+ /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
+ /* with h3 -> h4 -> h0 -> h1 */
+ "USRA v22.2D, v21.2D, #26 \n\t"
+ "MOV w17, v7.S[2] \n\t"
+ "AND v21.16B, v21.16B, v27.16B \n\t"
+ "MOV w8, v8.S[2] \n\t"
+ "USRA v25.2D, v24.2D, #26 \n\t"
+ "MOV w19, v9.S[2] \n\t"
+ "AND v24.16B, v24.16B, v27.16B \n\t"
+ "USHR v15.2D, v25.2D, #26 \n\t"
+ "USRA v23.2D, v22.2D, #26 \n\t"
+ /* Simulate multiplying by 5 using adding and shifting */
+ "SHL v18.2D, v15.2D, #2 \n\t"
+ "AND v16.16B, v22.16B, v27.16B \n\t"
+ "ADD v18.2D, v18.2D, v15.2D \n\t"
+ "AND v19.16B, v25.16B, v27.16B \n\t"
+ "ADD v21.2D, v21.2D, v18.2D \n\t"
+ "USRA v24.2D, v23.2D, #26 \n\t"
+ "AND v17.16B, v23.16B, v27.16B \n\t"
+ "USRA v16.2D, v21.2D, #26 \n\t"
+ "AND v15.16B, v21.16B, v27.16B \n\t"
+ "USRA v19.2D, v24.2D, #26 \n\t"
+ "AND v18.16B, v24.16B, v27.16B \n\t"
+ /* Copy values to lower halves of result registers */
+ "MOV v15.S[1], v15.S[2] \n\t"
+ "MOV v16.S[1], v16.S[2] \n\t"
+ "MOV v17.S[1], v17.S[2] \n\t"
+ "MOV v18.S[1], v18.S[2] \n\t"
+ "MOV v19.S[1], v19.S[2] \n\t"
+        /* If fewer than 2 blocks are left, go straight to the final multiplication. */
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+ "BLO L_poly1305_64_last_mult_%= \n\t"
+ /* Else go to one loop of L_poly1305_64_loop_64 */
+ "B L_poly1305_64_loop_64_%= \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_start_block_size_64_%=: \n\t"
+ /* Load r^2 to NEON v0, v1, v2, v3, v4 */
+ "LD4R { v0.2S-v3.2S }, [%[r_2]], #16 \n\t"
+ "LD1R { v4.2S }, [%[r_2]] \n\t"
+ "SUB %[r_2], %[r_2], #16 \n\t"
+ /* Store r^2 * 5 */
+ "MUL v5.4S, v0.4S, v28.S[0] \n\t"
+ "MUL v6.4S, v1.4S, v28.S[0] \n\t"
+ "MUL v7.4S, v2.4S, v28.S[0] \n\t"
+ "MUL v8.4S, v3.4S, v28.S[0] \n\t"
+ "MUL v9.4S, v4.4S, v28.S[0] \n\t"
+ /* Copy r^2 to ARM */
+ "MOV w25, v0.S[0] \n\t"
+ "MOV w26, v1.S[0] \n\t"
+ "MOV w27, v2.S[0] \n\t"
+ "MOV w28, v3.S[0] \n\t"
+ "MOV w30, v4.S[0] \n\t"
+ /* Copy 5*r^2 to ARM */
+ "MOV w15, v5.S[0] \n\t"
+ "MOV w16, v6.S[0] \n\t"
+ "MOV w17, v7.S[0] \n\t"
+ "MOV w8, v8.S[0] \n\t"
+ "MOV w19, v9.S[0] \n\t"
+ /* Load m */
+ /* Load two message blocks to NEON v10, v11, v12, v13, v14 */
+ "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+ "USHR v14.2D, v11.2D, #40 \n\t"
+ "ORR v14.16B, v14.16B, v26.16B \n\t"
+ "USHR v13.2D, v11.2D, #14 \n\t"
+ "AND v13.16B, v13.16B, v27.16B \n\t"
+ "SHL v12.2D, v11.2D, #12 \n\t"
+ "SRI v12.2D, v10.2D, #52 \n\t"
+ "AND v12.16B, v12.16B, v27.16B \n\t"
+ "USHR v11.2D, v10.2D, #26 \n\t"
+ "AND v11.16B, v11.16B, v27.16B \n\t"
+ "AND v10.16B, v10.16B, v27.16B \n\t"
+ "MOV v10.S[1], v10.S[2] \n\t"
+ "MOV v11.S[1], v11.S[2] \n\t"
+ "MOV v12.S[1], v12.S[2] \n\t"
+ "MOV v13.S[1], v13.S[2] \n\t"
+ "MOV v14.S[1], v14.S[2] \n\t"
+ /* Two message blocks loaded */
+ /* Add messages to accumulator */
+ "ADD v15.2S, v15.2S, v10.2S \n\t"
+ "ADD v16.2S, v16.2S, v11.2S \n\t"
+ "ADD v17.2S, v17.2S, v12.2S \n\t"
+ "ADD v18.2S, v18.2S, v13.2S \n\t"
+ "ADD v19.2S, v19.2S, v14.2S \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_loop_64_%=: \n\t"
+ /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
+ /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
+ /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
+ /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */
+ /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */
+ "UMULL v21.2D, v15.2S, v0.2S \n\t"
+ /* Compute h*r^2 */
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x9, x20, x25 \n\t"
+ "UMULL v22.2D, v15.2S, v1.2S \n\t"
+ "MUL x10, x20, x26 \n\t"
+ "UMULL v23.2D, v15.2S, v2.2S \n\t"
+ "MUL x11, x20, x27 \n\t"
+ "UMULL v24.2D, v15.2S, v3.2S \n\t"
+ "MUL x12, x20, x28 \n\t"
+ "UMULL v25.2D, v15.2S, v4.2S \n\t"
+ "MUL x13, x20, x30 \n\t"
+ "UMLAL v21.2D, v16.2S, v9.2S \n\t"
+ "MADD x9, x21, x19, x9 \n\t"
+ "UMLAL v22.2D, v16.2S, v0.2S \n\t"
+ "MADD x10, x21, x25, x10 \n\t"
+ "UMLAL v23.2D, v16.2S, v1.2S \n\t"
+ "MADD x11, x21, x26, x11 \n\t"
+ "UMLAL v24.2D, v16.2S, v2.2S \n\t"
+ "MADD x12, x21, x27, x12 \n\t"
+ "UMLAL v25.2D, v16.2S, v3.2S \n\t"
+ "MADD x13, x21, x28, x13 \n\t"
+ "UMLAL v21.2D, v17.2S, v8.2S \n\t"
+ "MADD x9, x22, x8, x9 \n\t"
+ "UMLAL v22.2D, v17.2S, v9.2S \n\t"
+ "MADD x10, x22, x19, x10 \n\t"
+ "UMLAL v23.2D, v17.2S, v0.2S \n\t"
+ "MADD x11, x22, x25, x11 \n\t"
+ "UMLAL v24.2D, v17.2S, v1.2S \n\t"
+ "MADD x12, x22, x26, x12 \n\t"
+ "UMLAL v25.2D, v17.2S, v2.2S \n\t"
+ "MADD x13, x22, x27, x13 \n\t"
+ "UMLAL v21.2D, v18.2S, v7.2S \n\t"
+ "MADD x9, x23, x17, x9 \n\t"
+ "UMLAL v22.2D, v18.2S, v8.2S \n\t"
+ "MADD x10, x23, x8, x10 \n\t"
+ "UMLAL v23.2D, v18.2S, v9.2S \n\t"
+ "MADD x11, x23, x19, x11 \n\t"
+ "UMLAL v24.2D, v18.2S, v0.2S \n\t"
+ "MADD x12, x23, x25, x12 \n\t"
+ "UMLAL v25.2D, v18.2S, v1.2S \n\t"
+ "MADD x13, x23, x26, x13 \n\t"
+ "UMLAL v21.2D, v19.2S, v6.2S \n\t"
+ "MADD x9, x24, x16, x9 \n\t"
+ "UMLAL v22.2D, v19.2S, v7.2S \n\t"
+ "MADD x10, x24, x17, x10 \n\t"
+ "UMLAL v23.2D, v19.2S, v8.2S \n\t"
+ "MADD x11, x24, x8, x11 \n\t"
+ "UMLAL v24.2D, v19.2S, v9.2S \n\t"
+ "MADD x12, x24, x19, x12 \n\t"
+ "UMLAL v25.2D, v19.2S, v0.2S \n\t"
+ "MADD x13, x24, x25, x13 \n\t"
+ /* Load m */
+ /* Load two message blocks to NEON v10, v11, v12, v13, v14 */
+ "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
+ /* Reduce h % P */
+ "MOV x14, #5 \n\t"
+ "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+ "ADD x10, x10, x9, LSR #26 \n\t"
+ "USHR v14.2D, v11.2D, #40 \n\t"
+ "ADD x13, x13, x12, LSR #26 \n\t"
+ "ORR v14.16B, v14.16B, v26.16B \n\t"
+ "AND x9, x9, #0x3ffffff \n\t"
+ "USHR v13.2D, v11.2D, #14 \n\t"
+ "LSR x20, x13, #26 \n\t"
+ "AND v13.16B, v13.16B, v27.16B \n\t"
+ "AND x12, x12, #0x3ffffff \n\t"
+ "SHL v12.2D, v11.2D, #12 \n\t"
+ "MADD x9, x20, x14, x9 \n\t"
+ "SRI v12.2D, v10.2D, #52 \n\t"
+ "ADD x11, x11, x10, LSR #26 \n\t"
+ "AND v12.16B, v12.16B, v27.16B \n\t"
+ "AND x10, x10, #0x3ffffff \n\t"
+ "USHR v11.2D, v10.2D, #26 \n\t"
+ "AND x13, x13, #0x3ffffff \n\t"
+ "AND v11.16B, v11.16B, v27.16B \n\t"
+ "ADD x12, x12, x11, LSR #26 \n\t"
+ "AND v10.16B, v10.16B, v27.16B \n\t"
+ "AND x22, x11, #0x3ffffff \n\t"
+ /* Two message blocks loaded */
+ "ADD v21.2D, v21.2D, v10.2D \n\t"
+ "ADD x21, x10, x9, LSR #26 \n\t"
+ "ADD v22.2D, v22.2D, v11.2D \n\t"
+ "AND x20, x9, #0x3ffffff \n\t"
+ "ADD v23.2D, v23.2D, v12.2D \n\t"
+ "ADD x24, x13, x12, LSR #26 \n\t"
+ "ADD v24.2D, v24.2D, v13.2D \n\t"
+ "AND x23, x12, #0x3ffffff \n\t"
+ "ADD v25.2D, v25.2D, v14.2D \n\t"
+ /* Reduce radix 26 NEON */
+ /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
+ /* with h3 -> h4 -> h0 -> h1 */
+ "USRA v22.2D, v21.2D, #26 \n\t"
+ "AND v21.16B, v21.16B, v27.16B \n\t"
+ "USRA v25.2D, v24.2D, #26 \n\t"
+ "AND v24.16B, v24.16B, v27.16B \n\t"
+ "USHR v15.2D, v25.2D, #26 \n\t"
+ "USRA v23.2D, v22.2D, #26 \n\t"
+ /* Simulate multiplying by 5 using adding and shifting */
+ "SHL v18.2D, v15.2D, #2 \n\t"
+ "AND v16.16B, v22.16B, v27.16B \n\t"
+ "ADD v18.2D, v18.2D, v15.2D \n\t"
+ "AND v19.16B, v25.16B, v27.16B \n\t"
+ "ADD v21.2D, v21.2D, v18.2D \n\t"
+ "USRA v24.2D, v23.2D, #26 \n\t"
+ "AND v17.16B, v23.16B, v27.16B \n\t"
+ "USRA v16.2D, v21.2D, #26 \n\t"
+ "AND v15.16B, v21.16B, v27.16B \n\t"
+ "USRA v19.2D, v24.2D, #26 \n\t"
+ "AND v18.16B, v24.16B, v27.16B \n\t"
+ /* Copy values to lower halves of result registers */
+ "MOV v15.S[1], v15.S[2] \n\t"
+ "MOV v16.S[1], v16.S[2] \n\t"
+ "MOV v17.S[1], v17.S[2] \n\t"
+ "MOV v18.S[1], v18.S[2] \n\t"
+ "MOV v19.S[1], v19.S[2] \n\t"
+ /* If at least two message blocks left then loop_64 */
+ "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
+ "BHS L_poly1305_64_loop_64_%= \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_last_mult_%=: \n\t"
+ /* Load r */
+ "LD4 { v0.S-v3.S }[1], [%[r]], #16 \n\t"
+ /* Compute h*r^2 */
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x9, x20, x25 \n\t"
+ "LD1 { v4.S }[1], [%[r]] \n\t"
+ "MUL x10, x20, x26 \n\t"
+ "SUB %[r], %[r], #16 \n\t"
+ "MUL x11, x20, x27 \n\t"
+ /* Store [r^2, r] * 5 */
+ "MUL v5.2S, v0.2S, v28.S[0] \n\t"
+ "MUL x12, x20, x28 \n\t"
+ "MUL v6.2S, v1.2S, v28.S[0] \n\t"
+ "MUL x13, x20, x30 \n\t"
+ "MUL v7.2S, v2.2S, v28.S[0] \n\t"
+ "MADD x9, x21, x19, x9 \n\t"
+ "MUL v8.2S, v3.2S, v28.S[0] \n\t"
+ "MADD x10, x21, x25, x10 \n\t"
+ "MUL v9.2S, v4.2S, v28.S[0] \n\t"
+ "MADD x11, x21, x26, x11 \n\t"
+ /* Final multiply by [r^2, r] */
+ /* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
+ /* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
+ /* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
+ /* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4 */
+ /* d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 */
+ "UMULL v21.2D, v15.2S, v0.2S \n\t"
+ "MADD x12, x21, x27, x12 \n\t"
+ "UMULL v22.2D, v15.2S, v1.2S \n\t"
+ "MADD x13, x21, x28, x13 \n\t"
+ "UMULL v23.2D, v15.2S, v2.2S \n\t"
+ "MADD x9, x22, x8, x9 \n\t"
+ "UMULL v24.2D, v15.2S, v3.2S \n\t"
+ "MADD x10, x22, x19, x10 \n\t"
+ "UMULL v25.2D, v15.2S, v4.2S \n\t"
+ "MADD x11, x22, x25, x11 \n\t"
+ "UMLAL v21.2D, v16.2S, v9.2S \n\t"
+ "MADD x12, x22, x26, x12 \n\t"
+ "UMLAL v22.2D, v16.2S, v0.2S \n\t"
+ "MADD x13, x22, x27, x13 \n\t"
+ "UMLAL v23.2D, v16.2S, v1.2S \n\t"
+ "MADD x9, x23, x17, x9 \n\t"
+ "UMLAL v24.2D, v16.2S, v2.2S \n\t"
+ "MADD x10, x23, x8, x10 \n\t"
+ "UMLAL v25.2D, v16.2S, v3.2S \n\t"
+ "MADD x11, x23, x19, x11 \n\t"
+ "UMLAL v21.2D, v17.2S, v8.2S \n\t"
+ "MADD x12, x23, x25, x12 \n\t"
+ "UMLAL v22.2D, v17.2S, v9.2S \n\t"
+ "MADD x13, x23, x26, x13 \n\t"
+ "UMLAL v23.2D, v17.2S, v0.2S \n\t"
+ "MADD x9, x24, x16, x9 \n\t"
+ "UMLAL v24.2D, v17.2S, v1.2S \n\t"
+ "MADD x10, x24, x17, x10 \n\t"
+ "UMLAL v25.2D, v17.2S, v2.2S \n\t"
+ "MADD x11, x24, x8, x11 \n\t"
+ "UMLAL v21.2D, v18.2S, v7.2S \n\t"
+ "MADD x12, x24, x19, x12 \n\t"
+ "UMLAL v22.2D, v18.2S, v8.2S \n\t"
+ "MADD x13, x24, x25, x13 \n\t"
+ "UMLAL v23.2D, v18.2S, v9.2S \n\t"
+ /* Reduce h % P */
+ "MOV x14, #5 \n\t"
+ "UMLAL v24.2D, v18.2S, v0.2S \n\t"
+ "ADD x10, x10, x9, LSR #26 \n\t"
+ "UMLAL v25.2D, v18.2S, v1.2S \n\t"
+ "ADD x13, x13, x12, LSR #26 \n\t"
+ "UMLAL v21.2D, v19.2S, v6.2S \n\t"
+ "AND x9, x9, #0x3ffffff \n\t"
+ "UMLAL v22.2D, v19.2S, v7.2S \n\t"
+ "LSR x20, x13, #26 \n\t"
+ "UMLAL v23.2D, v19.2S, v8.2S \n\t"
+ "AND x12, x12, #0x3ffffff \n\t"
+ "UMLAL v24.2D, v19.2S, v9.2S \n\t"
+ "MADD x9, x20, x14, x9 \n\t"
+ "UMLAL v25.2D, v19.2S, v0.2S \n\t"
+ "ADD x11, x11, x10, LSR #26 \n\t"
+ /* Add even and odd elements */
+ "ADDP d21, v21.2D \n\t"
+ "AND x10, x10, #0x3ffffff \n\t"
+ "ADDP d22, v22.2D \n\t"
+ "AND x13, x13, #0x3ffffff \n\t"
+ "ADDP d23, v23.2D \n\t"
+ "ADD x12, x12, x11, LSR #26 \n\t"
+ "ADDP d24, v24.2D \n\t"
+ "AND x22, x11, #0x3ffffff \n\t"
+ "ADDP d25, v25.2D \n\t"
+ "ADD x21, x10, x9, LSR #26 \n\t"
+ "AND x20, x9, #0x3ffffff \n\t"
+ "ADD x24, x13, x12, LSR #26 \n\t"
+ "AND x23, x12, #0x3ffffff \n\t"
+ /* Load h to NEON */
+ "MOV v5.D[0], x20 \n\t"
+ "MOV v6.D[0], x21 \n\t"
+ "MOV v7.D[0], x22 \n\t"
+ "MOV v8.D[0], x23 \n\t"
+ "MOV v9.D[0], x24 \n\t"
+ /* Add ctx->h to current accumulator */
+ "ADD v21.2D, v21.2D, v5.2D \n\t"
+ "ADD v22.2D, v22.2D, v6.2D \n\t"
+ "ADD v23.2D, v23.2D, v7.2D \n\t"
+ "ADD v24.2D, v24.2D, v8.2D \n\t"
+ "ADD v25.2D, v25.2D, v9.2D \n\t"
+ /* Reduce h (h % P) */
+ /* Reduce radix 26 NEON */
+ /* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
+ /* with h3 -> h4 -> h0 -> h1 */
+ "USRA v22.2D, v21.2D, #26 \n\t"
+ "AND v21.16B, v21.16B, v27.16B \n\t"
+ "USRA v25.2D, v24.2D, #26 \n\t"
+ "AND v24.16B, v24.16B, v27.16B \n\t"
+ "USHR v5.2D, v25.2D, #26 \n\t"
+ "USRA v23.2D, v22.2D, #26 \n\t"
+ /* Simulate multiplying by 5 using adding and shifting */
+ "SHL v8.2D, v5.2D, #2 \n\t"
+ "AND v6.16B, v22.16B, v27.16B \n\t"
+ "ADD v8.2D, v8.2D, v5.2D \n\t"
+ "AND v9.16B, v25.16B, v27.16B \n\t"
+ "ADD v21.2D, v21.2D, v8.2D \n\t"
+ "USRA v24.2D, v23.2D, #26 \n\t"
+ "AND v7.16B, v23.16B, v27.16B \n\t"
+ "USRA v6.2D, v21.2D, #26 \n\t"
+ "AND v5.16B, v21.16B, v27.16B \n\t"
+ "USRA v9.2D, v24.2D, #26 \n\t"
+ "AND v8.16B, v24.16B, v27.16B \n\t"
+ /* Copy values to lower halves of result registers */
+ /* Store h */
+ "ST4 { v5.S-v8.S }[0], [%[h]], #16 \n\t"
+ "ST1 { v9.S }[0], [%[h]] \n\t"
+ "SUB %[h], %[h], #16 \n\t"
+ "\n"
+ ".align 2 \n\t"
+ "L_poly1305_64_done_%=: \n\t"
+ : [bytes] "+r" (bytes),
+ [m] "+r" (m),
+ [ctx] "+m" (ctx)
+ : [POLY1305_BLOCK_SIZE] "I" (POLY1305_BLOCK_SIZE),
+ [h] "r" (ctx->h),
+ [r] "r" (ctx->r),
+ [r_2] "r" (ctx->r_2),
+ [r_4] "r" (ctx->r_4),
+ [finished] "r" ((word64)ctx->finished)
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+ "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17",
+ "w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28",
+ "w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+ "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
+ "x28", "x30"
+ );
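+    /* The NEON path above consumes four (then two) blocks per iteration
+     * and exits with fewer than two whole blocks outstanding; m and
+     * bytes are "+r" operands, so the asm has already advanced them and
+     * the scalar routine below finishes any remaining 16-byte block. */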
+ poly1305_blocks_16(ctx, m, bytes);
+}
+
+void poly1305_block(Poly1305* ctx, const unsigned char *m)
+{
+ poly1305_blocks_16(ctx, m, POLY1305_BLOCK_SIZE);
+}
+
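+/* The masks below implement clamp(r) from the Poly1305 spec (RFC 8439):
+ * clear the top four bits of r[3], r[7], r[11] and r[15], and the bottom
+ * two bits of r[4], r[8] and r[12]. Over the two little-endian 64-bit
+ * words of r this is simply (illustrative):
+ *
+ *     r64[0] &= 0x0ffffffc0fffffffULL;
+ *     r64[1] &= 0x0ffffffc0ffffffcULL;
+ */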
+#if defined(POLY130564)
+static const word64 clamp[] = {
+    0x0ffffffc0fffffff,
+    0x0ffffffc0ffffffc,
+};
+#endif /* POLY130564 */
+
+
+int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
+{
+    /* validate all arguments before any of them are used */
+    if (ctx == NULL || key == NULL || keySz != 32)
+        return BAD_FUNC_ARG;
+
+#ifdef CHACHA_AEAD_TEST
+    {
+        word32 k;
+        printf("Poly key used:\n");
+        for (k = 0; k < keySz; k++) {
+            printf("%02x", key[k]);
+            if ((k+1) % 8 == 0)
+                printf("\n");
+        }
+        printf("\n");
+    }
+#endif
+
+ __asm__ __volatile__ (
+ /* Load key material */
+ "LDP x8, x9, [%[key]] \n\t"
+ "LDP x10, x11, [%[key], #16] \n\t"
+ /* Load clamp */
+ "LDP x12, x13, [%[clamp]] \n\t"
+ /* Apply clamp */
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ "AND x8, x8, x12 \n\t"
+ "AND x9, x9, x13 \n\t"
+ "MOV x19, xzr \n\t"
+ "MOV x20, xzr \n\t"
+ "MOV x21, xzr \n\t"
+ "MOV x22, xzr \n\t"
+ "MOV x23, xzr \n\t"
+ "BFI x19, x8, #0, #26 \n\t"
+ "LSR x8, x8, #26 \n\t"
+ "BFI x20, x8, #0, #26 \n\t"
+ "LSR x8, x8, #26 \n\t"
+ "BFI x21, x8, #0, #12 \n\t"
+ "BFI x21, x9, #12, #14 \n\t"
+ "LSR x9, x9, #14 \n\t"
+ "BFI x22, x9, #0, #26 \n\t"
+ "LSR x9, x9, #26 \n\t"
+ "BFI x23, x9, #0, #24 \n\t"
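+        /* The BFI/LSR sequence above splits the clamped 128-bit r in
+         * x8:x9 into five base-2^26 limbs x19..x23
+         * (26 + 26 + (12 + 14) + 26 + 24 = 128 bits), the layout the
+         * block functions expect. */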
+ /* Compute r^2 */
+ /* r*5 */
+ "MOV x8, #5 \n\t"
+ "MUL x24, x20, x8 \n\t"
+ "MUL x25, x21, x8 \n\t"
+ "MUL x26, x22, x8 \n\t"
+ "MUL x27, x23, x8 \n\t"
+ /* d = r*r */
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x14, x19, x19 \n\t"
+ "MUL x15, x19, x20 \n\t"
+ "MUL x16, x19, x21 \n\t"
+ "MUL x17, x19, x22 \n\t"
+ "MUL x7, x19, x23 \n\t"
+ "MADD x14, x20, x27, x14 \n\t"
+ "MADD x15, x20, x19, x15 \n\t"
+ "MADD x16, x20, x20, x16 \n\t"
+ "MADD x17, x20, x21, x17 \n\t"
+ "MADD x7, x20, x22, x7 \n\t"
+ "MADD x14, x21, x26, x14 \n\t"
+ "MADD x15, x21, x27, x15 \n\t"
+ "MADD x16, x21, x19, x16 \n\t"
+ "MADD x17, x21, x20, x17 \n\t"
+ "MADD x7, x21, x21, x7 \n\t"
+ "MADD x14, x22, x25, x14 \n\t"
+ "MADD x15, x22, x26, x15 \n\t"
+ "MADD x16, x22, x27, x16 \n\t"
+ "MADD x17, x22, x19, x17 \n\t"
+ "MADD x7, x22, x20, x7 \n\t"
+ "MADD x14, x23, x24, x14 \n\t"
+ "MADD x15, x23, x25, x15 \n\t"
+ "MADD x16, x23, x26, x16 \n\t"
+ "MADD x17, x23, x27, x17 \n\t"
+ "MADD x7, x23, x19, x7 \n\t"
+ /* r_2 = r^2 % P */
+ "ADD x15, x15, x14, LSR #26 \n\t"
+ "ADD x7, x7, x17, LSR #26 \n\t"
+ "AND x14, x14, #0x3ffffff \n\t"
+ "LSR x9, x7, #26 \n\t"
+ "AND x17, x17, #0x3ffffff \n\t"
+ "MADD x14, x9, x8, x14 \n\t"
+ "ADD x16, x16, x15, LSR #26 \n\t"
+ "AND x15, x15, #0x3ffffff \n\t"
+ "AND x7, x7, #0x3ffffff \n\t"
+ "ADD x17, x17, x16, LSR #26 \n\t"
+ "AND x16, x16, #0x3ffffff \n\t"
+ "ADD x15, x15, x14, LSR #26 \n\t"
+ "AND x14, x14, #0x3ffffff \n\t"
+ "ADD x7, x7, x17, LSR #26 \n\t"
+ "AND x17, x17, #0x3ffffff \n\t"
+ /* Store r */
+ "ORR x19, x19, x20, LSL #32 \n\t"
+ "ORR x21, x21, x22, LSL #32 \n\t"
+ "STP x19, x21, [%[ctx_r]] \n\t"
+ "STR w23, [%[ctx_r], #16] \n\t"
+ "MOV x8, #5 \n\t"
+ "MUL x24, x15, x8 \n\t"
+ "MUL x25, x16, x8 \n\t"
+ "MUL x26, x17, x8 \n\t"
+ "MUL x27, x7, x8 \n\t"
+ /* Compute r^4 */
+ /* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
+ /* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
+ /* d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s4 + h4 * s3 */
+ /* d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s4 */
+ /* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
+ "MUL x19, x14, x14 \n\t"
+ "MUL x20, x14, x15 \n\t"
+ "MUL x21, x14, x16 \n\t"
+ "MUL x22, x14, x17 \n\t"
+ "MUL x23, x14, x7 \n\t"
+ "MADD x19, x15, x27, x19 \n\t"
+ "MADD x20, x15, x14, x20 \n\t"
+ "MADD x21, x15, x15, x21 \n\t"
+ "MADD x22, x15, x16, x22 \n\t"
+ "MADD x23, x15, x17, x23 \n\t"
+ "MADD x19, x16, x26, x19 \n\t"
+ "MADD x20, x16, x27, x20 \n\t"
+ "MADD x21, x16, x14, x21 \n\t"
+ "MADD x22, x16, x15, x22 \n\t"
+ "MADD x23, x16, x16, x23 \n\t"
+ "MADD x19, x17, x25, x19 \n\t"
+ "MADD x20, x17, x26, x20 \n\t"
+ "MADD x21, x17, x27, x21 \n\t"
+ "MADD x22, x17, x14, x22 \n\t"
+ "MADD x23, x17, x15, x23 \n\t"
+ "MADD x19, x7, x24, x19 \n\t"
+ "MADD x20, x7, x25, x20 \n\t"
+ "MADD x21, x7, x26, x21 \n\t"
+ "MADD x22, x7, x27, x22 \n\t"
+ "MADD x23, x7, x14, x23 \n\t"
+ /* r^4 % P */
+ "ADD x20, x20, x19, LSR #26 \n\t"
+ "ADD x23, x23, x22, LSR #26 \n\t"
+ "AND x19, x19, #0x3ffffff \n\t"
+ "LSR x9, x23, #26 \n\t"
+ "AND x22, x22, #0x3ffffff \n\t"
+ "MADD x19, x9, x8, x19 \n\t"
+ "ADD x21, x21, x20, LSR #26 \n\t"
+ "AND x20, x20, #0x3ffffff \n\t"
+ "AND x23, x23, #0x3ffffff \n\t"
+ "ADD x22, x22, x21, LSR #26 \n\t"
+ "AND x21, x21, #0x3ffffff \n\t"
+ "ADD x20, x20, x19, LSR #26 \n\t"
+ "AND x19, x19, #0x3ffffff \n\t"
+ "ADD x23, x23, x22, LSR #26 \n\t"
+ "AND x22, x22, #0x3ffffff \n\t"
+ /* Store r^2 */
+ "ORR x14, x14, x15, LSL #32 \n\t"
+ "ORR x16, x16, x17, LSL #32 \n\t"
+ "STP x14, x16, [%[ctx_r_2]] \n\t"
+ "STR w7, [%[ctx_r_2], #16] \n\t"
+ /* Store r^4 */
+ "ORR x19, x19, x20, LSL #32 \n\t"
+ "ORR x21, x21, x22, LSL #32 \n\t"
+ "STP x19, x21, [%[ctx_r_4]] \n\t"
+ "STR w23, [%[ctx_r_4], #16] \n\t"
+ /* h (accumulator) = 0 */
+ "STP xzr, xzr, [%[ctx_h_0]] \n\t"
+ "STR wzr, [%[ctx_h_0], #16] \n\t"
+ /* Save pad for later */
+ "STP x10, x11, [%[ctx_pad]] \n\t"
+ /* Zero leftover */
+ "STR xzr, [%[ctx_leftover]] \n\t"
+ /* Zero finished */
+ "STRB wzr, [%[ctx_finished]] \n\t"
+ :
+ : [clamp] "r" (clamp),
+ [key] "r" (key),
+ [ctx_r] "r" (ctx->r),
+ [ctx_r_2] "r" (ctx->r_2),
+ [ctx_r_4] "r" (ctx->r_4),
+ [ctx_h_0] "r" (ctx->h),
+ [ctx_pad] "r" (ctx->pad),
+ [ctx_leftover] "r" (&ctx->leftover),
+ [ctx_finished] "r" (&ctx->finished)
+ : "memory", "cc",
+ "w7", "w14", "w15", "w16", "w17", "w19", "w20", "w21", "w22", "w23",
+ "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+ "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+
+ return 0;
+}
+
+
+int wc_Poly1305Final(Poly1305* ctx, byte* mac)
+{
+    if (ctx == NULL || mac == NULL)
+        return BAD_FUNC_ARG;
+
+ /* process the remaining block */
+ if (ctx->leftover) {
+ size_t i = ctx->leftover;
+ ctx->buffer[i++] = 1;
+ for (; i < POLY1305_BLOCK_SIZE; i++)
+ ctx->buffer[i] = 0;
+ ctx->finished = 1;
+ poly1305_block(ctx, ctx->buffer);
+ }
+
+ __asm__ __volatile__ (
+ /* Load raw h and zero h registers */
+ "LDP x2, x3, %[h_addr] \n\t"
+ "MOV x5, xzr \n\t"
+ "LDR w4, %[h_4_addr] \n\t"
+ "MOV x6, xzr \n\t"
+ "LDP x16, x17, %[pad_addr] \n\t"
+ /* Base 26 -> Base 64 */
+ "MOV w5, w2 \n\t"
+ "LSR x2, x2, #32 \n\t"
+ "ORR x5, x5, x2, LSL #26 \n\t"
+ "ORR x5, x5, x3, LSL #52 \n\t"
+ "LSR w6, w3, #12 \n\t"
+ "LSR x3, x3, #32 \n\t"
+ "ORR x6, x6, x3, LSL #14 \n\t"
+ "ORR x6, x6, x4, LSL #40 \n\t"
+ "LSR x7, x4, #24 \n\t"
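+        /* Freeze: compute h + 5 and, if that carries past bit 130
+         * (x4 > 3), select it so that h - p = h + 5 - 2^130 is kept in
+         * the low 128 bits. Adding the pad (second half of the one-time
+         * key, x16:x17) modulo 2^128 then yields the 16-byte tag. */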
+ /* Check if h is larger than p */
+ "ADDS x2, x5, #5 \n\t"
+ "ADCS x3, x6, xzr \n\t"
+ "ADC x4, x7, xzr \n\t"
+ /* Check if h+5 is larger than 2^130 */
+ "CMP x4, #3 \n\t"
+ "CSEL x5, x2, x5, HI \n\t"
+ "CSEL x6, x3, x6, HI \n\t"
+ "ADDS x5, x5, x16 \n\t"
+ "ADC x6, x6, x17 \n\t"
+ "STP x5, x6, [%[mac]] \n\t"
+ : [mac] "+r" (mac)
+ : [pad_addr] "m" (ctx->pad),
+ [h_addr] "m" (ctx->h),
+ [h_4_addr] "m" (ctx->h[4])
+ : "memory", "cc",
+ "w2", "w3", "w4", "w5", "w6", "w7", "x2", "x3", "x4", "x5",
+ "x6", "x7", "x16", "x17"
+ );
+
+ /* zero out the state */
+ ctx->h[0] = 0;
+ ctx->h[1] = 0;
+ ctx->h[2] = 0;
+ ctx->h[3] = 0;
+ ctx->h[4] = 0;
+ ctx->r[0] = 0;
+ ctx->r[1] = 0;
+ ctx->r[2] = 0;
+ ctx->r[3] = 0;
+ ctx->r[4] = 0;
+ ctx->r_2[0] = 0;
+ ctx->r_2[1] = 0;
+ ctx->r_2[2] = 0;
+ ctx->r_2[3] = 0;
+ ctx->r_2[4] = 0;
+ ctx->r_4[0] = 0;
+ ctx->r_4[1] = 0;
+ ctx->r_4[2] = 0;
+ ctx->r_4[3] = 0;
+ ctx->r_4[4] = 0;
+ ctx->pad[0] = 0;
+ ctx->pad[1] = 0;
+ ctx->pad[2] = 0;
+ ctx->pad[3] = 0;
+
+ return 0;
+}
+
+#endif /* HAVE_POLY1305 */
+#endif /* WOLFSSL_ARMASM */
+#endif /* __aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c
new file mode 100644
index 0000000..7f214d4
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha256.c
@@ -0,0 +1,1508 @@
+/* armv8-sha256.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#if !defined(NO_SHA256) || defined(WOLFSSL_SHA224)
+
+#include <wolfssl/wolfcrypt/sha256.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+
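+/* SHA-256 round constants (FIPS 180-4 section 4.2.2): the first 32 bits
+ * of the fractional parts of the cube roots of the first 64 primes. */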
+static const ALIGN32 word32 K[64] = {
+ 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
+ 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
+ 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
+ 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
+ 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
+ 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
+ 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
+ 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
+ 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
+ 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
+ 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
+ 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
+ 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
+};
+
+
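+/* The initial digest below is H(0) from FIPS 180-4 section 5.3.3: the
+ * first 32 bits of the fractional parts of the square roots of the first
+ * eight primes. */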
+static int InitSha256(wc_Sha256* sha256)
+{
+ int ret = 0;
+
+ if (sha256 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ sha256->digest[0] = 0x6A09E667L;
+ sha256->digest[1] = 0xBB67AE85L;
+ sha256->digest[2] = 0x3C6EF372L;
+ sha256->digest[3] = 0xA54FF53AL;
+ sha256->digest[4] = 0x510E527FL;
+ sha256->digest[5] = 0x9B05688CL;
+ sha256->digest[6] = 0x1F83D9ABL;
+ sha256->digest[7] = 0x5BE0CD19L;
+
+ sha256->buffLen = 0;
+ sha256->loLen = 0;
+ sha256->hiLen = 0;
+
+ return ret;
+}
+
+static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len)
+{
+ word32 tmp = sha256->loLen;
+ if ((sha256->loLen += len) < tmp)
+ sha256->hiLen++; /* carry low to high */
+}
+
+
+#ifdef __aarch64__
+
+/* ARMv8 hardware acceleration */
+static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+{
+ word32 add;
+ word32 numBlocks;
+
+ /* only perform actions if a buffer is passed in */
+ if (len > 0) {
+ /* fill leftover buffer with data */
+ add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+ XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
+ sha256->buffLen += add;
+ data += add;
+ len -= add;
+
+ /* number of blocks in a row to complete */
+ numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
+
+ if (numBlocks > 0) {
+ word32* k = (word32*)K;
+
+ /* get leftover amount after blocks */
+ add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE;
+ __asm__ volatile (
+ "#load leftover data\n"
+ "LD1 {v0.2d-v3.2d}, %[buffer] \n"
+
+ "#load current digest\n"
+ "LD1 {v12.2d-v13.2d}, %[digest] \n"
+ "MOV w8, %w[blocks] \n"
+ "REV32 v0.16b, v0.16b \n"
+ "REV32 v1.16b, v1.16b \n"
+ "REV32 v2.16b, v2.16b \n"
+ "REV32 v3.16b, v3.16b \n"
+
+ "#load K values in \n"
+ "LD1 {v16.4s-v19.4s}, [%[k]], #64 \n"
+ "LD1 {v20.4s-v23.4s}, [%[k]], #64 \n"
+ "MOV v14.16b, v12.16b \n" /* store digest for add at the end */
+ "MOV v15.16b, v13.16b \n"
+ "LD1 {v24.4s-v27.4s}, [%[k]], #64 \n"
+ "LD1 {v28.4s-v31.4s}, [%[k]], #64 \n"
+
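+            /* Each "Round" below retires four of the 64 SHA-256 rounds:
+             * v0 is loaded with W[t..t+3] + K[t..t+3], SHA256H/SHA256H2
+             * advance the state quads q12/q13 (v11 holds the ABCD copy),
+             * and SHA256SU0/SHA256SU1 extend the message schedule for a
+             * later group of four rounds. */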
+ /* beginning of SHA256 block operation */
+ "1:\n"
+ /* Round 1 */
+ "MOV v4.16b, v0.16b \n"
+ "ADD v0.4s, v0.4s, v16.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 2 */
+ "SHA256SU0 v4.4s, v1.4s \n"
+ "ADD v0.4s, v1.4s, v17.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 3 */
+ "SHA256SU0 v1.4s, v2.4s \n"
+ "ADD v0.4s, v2.4s, v18.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v1.4s, v3.4s, v4.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 4 */
+ "SHA256SU0 v2.4s, v3.4s \n"
+ "ADD v0.4s, v3.4s, v19.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v2.4s, v4.4s, v1.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 5 */
+ "SHA256SU0 v3.4s, v4.4s \n"
+ "ADD v0.4s, v4.4s, v20.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v3.4s, v1.4s, v2.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 6 */
+ "SHA256SU0 v4.4s, v1.4s \n"
+ "ADD v0.4s, v1.4s, v21.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 7 */
+ "SHA256SU0 v1.4s, v2.4s \n"
+ "ADD v0.4s, v2.4s, v22.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v1.4s, v3.4s, v4.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 8 */
+ "SHA256SU0 v2.4s, v3.4s \n"
+ "ADD v0.4s, v3.4s, v23.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v2.4s, v4.4s, v1.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 9 */
+ "SHA256SU0 v3.4s, v4.4s \n"
+ "ADD v0.4s, v4.4s, v24.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v3.4s, v1.4s, v2.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 10 */
+ "SHA256SU0 v4.4s, v1.4s \n"
+ "ADD v0.4s, v1.4s, v25.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 11 */
+ "SHA256SU0 v1.4s, v2.4s \n"
+ "ADD v0.4s, v2.4s, v26.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v1.4s, v3.4s, v4.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 12 */
+ "SHA256SU0 v2.4s, v3.4s \n"
+ "ADD v0.4s, v3.4s, v27.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v2.4s, v4.4s, v1.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 13 */
+ "SHA256SU0 v3.4s, v4.4s \n"
+ "ADD v0.4s, v4.4s, v28.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256SU1 v3.4s, v1.4s, v2.4s \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 14 */
+ "ADD v0.4s, v1.4s, v29.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 15 */
+ "ADD v0.4s, v2.4s, v30.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ /* Round 16 */
+ "ADD v0.4s, v3.4s, v31.4s \n"
+ "MOV v11.16b, v12.16b \n"
+ "SHA256H q12, q13, v0.4s \n"
+ "SHA256H2 q13, q11, v0.4s \n"
+
+ "#Add working vars back into digest state \n"
+ "SUB w8, w8, #1 \n"
+ "ADD v12.4s, v12.4s, v14.4s \n"
+ "ADD v13.4s, v13.4s, v15.4s \n"
+
+ "#check if more blocks should be done\n"
+ "CBZ w8, 2f \n"
+
+ "#load in message and schedule updates \n"
+ "LD1 {v0.2d-v3.2d}, [%[dataIn]], #64 \n"
+ "MOV v14.16b, v12.16b \n"
+ "MOV v15.16b, v13.16b \n"
+ "REV32 v0.16b, v0.16b \n"
+ "REV32 v1.16b, v1.16b \n"
+ "REV32 v2.16b, v2.16b \n"
+ "REV32 v3.16b, v3.16b \n"
+ "B 1b \n" /* do another block */
+
+ "2:\n"
+ "STP q12, q13, %[out] \n"
+
+ : [out] "=m" (sha256->digest), "=m" (sha256->buffer), "=r" (numBlocks),
+ "=r" (data), "=r" (k)
+ : [k] "4" (k), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer),
+ [blocks] "2" (numBlocks), [dataIn] "3" (data)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+ "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25", "v26", "v27", "v28",
+ "v29", "v30", "v31", "w8"
+ );
+
+ AddLength(sha256, WC_SHA256_BLOCK_SIZE * numBlocks);
+
+ /* copy over any remaining data leftover */
+ XMEMCPY(sha256->buffer, data, add);
+ sha256->buffLen = add;
+ }
+ }
+
+    /* silence unused-variable warnings when len == 0 and the block path is skipped */
+ (void)add;
+ (void)numBlocks;
+
+ return 0;
+}
+
+
+static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
+{
+ byte* local;
+
+ local = (byte*)sha256->buffer;
+ AddLength(sha256, sha256->buffLen); /* before adding pads */
+
+ local[sha256->buffLen++] = 0x80; /* add 1 */
+
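+    /* FIPS 180-4 padding: one 0x80 byte, zeros up to WC_SHA256_PAD_SIZE
+     * (56) bytes mod 64, then the 64-bit big-endian bit count. If the
+     * 0x80 byte pushed buffLen past 56 there is no room for the length,
+     * so a full block of padding is compressed first in the branch below. */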
+ /* pad with zeros */
+ if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
+
+        XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+        sha256->buffLen = WC_SHA256_BLOCK_SIZE;
+ __asm__ volatile (
+ "LD1 {v4.2d-v7.2d}, %[buffer] \n"
+ "MOV v0.16b, v4.16b \n"
+ "MOV v1.16b, v5.16b \n"
+ "REV32 v0.16b, v0.16b \n"
+ "REV32 v1.16b, v1.16b \n"
+ "MOV v2.16b, v6.16b \n"
+ "MOV v3.16b, v7.16b \n"
+ "REV32 v2.16b, v2.16b \n"
+ "REV32 v3.16b, v3.16b \n"
+ "MOV v4.16b, v0.16b \n"
+ "MOV v5.16b, v1.16b \n"
+ "LD1 {v20.2d-v21.2d}, %[digest] \n"
+
+ "#SHA256 operation on updated message \n"
+ "MOV v16.16b, v20.16b \n"
+ "MOV v17.16b, v21.16b \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v4.4s, v1.4s \n"
+ "ADD v0.4s, v0.4s, v22.4s \n"
+ "MOV v6.16b, v2.16b \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
+ "SHA256H q16, q17, v0.4s \n"
+ "SHA256H2 q17, q18, v0.4s \n"
+
+ "SHA256SU0 v5.4s, v2.4s \n"
+ "ADD v1.4s, v1.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v7.16b, v3.16b \n"
+ "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
+ "SHA256H q16, q17, v1.4s \n"
+ "SHA256H2 q17, q18, v1.4s \n"
+
+ "SHA256SU0 v6.4s, v3.4s \n"
+ "ADD v2.4s, v2.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v8.16b, v4.16b \n"
+ "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
+ "SHA256H q16, q17, v2.4s \n"
+ "SHA256H2 q17, q18, v2.4s \n"
+
+ "SHA256SU0 v7.4s, v4.4s \n"
+ "ADD v3.4s, v3.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v9.16b, v5.16b \n"
+ "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
+ "SHA256H q16, q17, v3.4s \n"
+ "SHA256H2 q17, q18, v3.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v8.4s, v5.4s \n"
+ "ADD v4.4s, v4.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v10.16b, v6.16b \n"
+ "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
+ "SHA256H q16, q17, v4.4s \n"
+ "SHA256H2 q17, q18, v4.4s \n"
+
+ "SHA256SU0 v9.4s, v6.4s \n"
+ "ADD v5.4s, v5.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v11.16b, v7.16b \n"
+ "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
+ "SHA256H q16, q17, v5.4s \n"
+ "SHA256H2 q17, q18, v5.4s \n"
+
+ "SHA256SU0 v10.4s, v7.4s \n"
+ "ADD v6.4s, v6.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v12.16b, v8.16b \n"
+ "SHA256SU1 v10.4s, v8.4s, v9.4s \n"
+ "SHA256H q16, q17, v6.4s \n"
+ "SHA256H2 q17, q18, v6.4s \n"
+
+ "SHA256SU0 v11.4s, v8.4s \n"
+ "ADD v7.4s, v7.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v13.16b, v9.16b \n"
+ "SHA256SU1 v11.4s, v9.4s, v10.4s \n"
+ "SHA256H q16, q17, v7.4s \n"
+ "SHA256H2 q17, q18, v7.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v12.4s, v9.4s \n"
+ "ADD v8.4s, v8.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v14.16b, v10.16b \n"
+ "SHA256SU1 v12.4s, v10.4s, v11.4s \n"
+ "SHA256H q16, q17, v8.4s \n"
+ "SHA256H2 q17, q18, v8.4s \n"
+
+ "SHA256SU0 v13.4s, v10.4s \n"
+ "ADD v9.4s, v9.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v15.16b, v11.16b \n"
+ "SHA256SU1 v13.4s, v11.4s, v12.4s \n"
+ "SHA256H q16, q17, v9.4s \n"
+ "SHA256H2 q17, q18, v9.4s \n"
+
+ "SHA256SU0 v14.4s, v11.4s \n"
+ "ADD v10.4s, v10.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v14.4s, v12.4s, v13.4s \n"
+ "SHA256H q16, q17, v10.4s \n"
+ "SHA256H2 q17, q18, v10.4s \n"
+
+ "SHA256SU0 v15.4s, v12.4s \n"
+ "ADD v11.4s, v11.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v15.4s, v13.4s, v14.4s \n"
+ "SHA256H q16, q17, v11.4s \n"
+ "SHA256H2 q17, q18, v11.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]] \n"
+ "ADD v12.4s, v12.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v12.4s \n"
+ "SHA256H2 q17, q18, v12.4s \n"
+
+ "ADD v13.4s, v13.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v13.4s \n"
+ "SHA256H2 q17, q18, v13.4s \n"
+
+ "ADD v14.4s, v14.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v14.4s \n"
+ "SHA256H2 q17, q18, v14.4s \n"
+
+ "ADD v15.4s, v15.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v15.4s \n"
+ "SHA256H2 q17, q18, v15.4s \n"
+
+ "#Add working vars back into digest state \n"
+ "ADD v16.4s, v16.4s, v20.4s \n"
+ "ADD v17.4s, v17.4s, v21.4s \n"
+ "STP q16, q17, %[out] \n"
+
+ : [out] "=m" (sha256->digest)
+ : [k] "r" (K), [digest] "m" (sha256->digest),
+ [buffer] "m" (sha256->buffer)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16"
+            , "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+
+ sha256->buffLen = 0;
+ }
+ XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);
+
+ /* put lengths in bits */
+ sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
+ (sha256->hiLen << 3);
+ sha256->loLen = sha256->loLen << 3;
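+    /* (hiLen:loLen) held the running byte count; shifting the pair left
+     * by 3 converts it to a bit count, carrying the top three bits of
+     * loLen into hiLen as required for the length encoding. */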
+
+ /* store lengths */
+ #if defined(LITTLE_ENDIAN_ORDER)
+ __asm__ volatile (
+ "LD1 {v0.2d-v3.2d}, %[in] \n"
+ "REV32 v0.16b, v0.16b \n"
+ "REV32 v1.16b, v1.16b \n"
+ "REV32 v2.16b, v2.16b \n"
+ "REV32 v3.16b, v3.16b \n"
+ "ST1 {v0.2d-v3.2d}, %[out] \n"
+ : [out] "=m" (sha256->buffer)
+ : [in] "m" (sha256->buffer)
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+ #endif
+ /* ! length ordering dependent on digest endian type ! */
+ XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
+ XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
+ sizeof(word32));
+
+ __asm__ volatile (
+ "#load in message and schedule updates \n"
+ "LD1 {v4.2d-v7.2d}, %[buffer] \n"
+ "MOV v0.16b, v4.16b \n"
+ "MOV v1.16b, v5.16b \n"
+ "MOV v2.16b, v6.16b \n"
+ "MOV v3.16b, v7.16b \n"
+ "LD1 {v20.2d-v21.2d}, %[digest] \n"
+
+ "MOV v16.16b, v20.16b \n"
+ "MOV v17.16b, v21.16b \n"
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v4.4s, v1.4s \n"
+ "ADD v0.4s, v0.4s, v22.4s \n"
+ "MOV v6.16b, v2.16b \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
+ "SHA256H q16, q17, v0.4s \n"
+ "SHA256H2 q17, q18, v0.4s \n"
+
+ "SHA256SU0 v5.4s, v2.4s \n"
+ "ADD v1.4s, v1.4s, v23.4s \n"
+ "MOV v7.16b, v3.16b \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
+ "SHA256H q16, q17, v1.4s \n"
+ "SHA256H2 q17, q18, v1.4s \n"
+
+ "SHA256SU0 v6.4s, v3.4s \n"
+ "ADD v2.4s, v2.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v8.16b, v4.16b \n"
+ "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
+ "SHA256H q16, q17, v2.4s \n"
+ "SHA256H2 q17, q18, v2.4s \n"
+
+ "SHA256SU0 v7.4s, v4.4s \n"
+ "ADD v3.4s, v3.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v9.16b, v5.16b \n"
+ "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
+ "SHA256H q16, q17, v3.4s \n"
+ "SHA256H2 q17, q18, v3.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v8.4s, v5.4s \n"
+ "ADD v4.4s, v4.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v10.16b, v6.16b \n"
+ "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
+ "SHA256H q16, q17, v4.4s \n"
+ "SHA256H2 q17, q18, v4.4s \n"
+
+ "SHA256SU0 v9.4s, v6.4s \n"
+ "ADD v5.4s, v5.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v11.16b, v7.16b \n"
+ "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
+ "SHA256H q16, q17, v5.4s \n"
+ "SHA256H2 q17, q18, v5.4s \n"
+
+ "SHA256SU0 v10.4s, v7.4s \n"
+ "ADD v6.4s, v6.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v12.16b, v8.16b \n"
+ "SHA256SU1 v10.4s, v8.4s, v9.4s \n"
+ "SHA256H q16, q17, v6.4s \n"
+ "SHA256H2 q17, q18, v6.4s \n"
+
+ "SHA256SU0 v11.4s, v8.4s \n"
+ "ADD v7.4s, v7.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v13.16b, v9.16b \n"
+ "SHA256SU1 v11.4s, v9.4s, v10.4s \n"
+ "SHA256H q16, q17, v7.4s \n"
+ "SHA256H2 q17, q18, v7.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
+ "SHA256SU0 v12.4s, v9.4s \n"
+ "ADD v8.4s, v8.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v14.16b, v10.16b \n"
+ "SHA256SU1 v12.4s, v10.4s, v11.4s \n"
+ "SHA256H q16, q17, v8.4s \n"
+ "SHA256H2 q17, q18, v8.4s \n"
+
+ "SHA256SU0 v13.4s, v10.4s \n"
+ "ADD v9.4s, v9.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "MOV v15.16b, v11.16b \n"
+ "SHA256SU1 v13.4s, v11.4s, v12.4s \n"
+ "SHA256H q16, q17, v9.4s \n"
+ "SHA256H2 q17, q18, v9.4s \n"
+
+ "SHA256SU0 v14.4s, v11.4s \n"
+ "ADD v10.4s, v10.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v14.4s, v12.4s, v13.4s \n"
+ "SHA256H q16, q17, v10.4s \n"
+ "SHA256H2 q17, q18, v10.4s \n"
+
+ "SHA256SU0 v15.4s, v12.4s \n"
+ "ADD v11.4s, v11.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256SU1 v15.4s, v13.4s, v14.4s \n"
+ "SHA256H q16, q17, v11.4s \n"
+ "SHA256H2 q17, q18, v11.4s \n"
+
+ "LD1 {v22.16b-v25.16b}, [%[k]] \n"
+ "ADD v12.4s, v12.4s, v22.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v12.4s \n"
+ "SHA256H2 q17, q18, v12.4s \n"
+
+ "ADD v13.4s, v13.4s, v23.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v13.4s \n"
+ "SHA256H2 q17, q18, v13.4s \n"
+
+ "ADD v14.4s, v14.4s, v24.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v14.4s \n"
+ "SHA256H2 q17, q18, v14.4s \n"
+
+ "ADD v15.4s, v15.4s, v25.4s \n"
+ "MOV v18.16b, v16.16b \n"
+ "SHA256H q16, q17, v15.4s \n"
+ "SHA256H2 q17, q18, v15.4s \n"
+
+ "#Add working vars back into digest state \n"
+ "ADD v16.4s, v16.4s, v20.4s \n"
+ "ADD v17.4s, v17.4s, v21.4s \n"
+
+ "#Store value as hash output \n"
+ #if defined(LITTLE_ENDIAN_ORDER)
+ "REV32 v16.16b, v16.16b \n"
+ #endif
+ "ST1 {v16.16b}, [%[hashOut]], #16 \n"
+ #if defined(LITTLE_ENDIAN_ORDER)
+ "REV32 v17.16b, v17.16b \n"
+ #endif
+ "ST1 {v17.16b}, [%[hashOut]] \n"
+ : [hashOut] "=r" (hash)
+ : [k] "r" (K), [digest] "m" (sha256->digest),
+ [buffer] "m" (sha256->buffer),
+ "0" (hash)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+ "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+ "v22", "v23", "v24", "v25"
+ );
+
+ return 0;
+}
+
+#else /* not using 64 bit */
+
+/* ARMv8 hardware acceleration for AArch32 */
+static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+{
+ word32 add;
+ word32 numBlocks;
+
+ /* only perform actions if a buffer is passed in */
+ if (len > 0) {
+ /* fill leftover buffer with data */
+ add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+ XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
+ sha256->buffLen += add;
+ data += add;
+ len -= add;
+
+ /* number of blocks in a row to complete */
+ numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
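+ /* e.g. buffLen 16 plus 120 new bytes: 48 of them fill the buffer, len
+ * drops to 72, numBlocks = (72 + 64)/64 = 2, and 8 bytes carry over */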
+
+ if (numBlocks > 0) {
+ word32* bufPt = sha256->buffer;
+ word32* digPt = sha256->digest;
+ /* get leftover amount after blocks */
+ add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE;
+ __asm__ volatile (
+ "#load leftover data\n"
+ "VLDM %[buffer]!, {q0-q3} \n"
+
+ "#load current digest\n"
+ "VLDM %[digest], {q12-q13} \n"
+ "MOV r8, %[blocks] \n"
+ "VREV32.8 q0, q0 \n"
+ "VREV32.8 q1, q1 \n"
+ "VREV32.8 q2, q2 \n"
+ "VREV32.8 q3, q3 \n"
+ "VLDM %[k]! ,{q5-q8} \n"
+ "VLDM %[k]! ,{q9}\n"
+
+ "VMOV.32 q14, q12 \n" /* store digest for add at the end */
+ "VMOV.32 q15, q13 \n"
+
+ /* beginning of SHA256 block operation */
+ "1:\n"
+
+ /* Round 1 */
+ "VMOV.32 q4, q0 \n"
+ "VADD.i32 q0, q0, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 2 */
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q6 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 3 */
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q7 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 4 */
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q8 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 5 */
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q9 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 6 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 7 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 8 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 9 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 10 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 11 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 12 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 13 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 14 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "VADD.i32 q0, q1, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 15 */
+ "VLD1.32 {q10}, [%[k]]! \n"
+ "VADD.i32 q0, q2, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 16 */
+ "VLD1.32 {q10}, [%[k]] \n"
+ "SUB r8, r8, #1 \n"
+ "VADD.i32 q0, q3, q10 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ "#Add working vars back into digest state \n"
+ "VADD.i32 q12, q12, q14 \n"
+ "VADD.i32 q13, q13, q15 \n"
+
+ "#check if more blocks should be done\n"
+ "CMP r8, #0 \n"
+ "BEQ 2f \n"
+
+ "#load in message and schedule updates \n"
+ "VLD1.32 {q0}, [%[dataIn]]! \n"
+ "VLD1.32 {q1}, [%[dataIn]]! \n"
+ "VLD1.32 {q2}, [%[dataIn]]! \n"
+ "VLD1.32 {q3}, [%[dataIn]]! \n"
+
+ /* reset K pointer */
+ "SUB %[k], %[k], #160 \n"
+ "VREV32.8 q0, q0 \n"
+ "VREV32.8 q1, q1 \n"
+ "VREV32.8 q2, q2 \n"
+ "VREV32.8 q3, q3 \n"
+ "VMOV.32 q14, q12 \n"
+ "VMOV.32 q15, q13 \n"
+ "B 1b \n" /* do another block */
+
+ "2:\n"
+ "VST1.32 {q12, q13}, [%[out]] \n"
+
+ : [out] "=r" (digPt), "=r" (bufPt), "=r" (numBlocks),
+ "=r" (data)
+ : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt),
+ [blocks] "2" (numBlocks), [dataIn] "3" (data)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15", "r8"
+ );
+
+ AddLength(sha256, WC_SHA256_BLOCK_SIZE * numBlocks);
+
+ /* copy over any remaining data leftover */
+ XMEMCPY(sha256->buffer, data, add);
+ sha256->buffLen = add;
+ }
+ }
+
+ /* avoid unused variable warnings when len == 0 */
+ (void)add;
+ (void)numBlocks;
+
+ return 0;
+}
+
+
+static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash)
+{
+ byte* local;
+
+ if (sha256 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ local = (byte*)sha256->buffer;
+ AddLength(sha256, sha256->buffLen); /* before adding pads */
+
+ local[sha256->buffLen++] = 0x80; /* append 0x80: the mandatory 1 pad bit */
+
+ /* pad with zeros */
+ if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
+ word32* bufPt = sha256->buffer;
+ word32* digPt = sha256->digest;
+ XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+ sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
+ __asm__ volatile (
+ "#load leftover data\n"
+ "VLDM %[buffer]!, {q0-q3} \n"
+
+ "#load current digest\n"
+ "VLDM %[digest], {q12-q13} \n"
+ "VREV32.8 q0, q0 \n"
+ "VREV32.8 q1, q1 \n"
+ "VREV32.8 q2, q2 \n"
+ "VREV32.8 q3, q3 \n"
+
+ "#load K values in \n"
+ "VMOV.32 q14, q12 \n" /* store digest for add at the end */
+ "VMOV.32 q15, q13 \n"
+
+ /* beginning of SHA256 block operation */
+ /* Round 1 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VMOV.32 q4, q0 \n"
+ "VADD.i32 q0, q0, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 2 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 3 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 4 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 5 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 6 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 7 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 8 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 9 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 10 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 11 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 12 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 13 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 14 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 15 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 16 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ "#Add working vars back into digest state \n"
+ "VADD.i32 q12, q12, q14 \n"
+ "VADD.i32 q13, q13, q15 \n"
+
+ /* reset K pointer */
+ "SUB %[k], %[k], #256 \n"
+ "VST1.32 {q12, q13}, [%[out]] \n"
+
+ : [out] "=r" (digPt), "=r" (bufPt)
+ : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15"
+ );
+
+ sha256->buffLen = 0;
+ }
+ XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);
+
+ /* put lengths in bits */
+ sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
+ (sha256->hiLen << 3);
+ sha256->loLen = sha256->loLen << 3;
+
+ /* store lengths */
+ #if defined(LITTLE_ENDIAN_ORDER)
+ {
+ word32* bufPt = sha256->buffer;
+ __asm__ volatile (
+ "VLD1.32 {q0}, [%[in]] \n"
+ "VREV32.8 q0, q0 \n"
+ "VST1.32 {q0}, [%[out]]!\n"
+ "VLD1.32 {q1}, [%[in]] \n"
+ "VREV32.8 q1, q1 \n"
+ "VST1.32 {q1}, [%[out]]!\n"
+ "VLD1.32 {q2}, [%[in]] \n"
+ "VREV32.8 q2, q2 \n"
+ "VST1.32 {q2}, [%[out]]!\n"
+ "VLD1.32 {q3}, [%[in]] \n"
+ "VREV32.8 q3, q3 \n"
+ "VST1.32 {q3}, [%[out]] \n"
+ : [out] "=r" (bufPt)
+ : [in] "0" (bufPt)
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+ }
+ #endif
+ /* ! length ordering dependent on digest endian type ! */
+ XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
+ XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
+ sizeof(word32));
+
+ word32* bufPt = sha256->buffer;
+ word32* digPt = sha256->digest;
+ __asm__ volatile (
+ "#load leftover data\n"
+ "VLDM %[buffer]!, {q0-q3} \n"
+
+ "#load current digest\n"
+ "VLDM %[digest], {q12-q13} \n"
+
+ "VMOV.32 q14, q12 \n" /* store digest for add at the end */
+ "VMOV.32 q15, q13 \n"
+
+ /* beginning of SHA256 block operation */
+ /* Round 1 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VMOV.32 q4, q0 \n"
+ "VADD.i32 q0, q0, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 2 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 3 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 4 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 5 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 6 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 7 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 8 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 9 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 10 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q4, q1 \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q4, q2, q3 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 11 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q1, q2 \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q1, q3, q4 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 12 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q2, q3 \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q2, q4, q1 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 13 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "SHA256SU0.32 q3, q4 \n"
+ "VADD.i32 q0, q4, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256SU1.32 q3, q1, q2 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 14 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q1, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 15 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q2, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ /* Round 16 */
+ "VLD1.32 {q5}, [%[k]]! \n"
+ "VADD.i32 q0, q3, q5 \n"
+ "VMOV.32 q11, q12 \n"
+ "SHA256H.32 q12, q13, q0 \n"
+ "SHA256H2.32 q13, q11, q0 \n"
+
+ "#Add working vars back into digest state \n"
+ "VADD.i32 q12, q12, q14 \n"
+ "VADD.i32 q13, q13, q15 \n"
+
+ "#Store value as hash output \n"
+ #if defined(LITTLE_ENDIAN_ORDER)
+ "VREV32.8 q12, q12 \n"
+ #endif
+ "VST1.32 {q12}, [%[hashOut]]! \n"
+ #if defined(LITTLE_ENDIAN_ORDER)
+ "VREV32.8 q13, q13 \n"
+ #endif
+ "VST1.32 {q13}, [%[hashOut]] \n"
+
+ : [out] "=r" (digPt), "=r" (bufPt),
+ [hashOut] "=r" (hash)
+ : [k] "r" (K), [digest] "0" (digPt), [buffer] "1" (bufPt),
+ "2" (hash)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14",
+ "q15"
+ );
+
+ return 0;
+}
+
+#endif /* __aarch64__ */
+
+
+#ifndef NO_SHA256
+
+int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
+{
+ if (sha256 == NULL)
+ return BAD_FUNC_ARG;
+
+ sha256->heap = heap;
+ (void)devId;
+
+ return InitSha256(sha256);
+}
+
+int wc_InitSha256(wc_Sha256* sha256)
+{
+ return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
+}
+
+void wc_Sha256Free(wc_Sha256* sha256)
+{
+ (void)sha256;
+}
+
+int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+{
+ if (sha256 == NULL || (data == NULL && len != 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ return Sha256Update(sha256, data, len);
+}
+
+int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
+{
+ int ret;
+
+ if (sha256 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ret = Sha256Final(sha256, hash);
+ if (ret != 0)
+ return ret;
+
+ return InitSha256(sha256); /* reset state */
+}
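+
+/* Illustrative one-shot use of the streaming API above; a sketch only,
+ * with placeholder inputs data/dataLen (not part of the build):
+ *
+ * wc_Sha256 sha;
+ * byte digest[WC_SHA256_DIGEST_SIZE];
+ * if (wc_InitSha256(&sha) == 0 &&
+ * wc_Sha256Update(&sha, data, dataLen) == 0 &&
+ * wc_Sha256Final(&sha, digest) == 0) {
+ * digest now holds the 32-byte hash
+ * }
+ */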
+
+int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
+{
+ int ret;
+ wc_Sha256 tmpSha256;
+
+ if (sha256 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
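+ /* finalize a temporary copy so the caller's streaming state is intact */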
+ ret = wc_Sha256Copy(sha256, &tmpSha256);
+ if (ret == 0) {
+ ret = wc_Sha256Final(&tmpSha256, hash);
+ }
+ return ret;
+}
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+int wc_Sha256SetFlags(wc_Sha256* sha256, word32 flags)
+{
+ if (sha256) {
+ sha256->flags = flags;
+ }
+ return 0;
+}
+int wc_Sha256GetFlags(wc_Sha256* sha256, word32* flags)
+{
+ if (sha256 && flags) {
+ *flags = sha256->flags;
+ }
+ return 0;
+}
+#endif
+
+int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
+{
+ int ret = 0;
+
+ if (src == NULL || dst == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(dst, src, sizeof(wc_Sha256));
+
+ return ret;
+}
+
+#endif /* !NO_SHA256 */
+
+
+#ifdef WOLFSSL_SHA224
+ static int InitSha224(wc_Sha224* sha224)
+ {
+ int ret = 0;
+
+ if (sha224 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ sha224->digest[0] = 0xc1059ed8;
+ sha224->digest[1] = 0x367cd507;
+ sha224->digest[2] = 0x3070dd17;
+ sha224->digest[3] = 0xf70e5939;
+ sha224->digest[4] = 0xffc00b31;
+ sha224->digest[5] = 0x68581511;
+ sha224->digest[6] = 0x64f98fa7;
+ sha224->digest[7] = 0xbefa4fa4;
+
+ sha224->buffLen = 0;
+ sha224->loLen = 0;
+ sha224->hiLen = 0;
+
+ return ret;
+ }
+
+ int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
+ {
+ if (sha224 == NULL)
+ return BAD_FUNC_ARG;
+
+ sha224->heap = heap;
+ (void)devId;
+
+ return InitSha224(sha224);
+ }
+
+ int wc_InitSha224(wc_Sha224* sha224)
+ {
+ return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
+ }
+
+ int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
+ {
+ int ret;
+
+ if (sha224 == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ ret = Sha256Update((wc_Sha256 *)sha224, data, len);
+
+ return ret;
+ }
+
+ int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
+ {
+ int ret;
+ word32 hashTmp[WC_SHA256_DIGEST_SIZE/sizeof(word32)];
+
+ if (sha224 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
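+ /* the shared SHA-256 transform emits 32 bytes; the SHA-224 digest is
+ * the first 28 of them */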
+ ret = Sha256Final((wc_Sha256*)sha224, (byte*)hashTmp);
+ if (ret != 0)
+ return ret;
+
+ XMEMCPY(hash, hashTmp, WC_SHA224_DIGEST_SIZE);
+
+ return InitSha224(sha224); /* reset state */
+ }
+
+ void wc_Sha224Free(wc_Sha224* sha224)
+ {
+ if (sha224 == NULL)
+ return;
+ }
+
+ int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
+ {
+ int ret;
+ wc_Sha224 tmpSha224;
+
+ if (sha224 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
+ ret = wc_Sha224Copy(sha224, &tmpSha224);
+ if (ret == 0) {
+ ret = wc_Sha224Final(&tmpSha224, hash);
+ }
+ return ret;
+ }
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+ int wc_Sha224SetFlags(wc_Sha224* sha224, word32 flags)
+ {
+ if (sha224) {
+ sha224->flags = flags;
+ }
+ return 0;
+ }
+ int wc_Sha224GetFlags(wc_Sha224* sha224, word32* flags)
+ {
+ if (sha224 && flags) {
+ *flags = sha224->flags;
+ }
+ return 0;
+ }
+#endif
+
+ int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
+ {
+ int ret = 0;
+
+ if (src == NULL || dst == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(dst, src, sizeof(wc_Sha224));
+
+ return ret;
+ }
+
+#endif /* WOLFSSL_SHA224 */
+
+#endif /* !NO_SHA256 || WOLFSSL_SHA224 */
+#endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S
new file mode 100644
index 0000000..a35bccb
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S
@@ -0,0 +1,1046 @@
+/* armv8-sha512-asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S
+ */
+#ifdef __aarch64__
+ .text
+ .section .rodata
+ .type L_SHA512_transform_neon_len_k, %object
+ .size L_SHA512_transform_neon_len_k, 640
+ .align 3
+L_SHA512_transform_neon_len_k:
+ .xword 0x428a2f98d728ae22
+ .xword 0x7137449123ef65cd
+ .xword 0xb5c0fbcfec4d3b2f
+ .xword 0xe9b5dba58189dbbc
+ .xword 0x3956c25bf348b538
+ .xword 0x59f111f1b605d019
+ .xword 0x923f82a4af194f9b
+ .xword 0xab1c5ed5da6d8118
+ .xword 0xd807aa98a3030242
+ .xword 0x12835b0145706fbe
+ .xword 0x243185be4ee4b28c
+ .xword 0x550c7dc3d5ffb4e2
+ .xword 0x72be5d74f27b896f
+ .xword 0x80deb1fe3b1696b1
+ .xword 0x9bdc06a725c71235
+ .xword 0xc19bf174cf692694
+ .xword 0xe49b69c19ef14ad2
+ .xword 0xefbe4786384f25e3
+ .xword 0xfc19dc68b8cd5b5
+ .xword 0x240ca1cc77ac9c65
+ .xword 0x2de92c6f592b0275
+ .xword 0x4a7484aa6ea6e483
+ .xword 0x5cb0a9dcbd41fbd4
+ .xword 0x76f988da831153b5
+ .xword 0x983e5152ee66dfab
+ .xword 0xa831c66d2db43210
+ .xword 0xb00327c898fb213f
+ .xword 0xbf597fc7beef0ee4
+ .xword 0xc6e00bf33da88fc2
+ .xword 0xd5a79147930aa725
+ .xword 0x6ca6351e003826f
+ .xword 0x142929670a0e6e70
+ .xword 0x27b70a8546d22ffc
+ .xword 0x2e1b21385c26c926
+ .xword 0x4d2c6dfc5ac42aed
+ .xword 0x53380d139d95b3df
+ .xword 0x650a73548baf63de
+ .xword 0x766a0abb3c77b2a8
+ .xword 0x81c2c92e47edaee6
+ .xword 0x92722c851482353b
+ .xword 0xa2bfe8a14cf10364
+ .xword 0xa81a664bbc423001
+ .xword 0xc24b8b70d0f89791
+ .xword 0xc76c51a30654be30
+ .xword 0xd192e819d6ef5218
+ .xword 0xd69906245565a910
+ .xword 0xf40e35855771202a
+ .xword 0x106aa07032bbd1b8
+ .xword 0x19a4c116b8d2d0c8
+ .xword 0x1e376c085141ab53
+ .xword 0x2748774cdf8eeb99
+ .xword 0x34b0bcb5e19b48a8
+ .xword 0x391c0cb3c5c95a63
+ .xword 0x4ed8aa4ae3418acb
+ .xword 0x5b9cca4f7763e373
+ .xword 0x682e6ff3d6b2b8a3
+ .xword 0x748f82ee5defb2fc
+ .xword 0x78a5636f43172f60
+ .xword 0x84c87814a1f0ab72
+ .xword 0x8cc702081a6439ec
+ .xword 0x90befffa23631e28
+ .xword 0xa4506cebde82bde9
+ .xword 0xbef9a3f7b2c67915
+ .xword 0xc67178f2e372532b
+ .xword 0xca273eceea26619c
+ .xword 0xd186b8c721c0c207
+ .xword 0xeada7dd6cde0eb1e
+ .xword 0xf57d4f7fee6ed178
+ .xword 0x6f067aa72176fba
+ .xword 0xa637dc5a2c898a6
+ .xword 0x113f9804bef90dae
+ .xword 0x1b710b35131c471b
+ .xword 0x28db77f523047d84
+ .xword 0x32caab7b40c72493
+ .xword 0x3c9ebe0a15c9bebc
+ .xword 0x431d67c49c100d4c
+ .xword 0x4cc5d4becb3e42b6
+ .xword 0x597f299cfc657e2a
+ .xword 0x5fcb6fab3ad6faec
+ .xword 0x6c44198c4a475817
+ .text
+ .section .rodata
+ .type L_SHA512_transform_neon_len_ror8, %object
+ .size L_SHA512_transform_neon_len_ror8, 16
+ .align 4
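+ # Byte-index table for TBL: rotates each 64-bit lane right by 8 bits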
+L_SHA512_transform_neon_len_ror8:
+ .xword 0x7060504030201, 0x80f0e0d0c0b0a09
+ .text
+ .align 2
+ .globl Transform_Sha512_Len
+ .type Transform_Sha512_Len, %function
+Transform_Sha512_Len:
+ stp x29, x30, [sp, #-128]!
+ add x29, sp, #0
+ str x17, [x29, #16]
+ str x19, [x29, #24]
+ stp x20, x21, [x29, #32]
+ stp x22, x23, [x29, #48]
+ stp x24, x25, [x29, #64]
+ stp x26, x27, [x29, #80]
+ stp d8, d9, [x29, #96]
+ stp d10, d11, [x29, #112]
+ adr x3, L_SHA512_transform_neon_len_k
+ adr x27, L_SHA512_transform_neon_len_ror8
+ ld1 {v11.16b}, [x27]
+ # Load digest into working vars
+ ldp x4, x5, [x0]
+ ldp x6, x7, [x0, #16]
+ ldp x8, x9, [x0, #32]
+ ldp x10, x11, [x0, #48]
+ # Start of loop processing a block
+L_sha512_len_neon_begin:
+ # Load W
+ # Copy digest to add in at end
+ ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40
+ mov x19, x4
+ ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40
+ mov x20, x5
+ rev64 v0.16b, v0.16b
+ mov x21, x6
+ rev64 v1.16b, v1.16b
+ mov x22, x7
+ rev64 v2.16b, v2.16b
+ mov x23, x8
+ rev64 v3.16b, v3.16b
+ mov x24, x9
+ rev64 v4.16b, v4.16b
+ mov x25, x10
+ rev64 v5.16b, v5.16b
+ mov x26, x11
+ rev64 v6.16b, v6.16b
+ rev64 v7.16b, v7.16b
+ # Pre-calc: b ^ c
+ eor x16, x5, x6
+ mov x27, #4
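+ # x27: 4 iterations of 16 rounds, plus the final 16 rounds below (80 total)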
+ # Start of 16 rounds
+L_sha512_len_neon_start:
+ # Round 0
+ mov x13, v0.d[0]
+ ldr x15, [x3], #8
+ ror x12, x8, #14
+ ror x14, x4, #28
+ eor x12, x12, x8, ror 18
+ eor x14, x14, x4, ror 34
+ eor x12, x12, x8, ror 41
+ eor x14, x14, x4, ror 39
+ add x11, x11, x12
+ eor x17, x4, x5
+ eor x12, x9, x10
+ and x16, x17, x16
+ and x12, x12, x8
+ add x11, x11, x13
+ eor x12, x12, x10
+ add x11, x11, x15
+ eor x16, x16, x5
+ add x11, x11, x12
+ add x14, x14, x16
+ add x7, x7, x11
+ add x11, x11, x14
+ # Round 1
+ mov x13, v0.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v0.16b, v1.16b, #8
+ ror x12, x7, #14
+ shl v8.2d, v7.2d, #45
+ ror x14, x11, #28
+ sri v8.2d, v7.2d, #19
+ eor x12, x12, x7, ror 18
+ shl v9.2d, v7.2d, #3
+ eor x14, x14, x11, ror 34
+ sri v9.2d, v7.2d, #61
+ eor x12, x12, x7, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x11, ror 39
+ ushr v8.2d, v7.2d, #6
+ add x10, x10, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x11, x4
+ add v0.2d, v0.2d, v9.2d
+ eor x12, x8, x9
+ ext v9.16b, v4.16b, v5.16b, #8
+ and x17, x16, x17
+ add v0.2d, v0.2d, v9.2d
+ and x12, x12, x7
+ shl v8.2d, v10.2d, #63
+ add x10, x10, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x9
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x10, x10, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x4
+ ushr v10.2d, v10.2d, #7
+ add x10, x10, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v0.2d, v0.2d, v9.2d
+ add x6, x6, x10
+ add x10, x10, x14
+ # Round 2
+ mov x13, v1.d[0]
+ ldr x15, [x3], #8
+ ror x12, x6, #14
+ ror x14, x10, #28
+ eor x12, x12, x6, ror 18
+ eor x14, x14, x10, ror 34
+ eor x12, x12, x6, ror 41
+ eor x14, x14, x10, ror 39
+ add x9, x9, x12
+ eor x17, x10, x11
+ eor x12, x7, x8
+ and x16, x17, x16
+ and x12, x12, x6
+ add x9, x9, x13
+ eor x12, x12, x8
+ add x9, x9, x15
+ eor x16, x16, x11
+ add x9, x9, x12
+ add x14, x14, x16
+ add x5, x5, x9
+ add x9, x9, x14
+ # Round 3
+ mov x13, v1.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v1.16b, v2.16b, #8
+ ror x12, x5, #14
+ shl v8.2d, v0.2d, #45
+ ror x14, x9, #28
+ sri v8.2d, v0.2d, #19
+ eor x12, x12, x5, ror 18
+ shl v9.2d, v0.2d, #3
+ eor x14, x14, x9, ror 34
+ sri v9.2d, v0.2d, #61
+ eor x12, x12, x5, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x9, ror 39
+ ushr v8.2d, v0.2d, #6
+ add x8, x8, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x9, x10
+ add v1.2d, v1.2d, v9.2d
+ eor x12, x6, x7
+ ext v9.16b, v5.16b, v6.16b, #8
+ and x17, x16, x17
+ add v1.2d, v1.2d, v9.2d
+ and x12, x12, x5
+ shl v8.2d, v10.2d, #63
+ add x8, x8, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x7
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x8, x8, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x10
+ ushr v10.2d, v10.2d, #7
+ add x8, x8, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v1.2d, v1.2d, v9.2d
+ add x4, x4, x8
+ add x8, x8, x14
+ # Round 4
+ mov x13, v2.d[0]
+ ldr x15, [x3], #8
+ ror x12, x4, #14
+ ror x14, x8, #28
+ eor x12, x12, x4, ror 18
+ eor x14, x14, x8, ror 34
+ eor x12, x12, x4, ror 41
+ eor x14, x14, x8, ror 39
+ add x7, x7, x12
+ eor x17, x8, x9
+ eor x12, x5, x6
+ and x16, x17, x16
+ and x12, x12, x4
+ add x7, x7, x13
+ eor x12, x12, x6
+ add x7, x7, x15
+ eor x16, x16, x9
+ add x7, x7, x12
+ add x14, x14, x16
+ add x11, x11, x7
+ add x7, x7, x14
+ # Round 5
+ mov x13, v2.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v2.16b, v3.16b, #8
+ ror x12, x11, #14
+ shl v8.2d, v1.2d, #45
+ ror x14, x7, #28
+ sri v8.2d, v1.2d, #19
+ eor x12, x12, x11, ror 18
+ shl v9.2d, v1.2d, #3
+ eor x14, x14, x7, ror 34
+ sri v9.2d, v1.2d, #61
+ eor x12, x12, x11, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x7, ror 39
+ ushr v8.2d, v1.2d, #6
+ add x6, x6, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x7, x8
+ add v2.2d, v2.2d, v9.2d
+ eor x12, x4, x5
+ ext v9.16b, v6.16b, v7.16b, #8
+ and x17, x16, x17
+ add v2.2d, v2.2d, v9.2d
+ and x12, x12, x11
+ shl v8.2d, v10.2d, #63
+ add x6, x6, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x5
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x6, x6, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x8
+ ushr v10.2d, v10.2d, #7
+ add x6, x6, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v2.2d, v2.2d, v9.2d
+ add x10, x10, x6
+ add x6, x6, x14
+ # Round 6
+ mov x13, v3.d[0]
+ ldr x15, [x3], #8
+ ror x12, x10, #14
+ ror x14, x6, #28
+ eor x12, x12, x10, ror 18
+ eor x14, x14, x6, ror 34
+ eor x12, x12, x10, ror 41
+ eor x14, x14, x6, ror 39
+ add x5, x5, x12
+ eor x17, x6, x7
+ eor x12, x11, x4
+ and x16, x17, x16
+ and x12, x12, x10
+ add x5, x5, x13
+ eor x12, x12, x4
+ add x5, x5, x15
+ eor x16, x16, x7
+ add x5, x5, x12
+ add x14, x14, x16
+ add x9, x9, x5
+ add x5, x5, x14
+ # Round 7
+ mov x13, v3.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v3.16b, v4.16b, #8
+ ror x12, x9, #14
+ shl v8.2d, v2.2d, #45
+ ror x14, x5, #28
+ sri v8.2d, v2.2d, #19
+ eor x12, x12, x9, ror 18
+ shl v9.2d, v2.2d, #3
+ eor x14, x14, x5, ror 34
+ sri v9.2d, v2.2d, #61
+ eor x12, x12, x9, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x5, ror 39
+ ushr v8.2d, v2.2d, #6
+ add x4, x4, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x5, x6
+ add v3.2d, v3.2d, v9.2d
+ eor x12, x10, x11
+ ext v9.16b, v7.16b, v0.16b, #8
+ and x17, x16, x17
+ add v3.2d, v3.2d, v9.2d
+ and x12, x12, x9
+ shl v8.2d, v10.2d, #63
+ add x4, x4, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x11
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x4, x4, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x6
+ ushr v10.2d, v10.2d, #7
+ add x4, x4, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v3.2d, v3.2d, v9.2d
+ add x8, x8, x4
+ add x4, x4, x14
+ # Round 8
+ mov x13, v4.d[0]
+ ldr x15, [x3], #8
+ ror x12, x8, #14
+ ror x14, x4, #28
+ eor x12, x12, x8, ror 18
+ eor x14, x14, x4, ror 34
+ eor x12, x12, x8, ror 41
+ eor x14, x14, x4, ror 39
+ add x11, x11, x12
+ eor x17, x4, x5
+ eor x12, x9, x10
+ and x16, x17, x16
+ and x12, x12, x8
+ add x11, x11, x13
+ eor x12, x12, x10
+ add x11, x11, x15
+ eor x16, x16, x5
+ add x11, x11, x12
+ add x14, x14, x16
+ add x7, x7, x11
+ add x11, x11, x14
+ # Round 9
+ mov x13, v4.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v4.16b, v5.16b, #8
+ ror x12, x7, #14
+ shl v8.2d, v3.2d, #45
+ ror x14, x11, #28
+ sri v8.2d, v3.2d, #19
+ eor x12, x12, x7, ror 18
+ shl v9.2d, v3.2d, #3
+ eor x14, x14, x11, ror 34
+ sri v9.2d, v3.2d, #61
+ eor x12, x12, x7, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x11, ror 39
+ ushr v8.2d, v3.2d, #6
+ add x10, x10, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x11, x4
+ add v4.2d, v4.2d, v9.2d
+ eor x12, x8, x9
+ ext v9.16b, v0.16b, v1.16b, #8
+ and x17, x16, x17
+ add v4.2d, v4.2d, v9.2d
+ and x12, x12, x7
+ shl v8.2d, v10.2d, #63
+ add x10, x10, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x9
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x10, x10, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x4
+ ushr v10.2d, v10.2d, #7
+ add x10, x10, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v4.2d, v4.2d, v9.2d
+ add x6, x6, x10
+ add x10, x10, x14
+ # Round 10
+ mov x13, v5.d[0]
+ ldr x15, [x3], #8
+ ror x12, x6, #14
+ ror x14, x10, #28
+ eor x12, x12, x6, ror 18
+ eor x14, x14, x10, ror 34
+ eor x12, x12, x6, ror 41
+ eor x14, x14, x10, ror 39
+ add x9, x9, x12
+ eor x17, x10, x11
+ eor x12, x7, x8
+ and x16, x17, x16
+ and x12, x12, x6
+ add x9, x9, x13
+ eor x12, x12, x8
+ add x9, x9, x15
+ eor x16, x16, x11
+ add x9, x9, x12
+ add x14, x14, x16
+ add x5, x5, x9
+ add x9, x9, x14
+ # Round 11
+ mov x13, v5.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v5.16b, v6.16b, #8
+ ror x12, x5, #14
+ shl v8.2d, v4.2d, #45
+ ror x14, x9, #28
+ sri v8.2d, v4.2d, #19
+ eor x12, x12, x5, ror 18
+ shl v9.2d, v4.2d, #3
+ eor x14, x14, x9, ror 34
+ sri v9.2d, v4.2d, #61
+ eor x12, x12, x5, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x9, ror 39
+ ushr v8.2d, v4.2d, #6
+ add x8, x8, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x9, x10
+ add v5.2d, v5.2d, v9.2d
+ eor x12, x6, x7
+ ext v9.16b, v1.16b, v2.16b, #8
+ and x17, x16, x17
+ add v5.2d, v5.2d, v9.2d
+ and x12, x12, x5
+ shl v8.2d, v10.2d, #63
+ add x8, x8, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x7
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x8, x8, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x10
+ ushr v10.2d, v10.2d, #7
+ add x8, x8, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v5.2d, v5.2d, v9.2d
+ add x4, x4, x8
+ add x8, x8, x14
+ # Round 12
+ mov x13, v6.d[0]
+ ldr x15, [x3], #8
+ ror x12, x4, #14
+ ror x14, x8, #28
+ eor x12, x12, x4, ror 18
+ eor x14, x14, x8, ror 34
+ eor x12, x12, x4, ror 41
+ eor x14, x14, x8, ror 39
+ add x7, x7, x12
+ eor x17, x8, x9
+ eor x12, x5, x6
+ and x16, x17, x16
+ and x12, x12, x4
+ add x7, x7, x13
+ eor x12, x12, x6
+ add x7, x7, x15
+ eor x16, x16, x9
+ add x7, x7, x12
+ add x14, x14, x16
+ add x11, x11, x7
+ add x7, x7, x14
+ # Round 13
+ mov x13, v6.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v6.16b, v7.16b, #8
+ ror x12, x11, #14
+ shl v8.2d, v5.2d, #45
+ ror x14, x7, #28
+ sri v8.2d, v5.2d, #19
+ eor x12, x12, x11, ror 18
+ shl v9.2d, v5.2d, #3
+ eor x14, x14, x7, ror 34
+ sri v9.2d, v5.2d, #61
+ eor x12, x12, x11, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x7, ror 39
+ ushr v8.2d, v5.2d, #6
+ add x6, x6, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x7, x8
+ add v6.2d, v6.2d, v9.2d
+ eor x12, x4, x5
+ ext v9.16b, v2.16b, v3.16b, #8
+ and x17, x16, x17
+ add v6.2d, v6.2d, v9.2d
+ and x12, x12, x11
+ shl v8.2d, v10.2d, #63
+ add x6, x6, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x5
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x6, x6, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x8
+ ushr v10.2d, v10.2d, #7
+ add x6, x6, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v6.2d, v6.2d, v9.2d
+ add x10, x10, x6
+ add x6, x6, x14
+ # Round 14
+ mov x13, v7.d[0]
+ ldr x15, [x3], #8
+ ror x12, x10, #14
+ ror x14, x6, #28
+ eor x12, x12, x10, ror 18
+ eor x14, x14, x6, ror 34
+ eor x12, x12, x10, ror 41
+ eor x14, x14, x6, ror 39
+ add x5, x5, x12
+ eor x17, x6, x7
+ eor x12, x11, x4
+ and x16, x17, x16
+ and x12, x12, x10
+ add x5, x5, x13
+ eor x12, x12, x4
+ add x5, x5, x15
+ eor x16, x16, x7
+ add x5, x5, x12
+ add x14, x14, x16
+ add x9, x9, x5
+ add x5, x5, x14
+ # Round 15
+ mov x13, v7.d[1]
+ ldr x15, [x3], #8
+ ext v10.16b, v7.16b, v0.16b, #8
+ ror x12, x9, #14
+ shl v8.2d, v6.2d, #45
+ ror x14, x5, #28
+ sri v8.2d, v6.2d, #19
+ eor x12, x12, x9, ror 18
+ shl v9.2d, v6.2d, #3
+ eor x14, x14, x5, ror 34
+ sri v9.2d, v6.2d, #61
+ eor x12, x12, x9, ror 41
+ eor v9.16b, v9.16b, v8.16b
+ eor x14, x14, x5, ror 39
+ ushr v8.2d, v6.2d, #6
+ add x4, x4, x12
+ eor v9.16b, v9.16b, v8.16b
+ eor x16, x5, x6
+ add v7.2d, v7.2d, v9.2d
+ eor x12, x10, x11
+ ext v9.16b, v3.16b, v4.16b, #8
+ and x17, x16, x17
+ add v7.2d, v7.2d, v9.2d
+ and x12, x12, x9
+ shl v8.2d, v10.2d, #63
+ add x4, x4, x13
+ sri v8.2d, v10.2d, #1
+ eor x12, x12, x11
+ tbl v9.16b, {v10.16b}, v11.16b
+ add x4, x4, x15
+ eor v9.16b, v9.16b, v8.16b
+ eor x17, x17, x6
+ ushr v10.2d, v10.2d, #7
+ add x4, x4, x12
+ eor v9.16b, v9.16b, v10.16b
+ add x14, x14, x17
+ add v7.2d, v7.2d, v9.2d
+ add x8, x8, x4
+ add x4, x4, x14
+ subs x27, x27, #1
+ bne L_sha512_len_neon_start
+ # Round 0
+ mov x13, v0.d[0]
+ ldr x15, [x3], #8
+ ror x12, x8, #14
+ ror x14, x4, #28
+ eor x12, x12, x8, ror 18
+ eor x14, x14, x4, ror 34
+ eor x12, x12, x8, ror 41
+ eor x14, x14, x4, ror 39
+ add x11, x11, x12
+ eor x17, x4, x5
+ eor x12, x9, x10
+ and x16, x17, x16
+ and x12, x12, x8
+ add x11, x11, x13
+ eor x12, x12, x10
+ add x11, x11, x15
+ eor x16, x16, x5
+ add x11, x11, x12
+ add x14, x14, x16
+ add x7, x7, x11
+ add x11, x11, x14
+ # Round 1
+ mov x13, v0.d[1]
+ ldr x15, [x3], #8
+ ror x12, x7, #14
+ ror x14, x11, #28
+ eor x12, x12, x7, ror 18
+ eor x14, x14, x11, ror 34
+ eor x12, x12, x7, ror 41
+ eor x14, x14, x11, ror 39
+ add x10, x10, x12
+ eor x16, x11, x4
+ eor x12, x8, x9
+ and x17, x16, x17
+ and x12, x12, x7
+ add x10, x10, x13
+ eor x12, x12, x9
+ add x10, x10, x15
+ eor x17, x17, x4
+ add x10, x10, x12
+ add x14, x14, x17
+ add x6, x6, x10
+ add x10, x10, x14
+ # Round 2
+ mov x13, v1.d[0]
+ ldr x15, [x3], #8
+ ror x12, x6, #14
+ ror x14, x10, #28
+ eor x12, x12, x6, ror 18
+ eor x14, x14, x10, ror 34
+ eor x12, x12, x6, ror 41
+ eor x14, x14, x10, ror 39
+ add x9, x9, x12
+ eor x17, x10, x11
+ eor x12, x7, x8
+ and x16, x17, x16
+ and x12, x12, x6
+ add x9, x9, x13
+ eor x12, x12, x8
+ add x9, x9, x15
+ eor x16, x16, x11
+ add x9, x9, x12
+ add x14, x14, x16
+ add x5, x5, x9
+ add x9, x9, x14
+ # Round 3
+ mov x13, v1.d[1]
+ ldr x15, [x3], #8
+ ror x12, x5, #14
+ ror x14, x9, #28
+ eor x12, x12, x5, ror 18
+ eor x14, x14, x9, ror 34
+ eor x12, x12, x5, ror 41
+ eor x14, x14, x9, ror 39
+ add x8, x8, x12
+ eor x16, x9, x10
+ eor x12, x6, x7
+ and x17, x16, x17
+ and x12, x12, x5
+ add x8, x8, x13
+ eor x12, x12, x7
+ add x8, x8, x15
+ eor x17, x17, x10
+ add x8, x8, x12
+ add x14, x14, x17
+ add x4, x4, x8
+ add x8, x8, x14
+ # Round 4
+ mov x13, v2.d[0]
+ ldr x15, [x3], #8
+ ror x12, x4, #14
+ ror x14, x8, #28
+ eor x12, x12, x4, ror 18
+ eor x14, x14, x8, ror 34
+ eor x12, x12, x4, ror 41
+ eor x14, x14, x8, ror 39
+ add x7, x7, x12
+ eor x17, x8, x9
+ eor x12, x5, x6
+ and x16, x17, x16
+ and x12, x12, x4
+ add x7, x7, x13
+ eor x12, x12, x6
+ add x7, x7, x15
+ eor x16, x16, x9
+ add x7, x7, x12
+ add x14, x14, x16
+ add x11, x11, x7
+ add x7, x7, x14
+ # Round 5
+ mov x13, v2.d[1]
+ ldr x15, [x3], #8
+ ror x12, x11, #14
+ ror x14, x7, #28
+ eor x12, x12, x11, ror 18
+ eor x14, x14, x7, ror 34
+ eor x12, x12, x11, ror 41
+ eor x14, x14, x7, ror 39
+ add x6, x6, x12
+ eor x16, x7, x8
+ eor x12, x4, x5
+ and x17, x16, x17
+ and x12, x12, x11
+ add x6, x6, x13
+ eor x12, x12, x5
+ add x6, x6, x15
+ eor x17, x17, x8
+ add x6, x6, x12
+ add x14, x14, x17
+ add x10, x10, x6
+ add x6, x6, x14
+ # Round 6
+ mov x13, v3.d[0]
+ ldr x15, [x3], #8
+ ror x12, x10, #14
+ ror x14, x6, #28
+ eor x12, x12, x10, ror 18
+ eor x14, x14, x6, ror 34
+ eor x12, x12, x10, ror 41
+ eor x14, x14, x6, ror 39
+ add x5, x5, x12
+ eor x17, x6, x7
+ eor x12, x11, x4
+ and x16, x17, x16
+ and x12, x12, x10
+ add x5, x5, x13
+ eor x12, x12, x4
+ add x5, x5, x15
+ eor x16, x16, x7
+ add x5, x5, x12
+ add x14, x14, x16
+ add x9, x9, x5
+ add x5, x5, x14
+ # Round 7
+ mov x13, v3.d[1]
+ ldr x15, [x3], #8
+ ror x12, x9, #14
+ ror x14, x5, #28
+ eor x12, x12, x9, ror 18
+ eor x14, x14, x5, ror 34
+ eor x12, x12, x9, ror 41
+ eor x14, x14, x5, ror 39
+ add x4, x4, x12
+ eor x16, x5, x6
+ eor x12, x10, x11
+ and x17, x16, x17
+ and x12, x12, x9
+ add x4, x4, x13
+ eor x12, x12, x11
+ add x4, x4, x15
+ eor x17, x17, x6
+ add x4, x4, x12
+ add x14, x14, x17
+ add x8, x8, x4
+ add x4, x4, x14
+ # Round 8
+ mov x13, v4.d[0]
+ ldr x15, [x3], #8
+ ror x12, x8, #14
+ ror x14, x4, #28
+ eor x12, x12, x8, ror 18
+ eor x14, x14, x4, ror 34
+ eor x12, x12, x8, ror 41
+ eor x14, x14, x4, ror 39
+ add x11, x11, x12
+ eor x17, x4, x5
+ eor x12, x9, x10
+ and x16, x17, x16
+ and x12, x12, x8
+ add x11, x11, x13
+ eor x12, x12, x10
+ add x11, x11, x15
+ eor x16, x16, x5
+ add x11, x11, x12
+ add x14, x14, x16
+ add x7, x7, x11
+ add x11, x11, x14
+ # Round 9
+ mov x13, v4.d[1]
+ ldr x15, [x3], #8
+ ror x12, x7, #14
+ ror x14, x11, #28
+ eor x12, x12, x7, ror 18
+ eor x14, x14, x11, ror 34
+ eor x12, x12, x7, ror 41
+ eor x14, x14, x11, ror 39
+ add x10, x10, x12
+ eor x16, x11, x4
+ eor x12, x8, x9
+ and x17, x16, x17
+ and x12, x12, x7
+ add x10, x10, x13
+ eor x12, x12, x9
+ add x10, x10, x15
+ eor x17, x17, x4
+ add x10, x10, x12
+ add x14, x14, x17
+ add x6, x6, x10
+ add x10, x10, x14
+ # Round 10
+ mov x13, v5.d[0]
+ ldr x15, [x3], #8
+ ror x12, x6, #14
+ ror x14, x10, #28
+ eor x12, x12, x6, ror 18
+ eor x14, x14, x10, ror 34
+ eor x12, x12, x6, ror 41
+ eor x14, x14, x10, ror 39
+ add x9, x9, x12
+ eor x17, x10, x11
+ eor x12, x7, x8
+ and x16, x17, x16
+ and x12, x12, x6
+ add x9, x9, x13
+ eor x12, x12, x8
+ add x9, x9, x15
+ eor x16, x16, x11
+ add x9, x9, x12
+ add x14, x14, x16
+ add x5, x5, x9
+ add x9, x9, x14
+ # Round 11
+ mov x13, v5.d[1]
+ ldr x15, [x3], #8
+ ror x12, x5, #14
+ ror x14, x9, #28
+ eor x12, x12, x5, ror 18
+ eor x14, x14, x9, ror 34
+ eor x12, x12, x5, ror 41
+ eor x14, x14, x9, ror 39
+ add x8, x8, x12
+ eor x16, x9, x10
+ eor x12, x6, x7
+ and x17, x16, x17
+ and x12, x12, x5
+ add x8, x8, x13
+ eor x12, x12, x7
+ add x8, x8, x15
+ eor x17, x17, x10
+ add x8, x8, x12
+ add x14, x14, x17
+ add x4, x4, x8
+ add x8, x8, x14
+ # Round 12
+ mov x13, v6.d[0]
+ ldr x15, [x3], #8
+ ror x12, x4, #14
+ ror x14, x8, #28
+ eor x12, x12, x4, ror 18
+ eor x14, x14, x8, ror 34
+ eor x12, x12, x4, ror 41
+ eor x14, x14, x8, ror 39
+ add x7, x7, x12
+ eor x17, x8, x9
+ eor x12, x5, x6
+ and x16, x17, x16
+ and x12, x12, x4
+ add x7, x7, x13
+ eor x12, x12, x6
+ add x7, x7, x15
+ eor x16, x16, x9
+ add x7, x7, x12
+ add x14, x14, x16
+ add x11, x11, x7
+ add x7, x7, x14
+ # Round 13
+ mov x13, v6.d[1]
+ ldr x15, [x3], #8
+ ror x12, x11, #14
+ ror x14, x7, #28
+ eor x12, x12, x11, ror 18
+ eor x14, x14, x7, ror 34
+ eor x12, x12, x11, ror 41
+ eor x14, x14, x7, ror 39
+ add x6, x6, x12
+ eor x16, x7, x8
+ eor x12, x4, x5
+ and x17, x16, x17
+ and x12, x12, x11
+ add x6, x6, x13
+ eor x12, x12, x5
+ add x6, x6, x15
+ eor x17, x17, x8
+ add x6, x6, x12
+ add x14, x14, x17
+ add x10, x10, x6
+ add x6, x6, x14
+ # Round 14
+ mov x13, v7.d[0]
+ ldr x15, [x3], #8
+ ror x12, x10, #14
+ ror x14, x6, #28
+ eor x12, x12, x10, ror 18
+ eor x14, x14, x6, ror 34
+ eor x12, x12, x10, ror 41
+ eor x14, x14, x6, ror 39
+ add x5, x5, x12
+ eor x17, x6, x7
+ eor x12, x11, x4
+ and x16, x17, x16
+ and x12, x12, x10
+ add x5, x5, x13
+ eor x12, x12, x4
+ add x5, x5, x15
+ eor x16, x16, x7
+ add x5, x5, x12
+ add x14, x14, x16
+ add x9, x9, x5
+ add x5, x5, x14
+ # Round 15
+ mov x13, v7.d[1]
+ ldr x15, [x3], #8
+ ror x12, x9, #14
+ ror x14, x5, #28
+ eor x12, x12, x9, ror 18
+ eor x14, x14, x5, ror 34
+ eor x12, x12, x9, ror 41
+ eor x14, x14, x5, ror 39
+ add x4, x4, x12
+ eor x16, x5, x6
+ eor x12, x10, x11
+ and x17, x16, x17
+ and x12, x12, x9
+ add x4, x4, x13
+ eor x12, x12, x11
+ add x4, x4, x15
+ eor x17, x17, x6
+ add x4, x4, x12
+ add x14, x14, x17
+ add x8, x8, x4
+ add x4, x4, x14
+ add x11, x11, x26
+ add x10, x10, x25
+ add x9, x9, x24
+ add x8, x8, x23
+ add x7, x7, x22
+ add x6, x6, x21
+ add x5, x5, x20
+ add x4, x4, x19
+ adr x3, L_SHA512_transform_neon_len_k
+ subs w2, w2, #0x80
+ bne L_sha512_len_neon_begin
+ stp x4, x5, [x0]
+ stp x6, x7, [x0, #16]
+ stp x8, x9, [x0, #32]
+ stp x10, x11, [x0, #48]
+ ldr x17, [x29, #16]
+ ldr x19, [x29, #24]
+ ldp x20, x21, [x29, #32]
+ ldp x22, x23, [x29, #48]
+ ldp x24, x25, [x29, #64]
+ ldp x26, x27, [x29, #80]
+ ldp d8, d9, [x29, #96]
+ ldp d10, d11, [x29, #112]
+ ldp x29, x30, [sp], #0x80
+ ret
+ .size Transform_Sha512_Len,.-Transform_Sha512_Len
+#endif /* __aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c
new file mode 100644
index 0000000..d323598
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c
@@ -0,0 +1,1041 @@
+/* armv8-sha512-asm
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c
+ */
+#ifdef __aarch64__
+#include <stdint.h>
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#include <wolfssl/wolfcrypt/sha512.h>
+
+static const uint64_t L_SHA512_transform_neon_len_k[] = {
+ 0x428a2f98d728ae22UL,
+ 0x7137449123ef65cdUL,
+ 0xb5c0fbcfec4d3b2fUL,
+ 0xe9b5dba58189dbbcUL,
+ 0x3956c25bf348b538UL,
+ 0x59f111f1b605d019UL,
+ 0x923f82a4af194f9bUL,
+ 0xab1c5ed5da6d8118UL,
+ 0xd807aa98a3030242UL,
+ 0x12835b0145706fbeUL,
+ 0x243185be4ee4b28cUL,
+ 0x550c7dc3d5ffb4e2UL,
+ 0x72be5d74f27b896fUL,
+ 0x80deb1fe3b1696b1UL,
+ 0x9bdc06a725c71235UL,
+ 0xc19bf174cf692694UL,
+ 0xe49b69c19ef14ad2UL,
+ 0xefbe4786384f25e3UL,
+ 0xfc19dc68b8cd5b5UL,
+ 0x240ca1cc77ac9c65UL,
+ 0x2de92c6f592b0275UL,
+ 0x4a7484aa6ea6e483UL,
+ 0x5cb0a9dcbd41fbd4UL,
+ 0x76f988da831153b5UL,
+ 0x983e5152ee66dfabUL,
+ 0xa831c66d2db43210UL,
+ 0xb00327c898fb213fUL,
+ 0xbf597fc7beef0ee4UL,
+ 0xc6e00bf33da88fc2UL,
+ 0xd5a79147930aa725UL,
+ 0x6ca6351e003826fUL,
+ 0x142929670a0e6e70UL,
+ 0x27b70a8546d22ffcUL,
+ 0x2e1b21385c26c926UL,
+ 0x4d2c6dfc5ac42aedUL,
+ 0x53380d139d95b3dfUL,
+ 0x650a73548baf63deUL,
+ 0x766a0abb3c77b2a8UL,
+ 0x81c2c92e47edaee6UL,
+ 0x92722c851482353bUL,
+ 0xa2bfe8a14cf10364UL,
+ 0xa81a664bbc423001UL,
+ 0xc24b8b70d0f89791UL,
+ 0xc76c51a30654be30UL,
+ 0xd192e819d6ef5218UL,
+ 0xd69906245565a910UL,
+ 0xf40e35855771202aUL,
+ 0x106aa07032bbd1b8UL,
+ 0x19a4c116b8d2d0c8UL,
+ 0x1e376c085141ab53UL,
+ 0x2748774cdf8eeb99UL,
+ 0x34b0bcb5e19b48a8UL,
+ 0x391c0cb3c5c95a63UL,
+ 0x4ed8aa4ae3418acbUL,
+ 0x5b9cca4f7763e373UL,
+ 0x682e6ff3d6b2b8a3UL,
+ 0x748f82ee5defb2fcUL,
+ 0x78a5636f43172f60UL,
+ 0x84c87814a1f0ab72UL,
+ 0x8cc702081a6439ecUL,
+ 0x90befffa23631e28UL,
+ 0xa4506cebde82bde9UL,
+ 0xbef9a3f7b2c67915UL,
+ 0xc67178f2e372532bUL,
+ 0xca273eceea26619cUL,
+ 0xd186b8c721c0c207UL,
+ 0xeada7dd6cde0eb1eUL,
+ 0xf57d4f7fee6ed178UL,
+ 0x6f067aa72176fbaUL,
+ 0xa637dc5a2c898a6UL,
+ 0x113f9804bef90daeUL,
+ 0x1b710b35131c471bUL,
+ 0x28db77f523047d84UL,
+ 0x32caab7b40c72493UL,
+ 0x3c9ebe0a15c9bebcUL,
+ 0x431d67c49c100d4cUL,
+ 0x4cc5d4becb3e42b6UL,
+ 0x597f299cfc657e2aUL,
+ 0x5fcb6fab3ad6faecUL,
+ 0x6c44198c4a475817UL,
+};
+
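+/* Byte-index pattern for TBL: rotates each 64-bit lane right by 8 bits */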
+static const uint64_t L_SHA512_transform_neon_len_ror8[] = {
+ 0x7060504030201UL,
+ 0x80f0e0d0c0b0a09UL,
+};
+
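+/* Hash len bytes from data into the sha512 digest state; len is expected
+ * to be a multiple of the 128-byte SHA-512 block size. */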
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ __asm__ __volatile__ (
+ "stp x29, x30, [sp, #-16]!\n\t"
+ "add x29, sp, #0\n\t"
+ "adr x3, %[L_SHA512_transform_neon_len_k]\n\t"
+ "adr x27, %[L_SHA512_transform_neon_len_ror8]\n\t"
+ "ld1 {v11.16b}, [x27]\n\t"
+ /* Load digest into working vars */
+ "ldp x4, x5, [%x[sha512]]\n\t"
+ "ldp x6, x7, [%x[sha512], #16]\n\t"
+ "ldp x8, x9, [%x[sha512], #32]\n\t"
+ "ldp x10, x11, [%x[sha512], #48]\n\t"
+ /* Start of loop processing a block */
+ "\n"
+ "L_sha512_len_neon_begin_%=: \n\t"
+ /* Load W */
+ /* Copy digest to add in at end */
+ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t"
+ "mov x19, x4\n\t"
+ "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t"
+ "mov x20, x5\n\t"
+ "rev64 v0.16b, v0.16b\n\t"
+ "mov x21, x6\n\t"
+ "rev64 v1.16b, v1.16b\n\t"
+ "mov x22, x7\n\t"
+ "rev64 v2.16b, v2.16b\n\t"
+ "mov x23, x8\n\t"
+ "rev64 v3.16b, v3.16b\n\t"
+ "mov x24, x9\n\t"
+ "rev64 v4.16b, v4.16b\n\t"
+ "mov x25, x10\n\t"
+ "rev64 v5.16b, v5.16b\n\t"
+ "mov x26, x11\n\t"
+ "rev64 v6.16b, v6.16b\n\t"
+ "rev64 v7.16b, v7.16b\n\t"
+ /* Pre-calc: b ^ c */
+ "eor x16, x5, x6\n\t"
+ "mov x27, #4\n\t"
+ /* Start of 16 rounds */
+ "\n"
+ "L_sha512_len_neon_start_%=: \n\t"
+ /* Round 0 */
+ "mov x13, v0.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x8, #14\n\t"
+ "ror x14, x4, #28\n\t"
+ "eor x12, x12, x8, ror 18\n\t"
+ "eor x14, x14, x4, ror 34\n\t"
+ "eor x12, x12, x8, ror 41\n\t"
+ "eor x14, x14, x4, ror 39\n\t"
+ "add x11, x11, x12\n\t"
+ "eor x17, x4, x5\n\t"
+ "eor x12, x9, x10\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x8\n\t"
+ "add x11, x11, x13\n\t"
+ "eor x12, x12, x10\n\t"
+ "add x11, x11, x15\n\t"
+ "eor x16, x16, x5\n\t"
+ "add x11, x11, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x7, x7, x11\n\t"
+ "add x11, x11, x14\n\t"
+ /* Round 1 */
+ "mov x13, v0.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v0.16b, v1.16b, #8\n\t"
+ "ror x12, x7, #14\n\t"
+ "shl v8.2d, v7.2d, #45\n\t"
+ "ror x14, x11, #28\n\t"
+ "sri v8.2d, v7.2d, #19\n\t"
+ "eor x12, x12, x7, ror 18\n\t"
+ "shl v9.2d, v7.2d, #3\n\t"
+ "eor x14, x14, x11, ror 34\n\t"
+ "sri v9.2d, v7.2d, #61\n\t"
+ "eor x12, x12, x7, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x11, ror 39\n\t"
+ "ushr v8.2d, v7.2d, #6\n\t"
+ "add x10, x10, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x11, x4\n\t"
+ "add v0.2d, v0.2d, v9.2d\n\t"
+ "eor x12, x8, x9\n\t"
+ "ext v9.16b, v4.16b, v5.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v0.2d, v0.2d, v9.2d\n\t"
+ "and x12, x12, x7\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x10, x10, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x9\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x10, x10, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x4\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x10, x10, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v0.2d, v0.2d, v9.2d\n\t"
+ "add x6, x6, x10\n\t"
+ "add x10, x10, x14\n\t"
+ /* Round 2 */
+ "mov x13, v1.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x6, #14\n\t"
+ "ror x14, x10, #28\n\t"
+ "eor x12, x12, x6, ror 18\n\t"
+ "eor x14, x14, x10, ror 34\n\t"
+ "eor x12, x12, x6, ror 41\n\t"
+ "eor x14, x14, x10, ror 39\n\t"
+ "add x9, x9, x12\n\t"
+ "eor x17, x10, x11\n\t"
+ "eor x12, x7, x8\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x6\n\t"
+ "add x9, x9, x13\n\t"
+ "eor x12, x12, x8\n\t"
+ "add x9, x9, x15\n\t"
+ "eor x16, x16, x11\n\t"
+ "add x9, x9, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x5, x5, x9\n\t"
+ "add x9, x9, x14\n\t"
+ /* Round 3 */
+ "mov x13, v1.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v1.16b, v2.16b, #8\n\t"
+ "ror x12, x5, #14\n\t"
+ "shl v8.2d, v0.2d, #45\n\t"
+ "ror x14, x9, #28\n\t"
+ "sri v8.2d, v0.2d, #19\n\t"
+ "eor x12, x12, x5, ror 18\n\t"
+ "shl v9.2d, v0.2d, #3\n\t"
+ "eor x14, x14, x9, ror 34\n\t"
+ "sri v9.2d, v0.2d, #61\n\t"
+ "eor x12, x12, x5, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x9, ror 39\n\t"
+ "ushr v8.2d, v0.2d, #6\n\t"
+ "add x8, x8, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x9, x10\n\t"
+ "add v1.2d, v1.2d, v9.2d\n\t"
+ "eor x12, x6, x7\n\t"
+ "ext v9.16b, v5.16b, v6.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v1.2d, v1.2d, v9.2d\n\t"
+ "and x12, x12, x5\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x8, x8, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x7\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x8, x8, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x10\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x8, x8, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v1.2d, v1.2d, v9.2d\n\t"
+ "add x4, x4, x8\n\t"
+ "add x8, x8, x14\n\t"
+ /* Round 4 */
+ "mov x13, v2.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x4, #14\n\t"
+ "ror x14, x8, #28\n\t"
+ "eor x12, x12, x4, ror 18\n\t"
+ "eor x14, x14, x8, ror 34\n\t"
+ "eor x12, x12, x4, ror 41\n\t"
+ "eor x14, x14, x8, ror 39\n\t"
+ "add x7, x7, x12\n\t"
+ "eor x17, x8, x9\n\t"
+ "eor x12, x5, x6\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x4\n\t"
+ "add x7, x7, x13\n\t"
+ "eor x12, x12, x6\n\t"
+ "add x7, x7, x15\n\t"
+ "eor x16, x16, x9\n\t"
+ "add x7, x7, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x11, x11, x7\n\t"
+ "add x7, x7, x14\n\t"
+ /* Round 5 */
+ "mov x13, v2.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v2.16b, v3.16b, #8\n\t"
+ "ror x12, x11, #14\n\t"
+ "shl v8.2d, v1.2d, #45\n\t"
+ "ror x14, x7, #28\n\t"
+ "sri v8.2d, v1.2d, #19\n\t"
+ "eor x12, x12, x11, ror 18\n\t"
+ "shl v9.2d, v1.2d, #3\n\t"
+ "eor x14, x14, x7, ror 34\n\t"
+ "sri v9.2d, v1.2d, #61\n\t"
+ "eor x12, x12, x11, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x7, ror 39\n\t"
+ "ushr v8.2d, v1.2d, #6\n\t"
+ "add x6, x6, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x7, x8\n\t"
+ "add v2.2d, v2.2d, v9.2d\n\t"
+ "eor x12, x4, x5\n\t"
+ "ext v9.16b, v6.16b, v7.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v2.2d, v2.2d, v9.2d\n\t"
+ "and x12, x12, x11\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x6, x6, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x5\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x6, x6, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x8\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x6, x6, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v2.2d, v2.2d, v9.2d\n\t"
+ "add x10, x10, x6\n\t"
+ "add x6, x6, x14\n\t"
+ /* Round 6 */
+ "mov x13, v3.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x10, #14\n\t"
+ "ror x14, x6, #28\n\t"
+ "eor x12, x12, x10, ror 18\n\t"
+ "eor x14, x14, x6, ror 34\n\t"
+ "eor x12, x12, x10, ror 41\n\t"
+ "eor x14, x14, x6, ror 39\n\t"
+ "add x5, x5, x12\n\t"
+ "eor x17, x6, x7\n\t"
+ "eor x12, x11, x4\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x10\n\t"
+ "add x5, x5, x13\n\t"
+ "eor x12, x12, x4\n\t"
+ "add x5, x5, x15\n\t"
+ "eor x16, x16, x7\n\t"
+ "add x5, x5, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x9, x9, x5\n\t"
+ "add x5, x5, x14\n\t"
+ /* Round 7 */
+ "mov x13, v3.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v3.16b, v4.16b, #8\n\t"
+ "ror x12, x9, #14\n\t"
+ "shl v8.2d, v2.2d, #45\n\t"
+ "ror x14, x5, #28\n\t"
+ "sri v8.2d, v2.2d, #19\n\t"
+ "eor x12, x12, x9, ror 18\n\t"
+ "shl v9.2d, v2.2d, #3\n\t"
+ "eor x14, x14, x5, ror 34\n\t"
+ "sri v9.2d, v2.2d, #61\n\t"
+ "eor x12, x12, x9, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x5, ror 39\n\t"
+ "ushr v8.2d, v2.2d, #6\n\t"
+ "add x4, x4, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x5, x6\n\t"
+ "add v3.2d, v3.2d, v9.2d\n\t"
+ "eor x12, x10, x11\n\t"
+ "ext v9.16b, v7.16b, v0.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v3.2d, v3.2d, v9.2d\n\t"
+ "and x12, x12, x9\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x4, x4, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x11\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x4, x4, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x6\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x4, x4, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v3.2d, v3.2d, v9.2d\n\t"
+ "add x8, x8, x4\n\t"
+ "add x4, x4, x14\n\t"
+ /* Round 8 */
+ "mov x13, v4.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x8, #14\n\t"
+ "ror x14, x4, #28\n\t"
+ "eor x12, x12, x8, ror 18\n\t"
+ "eor x14, x14, x4, ror 34\n\t"
+ "eor x12, x12, x8, ror 41\n\t"
+ "eor x14, x14, x4, ror 39\n\t"
+ "add x11, x11, x12\n\t"
+ "eor x17, x4, x5\n\t"
+ "eor x12, x9, x10\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x8\n\t"
+ "add x11, x11, x13\n\t"
+ "eor x12, x12, x10\n\t"
+ "add x11, x11, x15\n\t"
+ "eor x16, x16, x5\n\t"
+ "add x11, x11, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x7, x7, x11\n\t"
+ "add x11, x11, x14\n\t"
+ /* Round 9 */
+ "mov x13, v4.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v4.16b, v5.16b, #8\n\t"
+ "ror x12, x7, #14\n\t"
+ "shl v8.2d, v3.2d, #45\n\t"
+ "ror x14, x11, #28\n\t"
+ "sri v8.2d, v3.2d, #19\n\t"
+ "eor x12, x12, x7, ror 18\n\t"
+ "shl v9.2d, v3.2d, #3\n\t"
+ "eor x14, x14, x11, ror 34\n\t"
+ "sri v9.2d, v3.2d, #61\n\t"
+ "eor x12, x12, x7, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x11, ror 39\n\t"
+ "ushr v8.2d, v3.2d, #6\n\t"
+ "add x10, x10, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x11, x4\n\t"
+ "add v4.2d, v4.2d, v9.2d\n\t"
+ "eor x12, x8, x9\n\t"
+ "ext v9.16b, v0.16b, v1.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v4.2d, v4.2d, v9.2d\n\t"
+ "and x12, x12, x7\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x10, x10, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x9\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x10, x10, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x4\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x10, x10, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v4.2d, v4.2d, v9.2d\n\t"
+ "add x6, x6, x10\n\t"
+ "add x10, x10, x14\n\t"
+ /* Round 10 */
+ "mov x13, v5.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x6, #14\n\t"
+ "ror x14, x10, #28\n\t"
+ "eor x12, x12, x6, ror 18\n\t"
+ "eor x14, x14, x10, ror 34\n\t"
+ "eor x12, x12, x6, ror 41\n\t"
+ "eor x14, x14, x10, ror 39\n\t"
+ "add x9, x9, x12\n\t"
+ "eor x17, x10, x11\n\t"
+ "eor x12, x7, x8\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x6\n\t"
+ "add x9, x9, x13\n\t"
+ "eor x12, x12, x8\n\t"
+ "add x9, x9, x15\n\t"
+ "eor x16, x16, x11\n\t"
+ "add x9, x9, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x5, x5, x9\n\t"
+ "add x9, x9, x14\n\t"
+ /* Round 11 */
+ "mov x13, v5.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v5.16b, v6.16b, #8\n\t"
+ "ror x12, x5, #14\n\t"
+ "shl v8.2d, v4.2d, #45\n\t"
+ "ror x14, x9, #28\n\t"
+ "sri v8.2d, v4.2d, #19\n\t"
+ "eor x12, x12, x5, ror 18\n\t"
+ "shl v9.2d, v4.2d, #3\n\t"
+ "eor x14, x14, x9, ror 34\n\t"
+ "sri v9.2d, v4.2d, #61\n\t"
+ "eor x12, x12, x5, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x9, ror 39\n\t"
+ "ushr v8.2d, v4.2d, #6\n\t"
+ "add x8, x8, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x9, x10\n\t"
+ "add v5.2d, v5.2d, v9.2d\n\t"
+ "eor x12, x6, x7\n\t"
+ "ext v9.16b, v1.16b, v2.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v5.2d, v5.2d, v9.2d\n\t"
+ "and x12, x12, x5\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x8, x8, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x7\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x8, x8, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x10\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x8, x8, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v5.2d, v5.2d, v9.2d\n\t"
+ "add x4, x4, x8\n\t"
+ "add x8, x8, x14\n\t"
+ /* Round 12 */
+ "mov x13, v6.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x4, #14\n\t"
+ "ror x14, x8, #28\n\t"
+ "eor x12, x12, x4, ror 18\n\t"
+ "eor x14, x14, x8, ror 34\n\t"
+ "eor x12, x12, x4, ror 41\n\t"
+ "eor x14, x14, x8, ror 39\n\t"
+ "add x7, x7, x12\n\t"
+ "eor x17, x8, x9\n\t"
+ "eor x12, x5, x6\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x4\n\t"
+ "add x7, x7, x13\n\t"
+ "eor x12, x12, x6\n\t"
+ "add x7, x7, x15\n\t"
+ "eor x16, x16, x9\n\t"
+ "add x7, x7, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x11, x11, x7\n\t"
+ "add x7, x7, x14\n\t"
+ /* Round 13 */
+ "mov x13, v6.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v6.16b, v7.16b, #8\n\t"
+ "ror x12, x11, #14\n\t"
+ "shl v8.2d, v5.2d, #45\n\t"
+ "ror x14, x7, #28\n\t"
+ "sri v8.2d, v5.2d, #19\n\t"
+ "eor x12, x12, x11, ror 18\n\t"
+ "shl v9.2d, v5.2d, #3\n\t"
+ "eor x14, x14, x7, ror 34\n\t"
+ "sri v9.2d, v5.2d, #61\n\t"
+ "eor x12, x12, x11, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x7, ror 39\n\t"
+ "ushr v8.2d, v5.2d, #6\n\t"
+ "add x6, x6, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x7, x8\n\t"
+ "add v6.2d, v6.2d, v9.2d\n\t"
+ "eor x12, x4, x5\n\t"
+ "ext v9.16b, v2.16b, v3.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v6.2d, v6.2d, v9.2d\n\t"
+ "and x12, x12, x11\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x6, x6, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x5\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x6, x6, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x8\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x6, x6, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v6.2d, v6.2d, v9.2d\n\t"
+ "add x10, x10, x6\n\t"
+ "add x6, x6, x14\n\t"
+ /* Round 14 */
+ "mov x13, v7.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x10, #14\n\t"
+ "ror x14, x6, #28\n\t"
+ "eor x12, x12, x10, ror 18\n\t"
+ "eor x14, x14, x6, ror 34\n\t"
+ "eor x12, x12, x10, ror 41\n\t"
+ "eor x14, x14, x6, ror 39\n\t"
+ "add x5, x5, x12\n\t"
+ "eor x17, x6, x7\n\t"
+ "eor x12, x11, x4\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x10\n\t"
+ "add x5, x5, x13\n\t"
+ "eor x12, x12, x4\n\t"
+ "add x5, x5, x15\n\t"
+ "eor x16, x16, x7\n\t"
+ "add x5, x5, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x9, x9, x5\n\t"
+ "add x5, x5, x14\n\t"
+ /* Round 15 */
+ "mov x13, v7.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ext v10.16b, v7.16b, v0.16b, #8\n\t"
+ "ror x12, x9, #14\n\t"
+ "shl v8.2d, v6.2d, #45\n\t"
+ "ror x14, x5, #28\n\t"
+ "sri v8.2d, v6.2d, #19\n\t"
+ "eor x12, x12, x9, ror 18\n\t"
+ "shl v9.2d, v6.2d, #3\n\t"
+ "eor x14, x14, x5, ror 34\n\t"
+ "sri v9.2d, v6.2d, #61\n\t"
+ "eor x12, x12, x9, ror 41\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x14, x14, x5, ror 39\n\t"
+ "ushr v8.2d, v6.2d, #6\n\t"
+ "add x4, x4, x12\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x16, x5, x6\n\t"
+ "add v7.2d, v7.2d, v9.2d\n\t"
+ "eor x12, x10, x11\n\t"
+ "ext v9.16b, v3.16b, v4.16b, #8\n\t"
+ "and x17, x16, x17\n\t"
+ "add v7.2d, v7.2d, v9.2d\n\t"
+ "and x12, x12, x9\n\t"
+ "shl v8.2d, v10.2d, #63\n\t"
+ "add x4, x4, x13\n\t"
+ "sri v8.2d, v10.2d, #1\n\t"
+ "eor x12, x12, x11\n\t"
+ "tbl v9.16b, {v10.16b}, v11.16b\n\t"
+ "add x4, x4, x15\n\t"
+ "eor v9.16b, v9.16b, v8.16b\n\t"
+ "eor x17, x17, x6\n\t"
+ "ushr v10.2d, v10.2d, #7\n\t"
+ "add x4, x4, x12\n\t"
+ "eor v9.16b, v9.16b, v10.16b\n\t"
+ "add x14, x14, x17\n\t"
+ "add v7.2d, v7.2d, v9.2d\n\t"
+ "add x8, x8, x4\n\t"
+ "add x4, x4, x14\n\t"
+ "subs x27, x27, #1\n\t"
+ "bne L_sha512_len_neon_start_%=\n\t"
+ /* Round 0 */
+ "mov x13, v0.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x8, #14\n\t"
+ "ror x14, x4, #28\n\t"
+ "eor x12, x12, x8, ror 18\n\t"
+ "eor x14, x14, x4, ror 34\n\t"
+ "eor x12, x12, x8, ror 41\n\t"
+ "eor x14, x14, x4, ror 39\n\t"
+ "add x11, x11, x12\n\t"
+ "eor x17, x4, x5\n\t"
+ "eor x12, x9, x10\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x8\n\t"
+ "add x11, x11, x13\n\t"
+ "eor x12, x12, x10\n\t"
+ "add x11, x11, x15\n\t"
+ "eor x16, x16, x5\n\t"
+ "add x11, x11, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x7, x7, x11\n\t"
+ "add x11, x11, x14\n\t"
+ /* Round 1 */
+ "mov x13, v0.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x7, #14\n\t"
+ "ror x14, x11, #28\n\t"
+ "eor x12, x12, x7, ror 18\n\t"
+ "eor x14, x14, x11, ror 34\n\t"
+ "eor x12, x12, x7, ror 41\n\t"
+ "eor x14, x14, x11, ror 39\n\t"
+ "add x10, x10, x12\n\t"
+ "eor x16, x11, x4\n\t"
+ "eor x12, x8, x9\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x7\n\t"
+ "add x10, x10, x13\n\t"
+ "eor x12, x12, x9\n\t"
+ "add x10, x10, x15\n\t"
+ "eor x17, x17, x4\n\t"
+ "add x10, x10, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x6, x6, x10\n\t"
+ "add x10, x10, x14\n\t"
+ /* Round 2 */
+ "mov x13, v1.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x6, #14\n\t"
+ "ror x14, x10, #28\n\t"
+ "eor x12, x12, x6, ror 18\n\t"
+ "eor x14, x14, x10, ror 34\n\t"
+ "eor x12, x12, x6, ror 41\n\t"
+ "eor x14, x14, x10, ror 39\n\t"
+ "add x9, x9, x12\n\t"
+ "eor x17, x10, x11\n\t"
+ "eor x12, x7, x8\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x6\n\t"
+ "add x9, x9, x13\n\t"
+ "eor x12, x12, x8\n\t"
+ "add x9, x9, x15\n\t"
+ "eor x16, x16, x11\n\t"
+ "add x9, x9, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x5, x5, x9\n\t"
+ "add x9, x9, x14\n\t"
+ /* Round 3 */
+ "mov x13, v1.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x5, #14\n\t"
+ "ror x14, x9, #28\n\t"
+ "eor x12, x12, x5, ror 18\n\t"
+ "eor x14, x14, x9, ror 34\n\t"
+ "eor x12, x12, x5, ror 41\n\t"
+ "eor x14, x14, x9, ror 39\n\t"
+ "add x8, x8, x12\n\t"
+ "eor x16, x9, x10\n\t"
+ "eor x12, x6, x7\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x5\n\t"
+ "add x8, x8, x13\n\t"
+ "eor x12, x12, x7\n\t"
+ "add x8, x8, x15\n\t"
+ "eor x17, x17, x10\n\t"
+ "add x8, x8, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x4, x4, x8\n\t"
+ "add x8, x8, x14\n\t"
+ /* Round 4 */
+ "mov x13, v2.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x4, #14\n\t"
+ "ror x14, x8, #28\n\t"
+ "eor x12, x12, x4, ror 18\n\t"
+ "eor x14, x14, x8, ror 34\n\t"
+ "eor x12, x12, x4, ror 41\n\t"
+ "eor x14, x14, x8, ror 39\n\t"
+ "add x7, x7, x12\n\t"
+ "eor x17, x8, x9\n\t"
+ "eor x12, x5, x6\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x4\n\t"
+ "add x7, x7, x13\n\t"
+ "eor x12, x12, x6\n\t"
+ "add x7, x7, x15\n\t"
+ "eor x16, x16, x9\n\t"
+ "add x7, x7, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x11, x11, x7\n\t"
+ "add x7, x7, x14\n\t"
+ /* Round 5 */
+ "mov x13, v2.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x11, #14\n\t"
+ "ror x14, x7, #28\n\t"
+ "eor x12, x12, x11, ror 18\n\t"
+ "eor x14, x14, x7, ror 34\n\t"
+ "eor x12, x12, x11, ror 41\n\t"
+ "eor x14, x14, x7, ror 39\n\t"
+ "add x6, x6, x12\n\t"
+ "eor x16, x7, x8\n\t"
+ "eor x12, x4, x5\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x11\n\t"
+ "add x6, x6, x13\n\t"
+ "eor x12, x12, x5\n\t"
+ "add x6, x6, x15\n\t"
+ "eor x17, x17, x8\n\t"
+ "add x6, x6, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x10, x10, x6\n\t"
+ "add x6, x6, x14\n\t"
+ /* Round 6 */
+ "mov x13, v3.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x10, #14\n\t"
+ "ror x14, x6, #28\n\t"
+ "eor x12, x12, x10, ror 18\n\t"
+ "eor x14, x14, x6, ror 34\n\t"
+ "eor x12, x12, x10, ror 41\n\t"
+ "eor x14, x14, x6, ror 39\n\t"
+ "add x5, x5, x12\n\t"
+ "eor x17, x6, x7\n\t"
+ "eor x12, x11, x4\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x10\n\t"
+ "add x5, x5, x13\n\t"
+ "eor x12, x12, x4\n\t"
+ "add x5, x5, x15\n\t"
+ "eor x16, x16, x7\n\t"
+ "add x5, x5, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x9, x9, x5\n\t"
+ "add x5, x5, x14\n\t"
+ /* Round 7 */
+ "mov x13, v3.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x9, #14\n\t"
+ "ror x14, x5, #28\n\t"
+ "eor x12, x12, x9, ror 18\n\t"
+ "eor x14, x14, x5, ror 34\n\t"
+ "eor x12, x12, x9, ror 41\n\t"
+ "eor x14, x14, x5, ror 39\n\t"
+ "add x4, x4, x12\n\t"
+ "eor x16, x5, x6\n\t"
+ "eor x12, x10, x11\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x9\n\t"
+ "add x4, x4, x13\n\t"
+ "eor x12, x12, x11\n\t"
+ "add x4, x4, x15\n\t"
+ "eor x17, x17, x6\n\t"
+ "add x4, x4, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x8, x8, x4\n\t"
+ "add x4, x4, x14\n\t"
+ /* Round 8 */
+ "mov x13, v4.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x8, #14\n\t"
+ "ror x14, x4, #28\n\t"
+ "eor x12, x12, x8, ror 18\n\t"
+ "eor x14, x14, x4, ror 34\n\t"
+ "eor x12, x12, x8, ror 41\n\t"
+ "eor x14, x14, x4, ror 39\n\t"
+ "add x11, x11, x12\n\t"
+ "eor x17, x4, x5\n\t"
+ "eor x12, x9, x10\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x8\n\t"
+ "add x11, x11, x13\n\t"
+ "eor x12, x12, x10\n\t"
+ "add x11, x11, x15\n\t"
+ "eor x16, x16, x5\n\t"
+ "add x11, x11, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x7, x7, x11\n\t"
+ "add x11, x11, x14\n\t"
+ /* Round 9 */
+ "mov x13, v4.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x7, #14\n\t"
+ "ror x14, x11, #28\n\t"
+ "eor x12, x12, x7, ror 18\n\t"
+ "eor x14, x14, x11, ror 34\n\t"
+ "eor x12, x12, x7, ror 41\n\t"
+ "eor x14, x14, x11, ror 39\n\t"
+ "add x10, x10, x12\n\t"
+ "eor x16, x11, x4\n\t"
+ "eor x12, x8, x9\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x7\n\t"
+ "add x10, x10, x13\n\t"
+ "eor x12, x12, x9\n\t"
+ "add x10, x10, x15\n\t"
+ "eor x17, x17, x4\n\t"
+ "add x10, x10, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x6, x6, x10\n\t"
+ "add x10, x10, x14\n\t"
+ /* Round 10 */
+ "mov x13, v5.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x6, #14\n\t"
+ "ror x14, x10, #28\n\t"
+ "eor x12, x12, x6, ror 18\n\t"
+ "eor x14, x14, x10, ror 34\n\t"
+ "eor x12, x12, x6, ror 41\n\t"
+ "eor x14, x14, x10, ror 39\n\t"
+ "add x9, x9, x12\n\t"
+ "eor x17, x10, x11\n\t"
+ "eor x12, x7, x8\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x6\n\t"
+ "add x9, x9, x13\n\t"
+ "eor x12, x12, x8\n\t"
+ "add x9, x9, x15\n\t"
+ "eor x16, x16, x11\n\t"
+ "add x9, x9, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x5, x5, x9\n\t"
+ "add x9, x9, x14\n\t"
+ /* Round 11 */
+ "mov x13, v5.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x5, #14\n\t"
+ "ror x14, x9, #28\n\t"
+ "eor x12, x12, x5, ror 18\n\t"
+ "eor x14, x14, x9, ror 34\n\t"
+ "eor x12, x12, x5, ror 41\n\t"
+ "eor x14, x14, x9, ror 39\n\t"
+ "add x8, x8, x12\n\t"
+ "eor x16, x9, x10\n\t"
+ "eor x12, x6, x7\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x5\n\t"
+ "add x8, x8, x13\n\t"
+ "eor x12, x12, x7\n\t"
+ "add x8, x8, x15\n\t"
+ "eor x17, x17, x10\n\t"
+ "add x8, x8, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x4, x4, x8\n\t"
+ "add x8, x8, x14\n\t"
+ /* Round 12 */
+ "mov x13, v6.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x4, #14\n\t"
+ "ror x14, x8, #28\n\t"
+ "eor x12, x12, x4, ror 18\n\t"
+ "eor x14, x14, x8, ror 34\n\t"
+ "eor x12, x12, x4, ror 41\n\t"
+ "eor x14, x14, x8, ror 39\n\t"
+ "add x7, x7, x12\n\t"
+ "eor x17, x8, x9\n\t"
+ "eor x12, x5, x6\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x4\n\t"
+ "add x7, x7, x13\n\t"
+ "eor x12, x12, x6\n\t"
+ "add x7, x7, x15\n\t"
+ "eor x16, x16, x9\n\t"
+ "add x7, x7, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x11, x11, x7\n\t"
+ "add x7, x7, x14\n\t"
+ /* Round 13 */
+ "mov x13, v6.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x11, #14\n\t"
+ "ror x14, x7, #28\n\t"
+ "eor x12, x12, x11, ror 18\n\t"
+ "eor x14, x14, x7, ror 34\n\t"
+ "eor x12, x12, x11, ror 41\n\t"
+ "eor x14, x14, x7, ror 39\n\t"
+ "add x6, x6, x12\n\t"
+ "eor x16, x7, x8\n\t"
+ "eor x12, x4, x5\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x11\n\t"
+ "add x6, x6, x13\n\t"
+ "eor x12, x12, x5\n\t"
+ "add x6, x6, x15\n\t"
+ "eor x17, x17, x8\n\t"
+ "add x6, x6, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x10, x10, x6\n\t"
+ "add x6, x6, x14\n\t"
+ /* Round 14 */
+ "mov x13, v7.d[0]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x10, #14\n\t"
+ "ror x14, x6, #28\n\t"
+ "eor x12, x12, x10, ror 18\n\t"
+ "eor x14, x14, x6, ror 34\n\t"
+ "eor x12, x12, x10, ror 41\n\t"
+ "eor x14, x14, x6, ror 39\n\t"
+ "add x5, x5, x12\n\t"
+ "eor x17, x6, x7\n\t"
+ "eor x12, x11, x4\n\t"
+ "and x16, x17, x16\n\t"
+ "and x12, x12, x10\n\t"
+ "add x5, x5, x13\n\t"
+ "eor x12, x12, x4\n\t"
+ "add x5, x5, x15\n\t"
+ "eor x16, x16, x7\n\t"
+ "add x5, x5, x12\n\t"
+ "add x14, x14, x16\n\t"
+ "add x9, x9, x5\n\t"
+ "add x5, x5, x14\n\t"
+ /* Round 15 */
+ "mov x13, v7.d[1]\n\t"
+ "ldr x15, [x3], #8\n\t"
+ "ror x12, x9, #14\n\t"
+ "ror x14, x5, #28\n\t"
+ "eor x12, x12, x9, ror 18\n\t"
+ "eor x14, x14, x5, ror 34\n\t"
+ "eor x12, x12, x9, ror 41\n\t"
+ "eor x14, x14, x5, ror 39\n\t"
+ "add x4, x4, x12\n\t"
+ "eor x16, x5, x6\n\t"
+ "eor x12, x10, x11\n\t"
+ "and x17, x16, x17\n\t"
+ "and x12, x12, x9\n\t"
+ "add x4, x4, x13\n\t"
+ "eor x12, x12, x11\n\t"
+ "add x4, x4, x15\n\t"
+ "eor x17, x17, x6\n\t"
+ "add x4, x4, x12\n\t"
+ "add x14, x14, x17\n\t"
+ "add x8, x8, x4\n\t"
+ "add x4, x4, x14\n\t"
+ "add x11, x11, x26\n\t"
+ "add x10, x10, x25\n\t"
+ "add x9, x9, x24\n\t"
+ "add x8, x8, x23\n\t"
+ "add x7, x7, x22\n\t"
+ "add x6, x6, x21\n\t"
+ "add x5, x5, x20\n\t"
+ "add x4, x4, x19\n\t"
+ "adr x3, %[L_SHA512_transform_neon_len_k]\n\t"
+ "subs %w[len], %w[len], #0x80\n\t"
+ "bne L_sha512_len_neon_begin_%=\n\t"
+ "stp x4, x5, [%x[sha512]]\n\t"
+ "stp x6, x7, [%x[sha512], #16]\n\t"
+ "stp x8, x9, [%x[sha512], #32]\n\t"
+ "stp x10, x11, [%x[sha512], #48]\n\t"
+ "ldp x29, x30, [sp], #16\n\t"
+ : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len)
+ : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8)
+ : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+ );
+}
+
+#endif /* WOLFSSL_ARMASM */
+#endif /* __aarch64__ */
diff --git a/wolfcrypt/src/port/arm/armv8-sha512.c b/wolfcrypt/src/port/arm/armv8-sha512.c
new file mode 100644
index 0000000..e909c7c
--- /dev/null
+++ b/wolfcrypt/src/port/arm/armv8-sha512.c
@@ -0,0 +1,715 @@
+/* sha512.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_ARMASM
+#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
+
+#include <wolfssl/wolfcrypt/sha512.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/cpuid.h>
+#include <wolfssl/wolfcrypt/hash.h>
+
+#include <wolfssl/wolfcrypt/logging.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef WOLFSSL_SHA512
+
+static int InitSha512(wc_Sha512* sha512)
+{
+ if (sha512 == NULL)
+ return BAD_FUNC_ARG;
+
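+    /* SHA-512 initial hash value (FIPS 180-4, section 5.3.5) */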
+ sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
+ sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
+ sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
+ sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
+ sha512->digest[4] = W64LIT(0x510e527fade682d1);
+ sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
+ sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
+ sha512->digest[7] = W64LIT(0x5be0cd19137e2179);
+
+ sha512->buffLen = 0;
+ sha512->loLen = 0;
+ sha512->hiLen = 0;
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+ sha512->flags = 0;
+#endif
+
+ return 0;
+}
+
+#endif /* WOLFSSL_SHA512 */
+
+#ifdef WOLFSSL_SHA512
+
+int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
+{
+ int ret = 0;
+
+ if (sha512 == NULL)
+ return BAD_FUNC_ARG;
+
+ sha512->heap = heap;
+
+ ret = InitSha512(sha512);
+ if (ret != 0)
+ return ret;
+
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ sha512->W = NULL;
+#endif
+
+ (void)devId;
+
+ return ret;
+}
+
+#endif /* WOLFSSL_SHA512 */
+
+#ifndef WOLFSSL_ARMASM
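+/* SHA-512 round constants K (FIPS 180-4, section 4.2.3) */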
+static const word64 K512[80] = {
+ W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
+ W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
+ W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
+ W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
+ W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
+ W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
+ W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
+ W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
+ W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
+ W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
+ W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
+ W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
+ W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
+ W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
+ W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
+ W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
+ W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
+ W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
+ W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
+ W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
+ W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
+ W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
+ W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
+ W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
+ W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
+ W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
+ W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
+ W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
+ W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
+ W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
+ W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
+ W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
+ W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
+ W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
+ W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
+ W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
+ W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
+ W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
+ W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
+ W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
+};
+
+#ifdef LITTLE_ENDIAN_ORDER
+#define blk0(i) (W[i] = ByteReverseWord64(DATA[i]))
+#else
+#define blk0(i) (W[i] = DATA[i])
+#endif
+
+#define blk2(i) ( \
+ W[ i ] += \
+ s1(W[(i- 2) & 15])+ \
+ W[(i- 7) & 15] + \
+ s0(W[(i-15) & 15]) \
+ )
+
+#define Ch(x,y,z) (z ^ ((z ^ y) & x))
+#define Maj(x,y,z) (y ^ ((y ^ z) & (x ^ y)))
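+
+/* Ch and Maj above are operation-reduced forms of the FIPS 180-4 functions:
+ *   Ch(x,y,z)  == (x & y) ^ (~x & z)
+ *   Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z)
+ */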
+
+#define a(i) T[(0-i) & 7]
+#define b(i) T[(1-i) & 7]
+#define c(i) T[(2-i) & 7]
+#define d(i) T[(3-i) & 7]
+#define e(i) T[(4-i) & 7]
+#define f(i) T[(5-i) & 7]
+#define g(i) T[(6-i) & 7]
+#define h(i) T[(7-i) & 7]
+
+#define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39))
+#define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41))
+#define s0(x) (rotrFixed64(x, 1) ^ rotrFixed64(x, 8) ^ (x>>7))
+#define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6))
+
+#define R0(i) \
+ h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk0(i); \
+ d(i) += h(i); \
+ h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))
+#define R(i) \
+ h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + blk2(i); \
+ d(i) += h(i); \
+ h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))
+
+#define DATA sha512->buffer
+static int Transform_Sha512(wc_Sha512* sha512)
+{
+ const word64* K = K512;
+ word32 j;
+ word64 T[8];
+ word64 W[16];
+
+ /* Copy digest to working vars */
+ T[0] = sha512->digest[0];
+ T[1] = sha512->digest[1];
+ T[2] = sha512->digest[2];
+ T[3] = sha512->digest[3];
+ T[4] = sha512->digest[4];
+ T[5] = sha512->digest[5];
+ T[6] = sha512->digest[6];
+ T[7] = sha512->digest[7];
+
+ /* 80 operations, partially loop unrolled */
+ j = 0;
+ R0( 0); R0( 1); R0( 2); R0( 3);
+ R0( 4); R0( 5); R0( 6); R0( 7);
+ R0( 8); R0( 9); R0(10); R0(11);
+ R0(12); R0(13); R0(14); R0(15);
+ for (j = 16; j < 80; j += 16) {
+ R( 0); R( 1); R( 2); R( 3);
+ R( 4); R( 5); R( 6); R( 7);
+ R( 8); R( 9); R(10); R(11);
+ R(12); R(13); R(14); R(15);
+ }
+
+ /* Add the working vars back into digest */
+ sha512->digest[0] += T[0];
+ sha512->digest[1] += T[1];
+ sha512->digest[2] += T[2];
+ sha512->digest[3] += T[3];
+ sha512->digest[4] += T[4];
+ sha512->digest[5] += T[5];
+ sha512->digest[6] += T[6];
+ sha512->digest[7] += T[7];
+
+ return 0;
+}
+#undef DATA
+
+#define DATA ((word64*)data)
+static int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ const word64* K = K512;
+ word32 j;
+ word64 T[8];
+ word64 TO[8];
+ word64 W[16];
+
+ /* Copy digest to working vars */
+ T[0] = sha512->digest[0];
+ T[1] = sha512->digest[1];
+ T[2] = sha512->digest[2];
+ T[3] = sha512->digest[3];
+ T[4] = sha512->digest[4];
+ T[5] = sha512->digest[5];
+ T[6] = sha512->digest[6];
+ T[7] = sha512->digest[7];
+
+ do {
+ TO[0] = T[0];
+ TO[1] = T[1];
+ TO[2] = T[2];
+ TO[3] = T[3];
+ TO[4] = T[4];
+ TO[5] = T[5];
+ TO[6] = T[6];
+ TO[7] = T[7];
+
+ /* 80 operations, partially loop unrolled */
+ j = 0;
+ R0( 0); R0( 1); R0( 2); R0( 3);
+ R0( 4); R0( 5); R0( 6); R0( 7);
+ R0( 8); R0( 9); R0(10); R0(11);
+ R0(12); R0(13); R0(14); R0(15);
+ for (j = 16; j < 80; j += 16) {
+ R( 0); R( 1); R( 2); R( 3);
+ R( 4); R( 5); R( 6); R( 7);
+ R( 8); R( 9); R(10); R(11);
+ R(12); R(13); R(14); R(15);
+ }
+
+ T[0] += TO[0];
+ T[1] += TO[1];
+ T[2] += TO[2];
+ T[3] += TO[3];
+ T[4] += TO[4];
+ T[5] += TO[5];
+ T[6] += TO[6];
+ T[7] += TO[7];
+
+ data += 128;
+ len -= 128;
+ }
+ while (len > 0);
+
+ /* Add the working vars back into digest */
+ sha512->digest[0] = T[0];
+ sha512->digest[1] = T[1];
+ sha512->digest[2] = T[2];
+ sha512->digest[3] = T[3];
+ sha512->digest[4] = T[4];
+ sha512->digest[5] = T[5];
+ sha512->digest[6] = T[6];
+ sha512->digest[7] = T[7];
+
+ return 0;
+}
+#undef DATA
+#endif /* !WOLFSSL_ARMASM */
+
+
+static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
+{
+ word64 tmp = sha512->loLen;
+ if ( (sha512->loLen += len) < tmp)
+ sha512->hiLen++; /* carry low to high */
+}
+
+static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ int ret = 0;
+ /* do block size increments */
+ byte* local = (byte*)sha512->buffer;
+ word32 blocksLen;
+
+ /* check that internal buffLen is valid */
+ if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE)
+ return BUFFER_E;
+
+ AddLength(sha512, len);
+
+ if (sha512->buffLen > 0) {
+ word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
+ if (add > 0) {
+ XMEMCPY(&local[sha512->buffLen], data, add);
+
+ sha512->buffLen += add;
+ data += add;
+ len -= add;
+ }
+
+ if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) {
+#ifndef WOLFSSL_ARMASM
+ Transform_Sha512(sha512);
+#else
+ Transform_Sha512_Len(sha512, (const byte*)sha512->buffer,
+ WC_SHA512_BLOCK_SIZE);
+#endif
+ sha512->buffLen = 0;
+ }
+ }
+
+ blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
+ if (blocksLen > 0) {
+ /* Byte reversal performed in function if required. */
+ Transform_Sha512_Len(sha512, data, blocksLen);
+ data += blocksLen;
+ len -= blocksLen;
+ }
+
+ if (len > 0) {
+ XMEMCPY(local, data, len);
+ sha512->buffLen = len;
+ }
+
+ return ret;
+}
+
+#ifdef WOLFSSL_SHA512
+
+int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
+{
+ if (sha512 == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ return Sha512Update(sha512, data, len);
+}
+
+#endif /* WOLFSSL_SHA512 */
+
+static WC_INLINE int Sha512Final(wc_Sha512* sha512)
+{
+    byte* local;
+
+    if (sha512 == NULL) {
+        return BAD_FUNC_ARG;
+    }
+    local = (byte*)sha512->buffer;
+
+    local[sha512->buffLen++] = 0x80;  /* append the 0x80 pad byte (leading 1 bit) */
+
+ /* pad with zeros */
+ if (sha512->buffLen > WC_SHA512_PAD_SIZE) {
+ XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE -
+ sha512->buffLen);
+ sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen;
+#ifndef WOLFSSL_ARMASM
+ Transform_Sha512(sha512);
+#else
+ Transform_Sha512_Len(sha512, (const byte*)sha512->buffer,
+ WC_SHA512_BLOCK_SIZE);
+#endif
+
+ sha512->buffLen = 0;
+ }
+ XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);
+
+ /* put lengths in bits */
+ sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
+ (sha512->hiLen << 3);
+ sha512->loLen = sha512->loLen << 3;
+
+ /* store lengths */
+ /* ! length ordering dependent on digest endian type ! */
+
+ sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
+ sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
+
+ ByteReverseWords64(
+ &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
+ &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
+ WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE);
+#ifndef WOLFSSL_ARMASM
+ Transform_Sha512(sha512);
+#else
+ Transform_Sha512_Len(sha512, (const byte*)sha512->buffer,
+ WC_SHA512_BLOCK_SIZE);
+#endif
+
+#ifdef LITTLE_ENDIAN_ORDER
+ ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE);
+#endif
+
+ return 0;
+}
+
+#ifdef WOLFSSL_SHA512
+
+int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash)
+{
+#ifdef LITTLE_ENDIAN_ORDER
+ word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)];
+#endif
+
+ if (sha512 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef LITTLE_ENDIAN_ORDER
+ ByteReverseWords64((word64*)digest, (word64*)sha512->digest,
+ WC_SHA512_DIGEST_SIZE);
+ XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE);
+#else
+ XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
+#endif
+
+ return 0;
+}
+
+int wc_Sha512Final(wc_Sha512* sha512, byte* hash)
+{
+ int ret;
+
+ if (sha512 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ret = Sha512Final(sha512);
+ if (ret != 0)
+ return ret;
+
+ XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
+
+ return InitSha512(sha512); /* reset state */
+}
+
+int wc_InitSha512(wc_Sha512* sha512)
+{
+ return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
+}
+
+void wc_Sha512Free(wc_Sha512* sha512)
+{
+ if (sha512 == NULL)
+ return;
+
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ if (sha512->W != NULL) {
+ XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+ sha512->W = NULL;
+ }
+#endif
+}
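+
+/* Usage sketch (illustrative comment only, not compiled): incremental
+ * hashing with the wc_Sha512 API above; error handling trimmed.
+ *
+ *     wc_Sha512 sha;
+ *     byte hash[WC_SHA512_DIGEST_SIZE];
+ *     if (wc_InitSha512(&sha) == 0) {
+ *         wc_Sha512Update(&sha, data, dataLen);  (repeat as data arrives)
+ *         wc_Sha512Final(&sha, hash);            (writes digest, resets state)
+ *         wc_Sha512Free(&sha);
+ *     }
+ */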
+
+#endif /* WOLFSSL_SHA512 */
+
+/* -------------------------------------------------------------------------- */
+/* SHA384 */
+/* -------------------------------------------------------------------------- */
+#ifdef WOLFSSL_SHA384
+
+static int InitSha384(wc_Sha384* sha384)
+{
+ if (sha384 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
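+    /* SHA-384 initial hash value (FIPS 180-4, section 5.3.4) */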
+ sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
+ sha384->digest[1] = W64LIT(0x629a292a367cd507);
+ sha384->digest[2] = W64LIT(0x9159015a3070dd17);
+ sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
+ sha384->digest[4] = W64LIT(0x67332667ffc00b31);
+ sha384->digest[5] = W64LIT(0x8eb44a8768581511);
+ sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
+ sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
+
+ sha384->buffLen = 0;
+ sha384->loLen = 0;
+ sha384->hiLen = 0;
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+ sha384->flags = 0;
+#endif
+
+ return 0;
+}
+
+int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
+{
+ if (sha384 == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ return Sha512Update((wc_Sha512*)sha384, data, len);
+}
+
+
+int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
+{
+#ifdef LITTLE_ENDIAN_ORDER
+ word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
+#endif
+
+ if (sha384 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef LITTLE_ENDIAN_ORDER
+ ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
+ WC_SHA384_DIGEST_SIZE);
+ XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
+#else
+ XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
+#endif
+
+ return 0;
+}
+
+int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
+{
+ int ret;
+
+ if (sha384 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ret = Sha512Final((wc_Sha512*)sha384);
+ if (ret != 0)
+ return ret;
+
+ XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
+
+ return InitSha384(sha384); /* reset state */
+}
+
+int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
+{
+ int ret;
+
+ if (sha384 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ sha384->heap = heap;
+ ret = InitSha384(sha384);
+ if (ret != 0)
+ return ret;
+
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ sha384->W = NULL;
+#endif
+
+ (void)devId;
+
+ return ret;
+}
+
+int wc_InitSha384(wc_Sha384* sha384)
+{
+ return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
+}
+
+void wc_Sha384Free(wc_Sha384* sha384)
+{
+ if (sha384 == NULL)
+ return;
+
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ if (sha384->W != NULL) {
+ XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+ sha384->W = NULL;
+ }
+#endif
+}
+
+#endif /* WOLFSSL_SHA384 */
+
+#ifdef WOLFSSL_SHA512
+
+int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
+{
+ int ret;
+ wc_Sha512 tmpSha512;
+
+ if (sha512 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
+ ret = wc_Sha512Copy(sha512, &tmpSha512);
+ if (ret == 0) {
+ ret = wc_Sha512Final(&tmpSha512, hash);
+ wc_Sha512Free(&tmpSha512);
+ }
+ return ret;
+}
+
+int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
+{
+ int ret = 0;
+
+ if (src == NULL || dst == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(dst, src, sizeof(wc_Sha512));
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ dst->W = NULL;
+#endif
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+ dst->flags |= WC_HASH_FLAG_ISCOPY;
+#endif
+
+ return ret;
+}
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+int wc_Sha512SetFlags(wc_Sha512* sha512, word32 flags)
+{
+ if (sha512) {
+ sha512->flags = flags;
+ }
+ return 0;
+}
+int wc_Sha512GetFlags(wc_Sha512* sha512, word32* flags)
+{
+ if (sha512 && flags) {
+ *flags = sha512->flags;
+ }
+ return 0;
+}
+#endif
+
+#endif /* WOLFSSL_SHA512 */
+
+#ifdef WOLFSSL_SHA384
+
+int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
+{
+ int ret;
+ wc_Sha384 tmpSha384;
+
+ if (sha384 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+ ret = wc_Sha384Copy(sha384, &tmpSha384);
+ if (ret == 0) {
+ ret = wc_Sha384Final(&tmpSha384, hash);
+ wc_Sha384Free(&tmpSha384);
+ }
+ return ret;
+}
+int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
+{
+ int ret = 0;
+
+ if (src == NULL || dst == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(dst, src, sizeof(wc_Sha384));
+#ifdef WOLFSSL_SMALL_STACK_CACHE
+ dst->W = NULL;
+#endif
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+ dst->flags |= WC_HASH_FLAG_ISCOPY;
+#endif
+
+ return ret;
+}
+
+#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB)
+int wc_Sha384SetFlags(wc_Sha384* sha384, word32 flags)
+{
+ if (sha384) {
+ sha384->flags = flags;
+ }
+ return 0;
+}
+int wc_Sha384GetFlags(wc_Sha384* sha384, word32* flags)
+{
+ if (sha384 && flags) {
+ *flags = sha384->flags;
+ }
+ return 0;
+}
+#endif
+
+#endif /* WOLFSSL_SHA384 */
+
+#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
+#endif /* WOLFSSL_ARMASM */
diff --git a/wolfcrypt/src/port/arm/cryptoCell.c b/wolfcrypt/src/port/arm/cryptoCell.c
new file mode 100644
index 0000000..c3bd2d9
--- /dev/null
+++ b/wolfcrypt/src/port/arm/cryptoCell.c
@@ -0,0 +1,309 @@
+/* cryptoCell.c
+ *
+ * Copyright (C) 2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+/* This source is included in wc_port.c */
+/* WOLFSSL_CRYPTOCELL_C is defined by wc_port.c in case compile tries to
+ include this .c directly */
+#ifdef WOLFSSL_CRYPTOCELL_C
+
+#ifdef WOLFSSL_CRYPTOCELL
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/ecc.h>
+#include <wolfssl/wolfcrypt/port/arm/cryptoCell.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+/* Global Variables (extern) */
+CRYS_RND_State_t wc_rndState;
+CRYS_RND_WorkBuff_t wc_rndWorkBuff;
+SaSiRndGenerateVectWorkFunc_t wc_rndGenVectFunc = CRYS_RND_GenerateVector;
+
+static word32 cc310_enableCount = 0;
+
+static void cc310_enable(void)
+{
+ cc310_enableCount++;
+
+    /* Enable the CC310 HW and IRQ */
+
+ NRF_CRYPTOCELL->ENABLE = 1;
+ NVIC_EnableIRQ(CRYPTOCELL_IRQn);
+}
+
+static void cc310_disable(void)
+{
+ cc310_enableCount--;
+
+ /* Disable HW/IRQ if no more users */
+ if (cc310_enableCount == 0) {
+ NRF_CRYPTOCELL->ENABLE = 0;
+ NVIC_DisableIRQ(CRYPTOCELL_IRQn);
+ }
+}
+
+int cc310_Init(void)
+{
+ int ret = 0;
+ static int initialized = 0;
+
+ if (!initialized) {
+ /* Enable the CC310 HW. */
+ cc310_enable();
+
+        /* Initialize the CC310 run-time library */
+ ret = SaSi_LibInit();
+
+ if (ret != SA_SILIB_RET_OK) {
+ WOLFSSL_MSG("Error SaSi_LibInit");
+ return ret;
+ }
+
+ /* RNG CryptoCell CC310 */
+ ret = CRYS_RndInit(&wc_rndState, &wc_rndWorkBuff);
+ if (ret != CRYS_OK) {
+ WOLFSSL_MSG("Error CRYS_RndInit");
+ return ret;
+ }
+ initialized = 1;
+ }
+ return ret;
+}
+
+void cc310_Free(void)
+{
+ CRYSError_t crys_result;
+
+ SaSi_LibFini();
+
+ crys_result = CRYS_RND_UnInstantiation(&wc_rndState);
+
+ if (crys_result != CRYS_OK) {
+ WOLFSSL_MSG("Error RYS_RND_UnInstantiation");
+ }
+ cc310_disable();
+}
+
+int cc310_random_generate(byte* output, word32 size)
+{
+ CRYSError_t crys_result;
+
+ crys_result = CRYS_RND_GenerateVector(&wc_rndState, size, output);
+
+ return (crys_result == CRYS_OK) ? 0 : -1;
+}
+#ifdef HAVE_ECC
+CRYS_ECPKI_DomainID_t cc310_mapCurve(int curve_id)
+{
+ switch(curve_id)
+ {
+ case ECC_CURVE_DEF: return CRYS_ECPKI_DomainID_secp256r1; /* default */
+ case ECC_SECP160K1: return CRYS_ECPKI_DomainID_secp160k1;
+ case ECC_SECP160R1: return CRYS_ECPKI_DomainID_secp160r1;
+ case ECC_SECP160R2: return CRYS_ECPKI_DomainID_secp160r2;
+ case ECC_SECP192K1: return CRYS_ECPKI_DomainID_secp192k1;
+ case ECC_SECP192R1: return CRYS_ECPKI_DomainID_secp192r1;
+ case ECC_SECP224K1: return CRYS_ECPKI_DomainID_secp224k1;
+ case ECC_SECP224R1: return CRYS_ECPKI_DomainID_secp224r1;
+ case ECC_SECP256K1: return CRYS_ECPKI_DomainID_secp256k1;
+ case ECC_SECP256R1: return CRYS_ECPKI_DomainID_secp256r1;
+ case ECC_SECP384R1: return CRYS_ECPKI_DomainID_secp384r1;
+ case ECC_SECP521R1: return CRYS_ECPKI_DomainID_secp521r1;
+ default: WOLFSSL_MSG("Curve not identified");
+ return CRYS_ECPKI_DomainID_Builded;
+ }
+}
+#endif /* HAVE_ECC */
+
+#ifndef NO_RSA
+CRYS_RSA_HASH_OpMode_t cc310_hashModeRSA(enum wc_HashType hash_type, int isHashed)
+{
+ switch(hash_type)
+ {
+ case WC_HASH_TYPE_MD5:
+ #ifndef NO_MD5
+ return isHashed? CRYS_RSA_After_MD5_mode : CRYS_RSA_HASH_MD5_mode;
+ #endif
+ case WC_HASH_TYPE_SHA:
+ #ifndef NO_SHA
+ return isHashed? CRYS_RSA_After_SHA1_mode : CRYS_RSA_HASH_SHA1_mode;
+ #endif
+ case WC_HASH_TYPE_SHA224:
+ #ifdef WOLFSSL_SHA224
+ return isHashed? CRYS_RSA_After_SHA224_mode : CRYS_RSA_HASH_SHA224_mode;
+ #endif
+ case WC_HASH_TYPE_SHA256:
+ #ifndef NO_SHA256
+ return isHashed? CRYS_RSA_After_SHA256_mode : CRYS_RSA_HASH_SHA256_mode;
+ #endif
+ case WC_HASH_TYPE_SHA384:
+ #ifdef WOLFSSL_SHA384
+ return isHashed? CRYS_RSA_After_SHA384_mode : CRYS_RSA_HASH_SHA384_mode;
+ #endif
+ case WC_HASH_TYPE_SHA512:
+ #ifdef WOLFSSL_SHA512
+ return isHashed? CRYS_RSA_After_SHA512_mode : CRYS_RSA_HASH_SHA512_mode;
+ #endif
+ case WC_HASH_TYPE_NONE:
+ /* default to SHA256 */
+ return isHashed? CRYS_RSA_After_SHA256_mode : CRYS_RSA_HASH_SHA256_mode;
+ default:
+ return CRYS_RSA_After_HASH_NOT_KNOWN_mode;
+ }
+}
+#endif /* !NO_RSA */
+
+#ifdef HAVE_ECC
+CRYS_ECPKI_HASH_OpMode_t cc310_hashModeECC(int hash_size)
+{
+ CRYS_ECPKI_HASH_OpMode_t hash_mode;
+ switch (hash_size)
+ {
+ case 20:
+ hash_mode = CRYS_ECPKI_AFTER_HASH_SHA1_mode;
+ break;
+ case 28:
+ hash_mode = CRYS_ECPKI_AFTER_HASH_SHA224_mode;
+ break;
+ case 32:
+ hash_mode = CRYS_ECPKI_AFTER_HASH_SHA256_mode;
+ break;
+ case 48:
+ hash_mode = CRYS_ECPKI_AFTER_HASH_SHA384_mode;
+ break;
+ case 64:
+ hash_mode = CRYS_ECPKI_AFTER_HASH_SHA512_mode;
+ break;
+ default:
+ hash_mode = CRYS_ECPKI_HASH_OpModeLast;
+ break;
+ }
+ return hash_mode;
+}
+#endif /* HAVE_ECC */
+#endif /* WOLFSSL_CRYPTOCELL*/
+
+#if !defined(NO_CRYPT_BENCHMARK) && defined(WOLFSSL_nRF5x_SDK_15_2)
+
+static int mRtcSec = 0;
+static const nrfx_rtc_t rtc = NRFX_RTC_INSTANCE(0);
+
+static void rtc_handler(nrfx_rtc_int_type_t int_type)
+{
+ if (int_type == NRFX_RTC_INT_COMPARE0) {
+ mRtcSec++;
+ nrfx_rtc_counter_clear(&rtc);
+ nrfx_rtc_int_enable(&rtc, RTC_CHANNEL_INT_MASK(0));
+#ifdef BSP_LED_1
+ nrf_gpio_pin_toggle(BSP_LED_1);
+#endif
+ }
+ else if (int_type == NRF_DRV_RTC_INT_TICK) {
+#ifdef BSP_LED_0
+ nrf_gpio_pin_toggle(BSP_LED_0);
+#endif
+ }
+}
+
+static void rtc_config(void)
+{
+ uint32_t err_code;
+ nrfx_rtc_config_t config = NRFX_RTC_DEFAULT_CONFIG;
+
+ /* configure gpio for pin toggling. */
+ bsp_board_init(BSP_INIT_LEDS);
+
+ /* start the internal LFCLK XTAL oscillator.*/
+ err_code = nrf_drv_clock_init();
+ APP_ERROR_CHECK(err_code);
+ nrf_drv_clock_lfclk_request(NULL);
+
+ /* Initialize RTC instance */
+ err_code = nrfx_rtc_init(&rtc, &config, rtc_handler);
+ APP_ERROR_CHECK(err_code);
+
+ /* Enable tick event */
+ nrfx_rtc_tick_enable(&rtc, false);
+
+    /* Set compare channel to trigger interrupt after 1 second */
+ err_code = nrfx_rtc_cc_set(&rtc, 0, RTC_INPUT_FREQ, true);
+ APP_ERROR_CHECK(err_code);
+
+ /* Power on RTC instance */
+ nrfx_rtc_enable(&rtc);
+}
+
+static int rtc_get_ms(void)
+{
+ /* Prescaler is 12-bit for COUNTER: frequency = (32768/(PRESCALER+1)) */
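+    /* e.g. PRESCALER = 0 -> 32768 Hz; counter = 16384 ->
+     * (16384*1000 + 16384) / 32768 = 500 ms */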
+ int frequency = (RTC_INPUT_FREQ / (rtc_prescaler_get(rtc.p_reg) + 1));
+ uint32_t counter = nrfx_rtc_counter_get(&rtc);
+
+ /* Convert with rounding frequency to milliseconds */
+ return ((counter * 1000) + (frequency / 2) ) / frequency;
+}
+
+double current_time(int reset)
+{
+ double time;
+ static int initialized = 0;
+
+ if (!initialized) {
+ rtc_config();
+ initialized = 1;
+ }
+    (void)reset; /* reset is unused; cast avoids an unused-parameter warning */
+    time = mRtcSec;
+ time += (double)rtc_get_ms() / 1000;
+
+ return time;
+}
+
+int nrf_random_generate(byte* output, word32 size)
+{
+ uint32_t err_code;
+ static int initialized = 0;
+
+ /* RNG must be initialized once */
+ if (!initialized) {
+ err_code = nrf_drv_rng_init(NULL);
+ if (err_code != NRF_SUCCESS) {
+ return -1;
+ }
+ initialized = 1;
+ }
+ nrf_drv_rng_block_rand(output, size);
+ return 0;
+}
+#endif /* !NO_CRYPT_BENCHMARK && WOLFSSL_nRF5x_SDK_15_2 */
+
+#endif /* WOLFSSL_CRYPTOCELL_C */
diff --git a/wolfcrypt/src/port/arm/cryptoCellHash.c b/wolfcrypt/src/port/arm/cryptoCellHash.c
new file mode 100644
index 0000000..bc729f7
--- /dev/null
+++ b/wolfcrypt/src/port/arm/cryptoCellHash.c
@@ -0,0 +1,134 @@
+/* cryptoCellHash.c
+ *
+ * Copyright (C) 2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+/* This source is included in wc_port.c */
+/* WOLFSSL_CRYPTOCELL_HASH_C is defined by wc_port.c in case compile tries
+ to include this .c directly */
+#ifdef WOLFSSL_CRYPTOCELL_HASH_C
+#if !defined(NO_SHA256) && defined(WOLFSSL_CRYPTOCELL)
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+#include <wolfssl/wolfcrypt/port/arm/cryptoCell.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
+{
+ CRYSError_t ret = 0;
+
+ (void)heap;
+ (void)devId;
+
+ if (sha256 == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
+
+    /* Initialize the HASH context and machine for SHA-256 mode. */
+ ret = CRYS_HASH_Init(&sha256->ctx, CRYS_HASH_SHA256_mode);
+
+ if (ret != SA_SILIB_RET_OK){
+ WOLFSSL_MSG("Error CRYS_HASH_Init failed");
+ }
+
+ return ret;
+}
+
+int wc_InitSha256(wc_Sha256* sha256)
+{
+ return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
+}
+
+int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+{
+ CRYSError_t ret = 0;
+ size_t length;
+ size_t remaining = len;
+ byte const * p_cur = data;
+
+ if (sha256 == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (data == NULL && len == 0) {
+ /* valid, but do nothing */
+ return 0;
+ }
+
+    /* If the input is larger than CC310_MAX_LENGTH_DMA, split it into
+     * smaller chunks */
+ do {
+ length = (remaining > CC310_MAX_LENGTH_DMA) ?
+ CC310_MAX_LENGTH_DMA : remaining;
+
+ ret = CRYS_HASH_Update(&sha256->ctx, (uint8_t *)p_cur, length);
+
+ remaining -= length;
+ p_cur += length;
+
+ } while (ret == CRYS_OK && remaining > 0);
+
+ return ret;
+}
+
+int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
+{
+ CRYSError_t ret = 0;
+ CRYS_HASH_Result_t hashResult;
+
+ if (sha256 == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ret = CRYS_HASH_Finish(&sha256->ctx, hashResult);
+
+ if (ret != SA_SILIB_RET_OK){
+ WOLFSSL_MSG("Error CRYS_HASH_Finish failed");
+ return ret;
+ }
+ XMEMCPY(sha256->digest, hashResult, WC_SHA256_DIGEST_SIZE);
+
+ XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+
+ /* reset state */
+ return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
+}
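+
+/* Usage sketch (illustrative comment only, not compiled): one-shot SHA-256
+ * through the CryptoCell-backed API above; error handling trimmed.
+ *
+ *     wc_Sha256 sha;
+ *     byte digest[WC_SHA256_DIGEST_SIZE];
+ *     if (wc_InitSha256(&sha) == 0) {
+ *         wc_Sha256Update(&sha, data, dataLen);
+ *         wc_Sha256Final(&sha, digest);  (also re-initializes the state)
+ *         wc_Sha256Free(&sha);
+ *     }
+ */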
+
+void wc_Sha256Free(wc_Sha256* sha256)
+{
+ if (sha256 == NULL)
+ return;
+}
+
+#endif /* !NO_SHA256 && WOLFSSL_CRYPTOCELL */
+#endif /* WOLFSSL_CRYPTOCELL_HASH_C */
diff --git a/wolfcrypt/src/port/atmel/README.md b/wolfcrypt/src/port/atmel/README.md
new file mode 100644
index 0000000..50352fc
--- /dev/null
+++ b/wolfcrypt/src/port/atmel/README.md
@@ -0,0 +1,94 @@
+# Microchip/Atmel ATECC508A/ATECC608A Support
+
+Support for ATECC508A using these methods:
+* TLS: Using the PK callbacks and the reference ATECC508A callbacks. See the Coding section below. Requires the options `HAVE_PK_CALLBACKS` and `WOLFSSL_ATECC_PKCB` or `WOLFSSL_ATECC508A`.
+* wolfCrypt: Native `wc_ecc_*` APIs using `./configure CFLAGS="-DWOLFSSL_ATECC508A"` or `#define WOLFSSL_ATECC508A`.
+
+## Dependency
+
+Requires the Microchip CryptoAuthLib. The examples in `wolfcrypt/src/port/atmel/atmel.c` make calls to the `atcatls_*` APIs.
+
+
+## Building
+
+### Build Options
+
+* `HAVE_PK_CALLBACKS`: Option for enabling wolfSSL's PK callback support for TLS.
+* `WOLFSSL_ATECC508A`: Enables support for initializing the CryptoAuthLib and setting up the encryption key used for the I2C communication.
+* `WOLFSSL_ATECC_PKCB`: Enables support for the reference PK callbacks without init.
+* `WOLFSSL_ATMEL`: Enables the ASF hooks for seeding random data using the `atmel_get_random_number` function.
+* `WOLFSSL_ATMEL_TIME`: Enables the built-in `atmel_get_curr_time_and_date` function for getting the time from the ASF RTC.
+* `ATECC_GET_ENC_KEY`: Macro to define your own function for getting the encryption key.
+* `ATECC_SLOT_I2C_ENC`: Macro for the default encryption key slot. The slot can also be obtained via the slot allocator callback using `ATMEL_SLOT_ENCKEY`.
+* `ATECC_MAX_SLOT`: Macro for the maximum dynamically allocated slots.
+
+### Build Command Examples
+
+`./configure --enable-pkcallbacks CFLAGS="-DWOLFSSL_ATECC_PKCB"`
+`#define HAVE_PK_CALLBACKS`
+`#define WOLFSSL_ATECC_PKCB`
+
+or
+
+`./configure CFLAGS="-DWOLFSSL_ATECC508A"`
+`#define WOLFSSL_ATECC508A`
+
+
+## Coding
+
+Setup the PK callbacks for TLS using:
+
+```
+/* Setup PK Callbacks for ATECC508A */
+WOLFSSL_CTX* ctx;
+wolfSSL_CTX_SetEccKeyGenCb(ctx, atcatls_create_key_cb);
+wolfSSL_CTX_SetEccVerifyCb(ctx, atcatls_verify_signature_cb);
+wolfSSL_CTX_SetEccSignCb(ctx, atcatls_sign_certificate_cb);
+wolfSSL_CTX_SetEccSharedSecretCb(ctx, atcatls_create_pms_cb);
+```
+
+The reference ATECC508A PK callback functions are located in the `wolfcrypt/src/port/atmel/atmel.c` file.
+
+
+Adding a custom context to the callbacks:
+
+```
+/* Setup PK Callbacks context */
+WOLFSSL* ssl;
+void* myOwnCtx;
+wolfSSL_SetEccKeyGenCtx(ssl, myOwnCtx);
+wolfSSL_SetEccVerifyCtx(ssl, myOwnCtx);
+wolfSSL_SetEccSignCtx(ssl, myOwnCtx);
+wolfSSL_SetEccSharedSecretCtx(ssl, myOwnCtx);
+```
+
+## Benchmarks
+
+Supports ECC SECP256R1 (NIST P-256)
+
+### TLS
+
+TLS Establishment Times:
+
+* Hardware accelerated ATECC508A: 2.342 seconds average
+* Software only: 13.422 seconds average
+
+The TLS connection establishment time is 5.73 times faster with the ATECC508A.
+
+### Cryptographic ECC
+
+Software only implementation (SAMD21 48 MHz Cortex-M0, Fast Math TFM-ASM):
+
+```
+EC-DHE   key generation  3123.000 milliseconds, avg over 5 iterations, 1.601 ops/sec
+EC-DHE   key agreement   3117.000 milliseconds, avg over 5 iterations, 1.604 ops/sec
+EC-DSA   sign   time     1997.000 milliseconds, avg over 5 iterations, 2.504 ops/sec
+EC-DSA   verify time     5057.000 milliseconds, avg over 5 iterations, 0.988 ops/sec
+```
+
+ATECC508A HW accelerated implementation:
+
+```
+EC-DHE   key generation   144.400 milliseconds, avg over 5 iterations, 34.722 ops/sec
+EC-DHE   key agreement    134.200 milliseconds, avg over 5 iterations, 37.313 ops/sec
+EC-DSA   sign   time      293.400 milliseconds, avg over 5 iterations, 17.065 ops/sec
+EC-DSA   verify time      208.400 milliseconds, avg over 5 iterations, 24.038 ops/sec
+```
+
+
+For details see our [wolfSSL Atmel ATECC508A](https://wolfssl.com/wolfSSL/wolfssl-atmel.html) page.
diff --git a/wolfcrypt/src/port/atmel/atmel.c b/wolfcrypt/src/port/atmel/atmel.c
new file mode 100644
index 0000000..04d2aeb
--- /dev/null
+++ b/wolfcrypt/src/port/atmel/atmel.c
@@ -0,0 +1,843 @@
+/* atmel.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_ATMEL) || defined(WOLFSSL_ATECC508A) || defined(WOLFSSL_ATECC_PKCB)
+
+#include <wolfssl/wolfcrypt/memory.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/ssl.h>
+#include <wolfssl/internal.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef WOLFSSL_ATMEL
+/* remap name conflicts */
+#define Aes Aes_Remap
+#define Gmac Gmac_Remap
+#include "asf.h"
+#undef Aes
+#undef Gmac
+#endif /* WOLFSSL_ATMEL */
+
+#include <wolfssl/wolfcrypt/port/atmel/atmel.h>
+
+#ifdef WOLFSSL_ATECC508A
+
+#ifdef WOLFSSL_ATECC508A_TLS
+ extern ATCA_STATUS device_init_default(void);
+#endif
+
+static int mAtcaInitDone = 0;
+
+/* ATECC slotId handling */
+static atmel_slot_alloc_cb mSlotAlloc;
+static atmel_slot_dealloc_cb mSlotDealloc;
+static byte mSlotList[ATECC_MAX_SLOT];
+#ifndef SINGLE_THREADED
+static wolfSSL_Mutex mSlotMutex;
+#endif
+
+/* Raspberry Pi uses /dev/i2c-1 */
+#ifndef ATECC_I2C_ADDR
+#define ATECC_I2C_ADDR 0xC0
+#endif
+#ifndef ATECC_I2C_BUS
+#define ATECC_I2C_BUS 1
+#endif
+#ifndef ATECC_DEV_TYPE
+#define ATECC_DEV_TYPE ATECC508A
+#endif
+static ATCAIfaceCfg cfg_ateccx08a_i2c_pi;
+#endif /* WOLFSSL_ATECC508A */
+
+
+/**
+ * \brief Generate random number to be used for hash.
+ */
+int atmel_get_random_number(uint32_t count, uint8_t* rand_out)
+{
+ int ret = 0;
+#ifdef WOLFSSL_ATECC508A
+ uint8_t i = 0;
+ uint32_t copy_count = 0;
+ uint8_t rng_buffer[RANDOM_NUM_SIZE];
+
+ if (rand_out == NULL) {
+ return -1;
+ }
+
+ while (i < count) {
+ ret = atcab_random(rng_buffer);
+ if (ret != ATCA_SUCCESS) {
+ WOLFSSL_MSG("Failed to create random number!");
+ return -1;
+ }
+ copy_count = (count - i > RANDOM_NUM_SIZE) ? RANDOM_NUM_SIZE : count - i;
+ XMEMCPY(&rand_out[i], rng_buffer, copy_count);
+ i += copy_count;
+ }
+ #ifdef ATCAPRINTF
+ atcab_printbin_label((const char*)"\r\nRandom Number", rand_out, count);
+ #endif
+#else
+ /* TODO: Use on-board TRNG */
+#endif
+ return ret;
+}
+
+int atmel_get_random_block(unsigned char* output, unsigned int sz)
+{
+ return atmel_get_random_number((uint32_t)sz, (uint8_t*)output);
+}
+
+#if defined(WOLFSSL_ATMEL) && defined(WOLFSSL_ATMEL_TIME)
+#include "asf.h"
+#include "rtc_calendar.h"
+extern struct rtc_module *_rtc_instance[RTC_INST_NUM];
+
+long atmel_get_curr_time_and_date(long* tm)
+{
+ long rt = 0;
+
+ /* Get current time */
+ struct rtc_calendar_time rtcTime;
+ const int monthDay[] = {0,31,59,90,120,151,181,212,243,273,304,334};
+ int month, year, yearLeap;
+
+ rtc_calendar_get_time(_rtc_instance[0], &rtcTime);
+
+ /* Convert rtc_calendar_time to seconds since UTC */
+ month = rtcTime.month % 12;
+ year = rtcTime.year + rtcTime.month / 12;
+ if (month < 0) {
+ month += 12;
+ year--;
+ }
+ yearLeap = (month > 1) ? year + 1 : year;
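+    /* yearLeap excludes the current year's Feb 29 for Jan/Feb dates and
+     * includes it from March on (standard civil-time to epoch-seconds
+     * conversion) */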
+ rt = rtcTime.second
+ + 60 * (rtcTime.minute
+ + 60 * (rtcTime.hour
+ + 24 * (monthDay[month] + rtcTime.day - 1
+ + 365 * (year - 70)
+ + (yearLeap - 69) / 4
+ - (yearLeap - 1) / 100
+ + (yearLeap + 299) / 400
+ )
+ )
+ );
+
+ (void)tm;
+ return rt;
+}
+#endif
+
+
+#ifdef WOLFSSL_ATECC508A
+
+int atmel_ecc_translate_err(int status)
+{
+ switch (status) {
+ case ATCA_SUCCESS:
+ return 0;
+ case ATCA_BAD_PARAM:
+ return BAD_FUNC_ARG;
+ case ATCA_ALLOC_FAILURE:
+ return MEMORY_E;
+ default:
+ #ifdef WOLFSSL_ATECC508A_DEBUG
+ printf("ATECC Failure: %x\n", (word32)status);
+ #endif
+ break;
+ }
+ return WC_HW_E;
+}
+
+/* Function to set the slotId allocator and deallocator */
+int atmel_set_slot_allocator(atmel_slot_alloc_cb alloc,
+ atmel_slot_dealloc_cb dealloc)
+{
+#ifndef SINGLE_THREADED
+ wc_LockMutex(&mSlotMutex);
+#endif
+ mSlotAlloc = alloc;
+ mSlotDealloc = dealloc;
+#ifndef SINGLE_THREADED
+ wc_UnLockMutex(&mSlotMutex);
+#endif
+ return 0;
+}
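+/* Minimal sketch of a custom allocator pair; my_slot_alloc/my_slot_dealloc
+ * are hypothetical application callbacks that pin slot usage to a fixed
+ * provisioning layout:
+ *
+ *     static int my_slot_alloc(int slotType) {
+ *         return (slotType == ATMEL_SLOT_DEVICE) ? 0 : 2;
+ *     }
+ *     static void my_slot_dealloc(int slotId) { (void)slotId; }
+ *
+ *     atmel_set_slot_allocator(my_slot_alloc, my_slot_dealloc);
+ */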
+
+/* Function to allocate new slotId number */
+int atmel_ecc_alloc(int slotType)
+{
+ int slotId = ATECC_INVALID_SLOT, i;
+
+#ifndef SINGLE_THREADED
+ wc_LockMutex(&mSlotMutex);
+#endif
+
+ if (mSlotAlloc) {
+ slotId = mSlotAlloc(slotType);
+ }
+ else {
+ switch (slotType) {
+ case ATMEL_SLOT_ENCKEY:
+ /* not reserved in mSlotList, so return */
+ slotId = ATECC_SLOT_I2C_ENC;
+ goto exit;
+ case ATMEL_SLOT_DEVICE:
+ /* not reserved in mSlotList, so return */
+ slotId = ATECC_SLOT_AUTH_PRIV;
+ goto exit;
+ case ATMEL_SLOT_ECDHE:
+ slotId = ATECC_SLOT_ECDHE_PRIV;
+ break;
+ case ATMEL_SLOT_ECDHE_ENC:
+ slotId = ATECC_SLOT_ENC_PARENT;
+ break;
+ case ATMEL_SLOT_ANY:
+ for (i=0; i < ATECC_MAX_SLOT; i++) {
+ /* Find free slotId */
+ if (mSlotList[i] == ATECC_INVALID_SLOT) {
+ slotId = i;
+ break;
+ }
+ }
+ break;
+ }
+
+        /* is the slot available? also guard against no free slot found */
+        if (slotId == ATECC_INVALID_SLOT ||
+                mSlotList[slotId] != ATECC_INVALID_SLOT) {
+            slotId = ATECC_INVALID_SLOT;
+        }
+        else {
+            mSlotList[slotId] = slotId;
+        }
+ }
+
+exit:
+#ifndef SINGLE_THREADED
+ wc_UnLockMutex(&mSlotMutex);
+#endif
+
+ return slotId;
+}
+
+
+/* Function to return slotId number to available list */
+void atmel_ecc_free(int slotId)
+{
+#ifndef SINGLE_THREADED
+ wc_LockMutex(&mSlotMutex);
+#endif
+ if (mSlotDealloc) {
+ mSlotDealloc(slotId);
+ }
+ else if (slotId >= 0 && slotId < ATECC_MAX_SLOT) {
+ if (slotId != ATECC_SLOT_AUTH_PRIV && slotId != ATECC_SLOT_I2C_ENC) {
+ /* Mark slotId free */
+ mSlotList[slotId] = ATECC_INVALID_SLOT;
+ }
+ }
+#ifndef SINGLE_THREADED
+ wc_UnLockMutex(&mSlotMutex);
+#endif
+}
+
+
+/**
+ * \brief Callback function for getting the current encryption key
+ */
+int atmel_get_enc_key_default(byte* enckey, word16 keysize)
+{
+ if (enckey == NULL || keysize != ATECC_KEY_SIZE) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMSET(enckey, 0xFF, keysize); /* use default value */
+
+ return 0;
+}
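+/* The 0xFF pattern above matches an unprovisioned parent key. A production
+ * build would typically point the ATECC_GET_ENC_KEY macro at its own
+ * callback instead, e.g. (my_get_enc_key being a hypothetical function that
+ * reads the key written during device pairing):
+ *
+ *     #define ATECC_GET_ENC_KEY(enckey, keysize) \
+ *         my_get_enc_key((enckey), (keysize))
+ */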
+
+/**
+ * \brief Write the encryption (transport) key to its slot before use.
+ */
+static int atmel_init_enc_key(void)
+{
+ int ret;
+ uint8_t read_key[ATECC_KEY_SIZE];
+ uint8_t writeBlock = 0;
+ uint8_t writeOffset = 0;
+ int slotId;
+
+ slotId = atmel_ecc_alloc(ATMEL_SLOT_ENCKEY);
+
+ /* check for encryption key slotId */
+ if (slotId == ATECC_INVALID_SLOT)
+ return BAD_FUNC_ARG;
+
+ /* get encryption key */
+ ATECC_GET_ENC_KEY(read_key, sizeof(read_key));
+
+ ret = atcab_write_zone(ATCA_ZONE_DATA, slotId, writeBlock, writeOffset,
+ read_key, ATCA_BLOCK_SIZE);
+ ForceZero(read_key, sizeof(read_key));
+ ret = atmel_ecc_translate_err(ret);
+
+ return ret;
+}
+
+int atmel_get_rev_info(word32* revision)
+{
+ int ret;
+ ret = atcab_info((uint8_t*)revision);
+ ret = atmel_ecc_translate_err(ret);
+ return ret;
+}
+
+void atmel_show_rev_info(void)
+{
+#ifdef WOLFSSL_ATECC508A_DEBUG
+ word32 revision = 0;
+ atmel_get_rev_info(&revision);
+ printf("ATECC508A Revision: %x\n", (word32)revision);
+#endif
+}
+
+int atmel_ecc_create_pms(int slotId, const uint8_t* peerKey, uint8_t* pms)
+{
+ int ret;
+ uint8_t read_key[ATECC_KEY_SIZE];
+ int slotIdEnc;
+
+ slotIdEnc = atmel_ecc_alloc(ATMEL_SLOT_ECDHE_ENC);
+ if (slotIdEnc == ATECC_INVALID_SLOT)
+ return BAD_FUNC_ARG;
+
+ /* get encryption key */
+ ATECC_GET_ENC_KEY(read_key, sizeof(read_key));
+
+ /* send the encrypted version of the ECDH command */
+ ret = atcab_ecdh_enc(slotId, peerKey, pms, read_key, slotIdEnc);
+ ret = atmel_ecc_translate_err(ret);
+
+ /* free the ECDHE slot */
+ atmel_ecc_free(slotIdEnc);
+
+ return ret;
+}
+
+int atmel_ecc_create_key(int slotId, byte* peerKey)
+{
+ int ret;
+
+ /* verify provided slotId */
+ if (slotId == ATECC_INVALID_SLOT) {
+ return WC_HW_WAIT_E;
+ }
+
+ /* generate new ephemeral key on device */
+ ret = atcab_genkey(slotId, peerKey);
+ ret = atmel_ecc_translate_err(ret);
+ return ret;
+}
+
+int atmel_ecc_sign(int slotId, const byte* message, byte* signature)
+{
+ int ret;
+
+ ret = atcab_sign(slotId, message, signature);
+ ret = atmel_ecc_translate_err(ret);
+ return ret;
+}
+
+int atmel_ecc_verify(const byte* message, const byte* signature,
+ const byte* pubkey, int* verified)
+{
+    int ret;
+    bool is_verified = false;
+
+    /* use a local bool: casting int* to bool* is undefined when the two
+     * types differ in size */
+    ret = atcab_verify_extern(message, signature, pubkey, &is_verified);
+    ret = atmel_ecc_translate_err(ret);
+    if (verified != NULL) {
+        *verified = (int)is_verified;
+    }
+ return ret;
+}
+
+#endif /* WOLFSSL_ATECC508A */
+
+
+
+int atmel_init(void)
+{
+ int ret = 0;
+
+#ifdef WOLFSSL_ATECC508A
+ if (!mAtcaInitDone) {
+ ATCA_STATUS status;
+ int i;
+
+ #ifndef SINGLE_THREADED
+ wc_InitMutex(&mSlotMutex);
+ #endif
+
+ /* Init the free slotId list */
+ for (i=0; i<ATECC_MAX_SLOT; i++) {
+ if (i == ATECC_SLOT_AUTH_PRIV || i == ATECC_SLOT_I2C_ENC) {
+ mSlotList[i] = i;
+ }
+ else {
+ /* ECC Slots (mark avail) */
+ mSlotList[i] = ATECC_INVALID_SLOT;
+ }
+ }
+
+ /* Setup the hardware interface */
+ XMEMSET(&cfg_ateccx08a_i2c_pi, 0, sizeof(cfg_ateccx08a_i2c_pi));
+ cfg_ateccx08a_i2c_pi.iface_type = ATCA_I2C_IFACE;
+ cfg_ateccx08a_i2c_pi.devtype = ATECC_DEV_TYPE;
+ cfg_ateccx08a_i2c_pi.atcai2c.slave_address = ATECC_I2C_ADDR;
+ cfg_ateccx08a_i2c_pi.atcai2c.bus = ATECC_I2C_BUS;
+ cfg_ateccx08a_i2c_pi.atcai2c.baud = 400000;
+ cfg_ateccx08a_i2c_pi.wake_delay = 1500;
+ cfg_ateccx08a_i2c_pi.rx_retries = 20;
+
+ /* Initialize the CryptoAuthLib to communicate with ATECC508A */
+ status = atcab_init(&cfg_ateccx08a_i2c_pi);
+ if (status != ATCA_SUCCESS) {
+ WOLFSSL_MSG("Failed to initialize atcab");
+ return WC_HW_E;
+ }
+
+ /* show revision information */
+ atmel_show_rev_info();
+
+ #ifdef WOLFSSL_ATECC508A_TLS
+ /* Configure the ECC508 for use with TLS API functions */
+ device_init_default();
+ #endif
+
+        /* Init the I2C pipe encryption key. The value is generated during
+           pairing with the ATECC508A and stored on the micro's flash.
+           For this example it is a fixed value. */
+ if (atmel_init_enc_key() != 0) {
+ WOLFSSL_MSG("Failed to initialize transport key");
+ return WC_HW_E;
+ }
+
+ mAtcaInitDone = 1;
+ }
+#endif /* WOLFSSL_ATECC508A */
+ return ret;
+}
+
+void atmel_finish(void)
+{
+#ifdef WOLFSSL_ATECC508A
+ if (mAtcaInitDone) {
+ atcab_release();
+
+ #ifndef SINGLE_THREADED
+ wc_FreeMutex(&mSlotMutex);
+ #endif
+
+ mAtcaInitDone = 0;
+ }
+#endif
+}
+
+
+/* Reference PK Callbacks */
+#ifdef HAVE_PK_CALLBACKS
+
+/**
+ * \brief Used on the server-side only for creating the ephemeral key for ECDH
+ */
+int atcatls_create_key_cb(WOLFSSL* ssl, ecc_key* key, unsigned int keySz,
+ int ecc_curve, void* ctx)
+{
+ int ret;
+ uint8_t peerKey[ATECC_PUBKEY_SIZE];
+ uint8_t* qx = &peerKey[0];
+ uint8_t* qy = &peerKey[ATECC_PUBKEY_SIZE/2];
+ int slotId;
+
+ (void)ssl;
+ (void)ctx;
+
+ /* ATECC508A only supports P-256 */
+ if (ecc_curve == ECC_SECP256R1) {
+ slotId = atmel_ecc_alloc(ATMEL_SLOT_ECDHE);
+ if (slotId == ATECC_INVALID_SLOT)
+ return WC_HW_WAIT_E;
+
+ /* generate new ephemeral key on device */
+ ret = atmel_ecc_create_key(slotId, peerKey);
+
+ /* load generated ECC508A public key into key, used by wolfSSL */
+ if (ret == 0) {
+ ret = wc_ecc_import_unsigned(key, qx, qy, NULL, ECC_SECP256R1);
+ }
+
+ if (ret == 0) {
+ key->slot = slotId;
+ }
+ else {
+ atmel_ecc_free(slotId);
+ #ifdef WOLFSSL_ATECC508A_DEBUG
+ printf("atcatls_create_key_cb: ret %d\n", ret);
+ #endif
+ }
+ }
+ else {
+ #ifndef WOLFSSL_ATECC508A_NOSOFTECC
+ /* use software for non P-256 cases */
+ WC_RNG rng;
+ ret = wc_InitRng(&rng);
+ if (ret == 0) {
+ ret = wc_ecc_make_key_ex(&rng, keySz, key, ecc_curve);
+ wc_FreeRng(&rng);
+ }
+ #else
+ ret = NOT_COMPILED_IN;
+ #endif /* !WOLFSSL_ATECC508A_NOSOFTECC */
+ }
+ return ret;
+}
+
+/**
+ * \brief Creates a shared secret using a peer public key and a device key
+ */
+int atcatls_create_pms_cb(WOLFSSL* ssl, ecc_key* otherKey,
+ unsigned char* pubKeyDer, word32* pubKeySz,
+ unsigned char* out, word32* outlen,
+ int side, void* ctx)
+{
+ int ret;
+ ecc_key tmpKey;
+ uint8_t peerKeyBuf[ATECC_PUBKEY_SIZE];
+ uint8_t* peerKey = peerKeyBuf;
+ uint8_t* qx = &peerKey[0];
+ uint8_t* qy = &peerKey[ATECC_PUBKEY_SIZE/2];
+ word32 qxLen = ATECC_PUBKEY_SIZE/2, qyLen = ATECC_PUBKEY_SIZE/2;
+
+ if (pubKeyDer == NULL || pubKeySz == NULL || out == NULL || outlen == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ (void)ssl;
+ (void)ctx;
+ (void)otherKey;
+
+ ret = wc_ecc_init(&tmpKey);
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* ATECC508A only supports P-256 */
+ if (otherKey->dp->id == ECC_SECP256R1) {
+ XMEMSET(peerKey, 0, ATECC_PUBKEY_SIZE);
+
+ /* for client: create and export public key */
+ if (side == WOLFSSL_CLIENT_END) {
+ int slotId = atmel_ecc_alloc(ATMEL_SLOT_ECDHE);
+            if (slotId == ATECC_INVALID_SLOT) {
+                ret = WC_HW_WAIT_E;
+                goto exit; /* free tmpKey instead of leaking it */
+            }
+ tmpKey.slot = slotId;
+
+ /* generate new ephemeral key on device */
+ ret = atmel_ecc_create_key(slotId, peerKey);
+            if (ret != 0) { /* atmel_ecc_create_key returns translated codes */
+ goto exit;
+ }
+
+ /* convert raw unsigned public key to X.963 format for TLS */
+ ret = wc_ecc_import_unsigned(&tmpKey, qx, qy, NULL, ECC_SECP256R1);
+ if (ret == 0) {
+ ret = wc_ecc_export_x963(&tmpKey, pubKeyDer, pubKeySz);
+ }
+
+ /* export peer's key as raw unsigned for hardware */
+ if (ret == 0) {
+ ret = wc_ecc_export_public_raw(otherKey, qx, &qxLen, qy, &qyLen);
+ }
+ }
+
+ /* for server: import public key */
+ else if (side == WOLFSSL_SERVER_END) {
+ tmpKey.slot = otherKey->slot;
+
+ /* import peer's key and export as raw unsigned for hardware */
+ ret = wc_ecc_import_x963_ex(pubKeyDer, *pubKeySz, &tmpKey, ECC_SECP256R1);
+ if (ret == 0) {
+ ret = wc_ecc_export_public_raw(&tmpKey, qx, &qxLen, qy, &qyLen);
+ }
+ }
+ else {
+ ret = BAD_FUNC_ARG;
+ }
+
+ if (ret != 0) {
+ goto exit;
+ }
+
+ ret = atmel_ecc_create_pms(tmpKey.slot, peerKey, out);
+ *outlen = ATECC_KEY_SIZE;
+
+ #ifndef WOLFSSL_ATECC508A_NOIDLE
+ /* put chip into idle to prevent watchdog situation on chip */
+ atcab_idle();
+ #endif
+
+ (void)qxLen;
+ (void)qyLen;
+ }
+ else {
+ #ifndef WOLFSSL_ATECC508A_NOSOFTECC
+ /* use software for non P-256 cases */
+ ecc_key* privKey = NULL;
+ ecc_key* pubKey = NULL;
+
+ /* for client: create and export public key */
+        if (side == WOLFSSL_CLIENT_END) {
+ WC_RNG rng;
+ privKey = &tmpKey;
+ pubKey = otherKey;
+
+ ret = wc_InitRng(&rng);
+ if (ret == 0) {
+ ret = wc_ecc_make_key_ex(&rng, 0, privKey, otherKey->dp->id);
+ if (ret == 0) {
+ ret = wc_ecc_export_x963(privKey, pubKeyDer, pubKeySz);
+ }
+ wc_FreeRng(&rng);
+ }
+ }
+ /* for server: import public key */
+ else if (side == WOLFSSL_SERVER_END) {
+ privKey = otherKey;
+ pubKey = &tmpKey;
+
+ ret = wc_ecc_import_x963_ex(pubKeyDer, *pubKeySz, pubKey,
+ otherKey->dp->id);
+ }
+ else {
+ ret = BAD_FUNC_ARG;
+ }
+
+ /* generate shared secret and return it */
+ if (ret == 0) {
+ ret = wc_ecc_shared_secret(privKey, pubKey, out, outlen);
+ }
+ #else
+ ret = NOT_COMPILED_IN;
+ #endif /* !WOLFSSL_ATECC508A_NOSOFTECC */
+ }
+
+exit:
+ wc_ecc_free(&tmpKey);
+
+#ifdef WOLFSSL_ATECC508A_DEBUG
+ if (ret != 0) {
+ printf("atcab_ecdh_enc: ret %d\n", ret);
+ }
+#endif
+
+ return ret;
+}
+
+
+/**
+ * \brief Sign received digest using private key on device
+ */
+int atcatls_sign_certificate_cb(WOLFSSL* ssl, const byte* in, unsigned int inSz,
+ byte* out, word32* outSz, const byte* key, unsigned int keySz, void* ctx)
+{
+ int ret;
+ byte sigRs[ATECC_SIG_SIZE];
+ int slotId;
+
+ (void)ssl;
+ (void)inSz;
+ (void)key;
+ (void)keySz;
+ (void)ctx;
+
+ if (in == NULL || out == NULL || outSz == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ slotId = atmel_ecc_alloc(ATMEL_SLOT_DEVICE);
+ if (slotId == ATECC_INVALID_SLOT)
+ return WC_HW_WAIT_E;
+
+ /* We can only sign with P-256 */
+ ret = atmel_ecc_sign(slotId, in, sigRs);
+    if (ret != 0) {
+ ret = WC_HW_E; goto exit;
+ }
+
+#ifndef WOLFSSL_ATECC508A_NOIDLE
+ /* put chip into idle to prevent watchdog situation on chip */
+ atcab_idle();
+#endif
+
+ /* Encode with ECDSA signature */
+ ret = wc_ecc_rs_raw_to_sig(
+ &sigRs[0], ATECC_SIG_SIZE/2,
+ &sigRs[ATECC_SIG_SIZE/2], ATECC_SIG_SIZE/2,
+ out, outSz);
+ if (ret != 0) {
+ goto exit;
+ }
+
+exit:
+
+ atmel_ecc_free(slotId);
+
+#ifdef WOLFSSL_ATECC508A_DEBUG
+ if (ret != 0) {
+ printf("atcatls_sign_certificate_cb: ret %d\n", ret);
+ }
+#endif
+
+ return ret;
+}
+
+/**
+ * \brief Verify a signature received from the peer to prove possession of
+ * the peer's private key.
+ */
+int atcatls_verify_signature_cb(WOLFSSL* ssl, const byte* sig, unsigned int sigSz,
+ const byte* hash, unsigned int hashSz, const byte* key, unsigned int keySz, int* result,
+ void* ctx)
+{
+ int ret;
+ ecc_key tmpKey;
+ word32 idx = 0;
+ uint8_t peerKey[ATECC_PUBKEY_SIZE];
+ uint8_t* qx = &peerKey[0];
+ uint8_t* qy = &peerKey[ATECC_PUBKEY_SIZE/2];
+ word32 qxLen = ATECC_PUBKEY_SIZE/2, qyLen = ATECC_PUBKEY_SIZE/2;
+ byte sigRs[ATECC_SIG_SIZE];
+ word32 rSz = ATECC_SIG_SIZE/2;
+ word32 sSz = ATECC_SIG_SIZE/2;
+
+ (void)sigSz;
+ (void)hashSz;
+ (void)ctx;
+
+ if (ssl == NULL || key == NULL || sig == NULL || hash == NULL || result == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* import public key */
+ ret = wc_ecc_init(&tmpKey);
+ if (ret == 0) {
+ ret = wc_EccPublicKeyDecode(key, &idx, &tmpKey, keySz);
+ }
+    if (ret != 0) {
+        wc_ecc_free(&tmpKey);
+        goto exit;
+    }
+
+ if (tmpKey.dp->id == ECC_SECP256R1) {
+ /* export public as unsigned bin for hardware */
+ ret = wc_ecc_export_public_raw(&tmpKey, qx, &qxLen, qy, &qyLen);
+ wc_ecc_free(&tmpKey);
+ if (ret != 0) {
+ goto exit;
+ }
+
+ /* decode the ECDSA signature */
+ ret = wc_ecc_sig_to_rs(sig, sigSz,
+ &sigRs[0], &rSz,
+ &sigRs[ATECC_SIG_SIZE/2], &sSz);
+ if (ret != 0) {
+ goto exit;
+ }
+
+ ret = atmel_ecc_verify(hash, sigRs, peerKey, result);
+        if (ret != 0 || !*result) {
+ ret = WC_HW_E; goto exit;
+ }
+
+ #ifndef WOLFSSL_ATECC508A_NOIDLE
+ /* put chip into idle to prevent watchdog situation on chip */
+ atcab_idle();
+ #endif
+ }
+    else {
+    #ifndef WOLFSSL_ATECC508A_NOSOFTECC
+        ret = wc_ecc_verify_hash(sig, sigSz, hash, hashSz, result, &tmpKey);
+    #else
+        ret = NOT_COMPILED_IN;
+    #endif /* !WOLFSSL_ATECC508A_NOSOFTECC */
+        wc_ecc_free(&tmpKey); /* the hardware path frees tmpKey above */
+    }
+
+    (void)rSz;
+    (void)sSz;
+    (void)qxLen;
+    (void)qyLen;
+
+exit:
+
+#ifdef WOLFSSL_ATECC508A_DEBUG
+ if (ret != 0) {
+ printf("atcatls_verify_signature_cb: ret %d\n", ret);
+ }
+#endif
+
+ return ret;
+}
+
+int atcatls_set_callbacks(WOLFSSL_CTX* ctx)
+{
+ wolfSSL_CTX_SetEccKeyGenCb(ctx, atcatls_create_key_cb);
+ wolfSSL_CTX_SetEccVerifyCb(ctx, atcatls_verify_signature_cb);
+ wolfSSL_CTX_SetEccSignCb(ctx, atcatls_sign_certificate_cb);
+ wolfSSL_CTX_SetEccSharedSecretCb(ctx, atcatls_create_pms_cb);
+ return 0;
+}
+
+int atcatls_set_callback_ctx(WOLFSSL* ssl, void* user_ctx)
+{
+ wolfSSL_SetEccKeyGenCtx(ssl, user_ctx);
+ wolfSSL_SetEccVerifyCtx(ssl, user_ctx);
+ wolfSSL_SetEccSignCtx(ssl, user_ctx);
+ wolfSSL_SetEccSharedSecretCtx(ssl, user_ctx);
+ return 0;
+}
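+/* End-to-end sketch of using these callbacks on a TLS client (illustrative;
+ * error handling omitted):
+ *
+ *     WOLFSSL_CTX* ctx = wolfSSL_CTX_new(wolfTLSv1_2_client_method());
+ *     atmel_init();                  <- bring up CryptoAuthLib
+ *     atcatls_set_callbacks(ctx);    <- route ECC ops to the ATECC508A
+ *
+ *     WOLFSSL* ssl = wolfSSL_new(ctx);
+ *     atcatls_set_callback_ctx(ssl, NULL);
+ *     wolfSSL_connect(ssl);
+ *     ...
+ *     atmel_finish();
+ */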
+
+
+#endif /* HAVE_PK_CALLBACKS */
+
+#endif /* WOLFSSL_ATMEL || WOLFSSL_ATECC508A || WOLFSSL_ATECC_PKCB */
diff --git a/wolfcrypt/src/port/caam/caam_aes.c b/wolfcrypt/src/port/caam/caam_aes.c
new file mode 100644
index 0000000..e00214d
--- /dev/null
+++ b/wolfcrypt/src/port/caam/caam_aes.c
@@ -0,0 +1,649 @@
+/* caam_aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_AES) && \
+ !defined(NO_IMX6_CAAM_AES)
+
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/aes.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#include <wolfssl/wolfcrypt/port/caam/wolfcaam.h>
+#include <wolfssl/wolfcrypt/port/caam/caam_driver.h>
+
+#if defined(WOLFSSL_CAAM_DEBUG) || defined(WOLFSSL_CAAM_PRINT)
+#include <stdio.h>
+#endif
+
+int wc_AesSetKey(Aes* aes, const byte* key, word32 len,
+ const byte* iv, int dir)
+{
+ int ret;
+
+ if (aes == NULL || key == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (len > 32) {
+ byte out[32]; /* max AES key size */
+ word32 outSz;
+
+ if (len != 64 && len != 72 && len != 80) {
+ return BAD_FUNC_ARG;
+ }
+
+ outSz = sizeof(out);
+        /* a length greater than 32 means an encapsulated key blob; try to open it */
+ if ((ret = wc_caamOpenBlob((byte*)key, len, out, &outSz)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY((byte*)aes->key, out, outSz);
+ aes->keylen = outSz;
+ }
+ else {
+ if (len != 16 && len != 24 && len != 32) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY((byte*)aes->key, key, len);
+ aes->keylen = len;
+ }
+
+ switch (aes->keylen) {
+ case 16: aes->rounds = 10; break;
+ case 24: aes->rounds = 12; break;
+ case 32: aes->rounds = 14; break;
+ default:
+ return BAD_FUNC_ARG;
+ }
+
+ if ((ret = wc_AesSetIV(aes, iv)) != 0) {
+ return ret;
+ }
+
+#ifdef WOLFSSL_AES_COUNTER
+ aes->left = 0;
+#endif
+
+ return 0;
+}
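+/* The 64/72/80 sizes accepted above are encapsulated key blobs: assuming the
+ * standard CAAM blob overhead of 48 bytes, they unwrap to 16-, 24- and
+ * 32-byte AES keys respectively (e.g. 64 = 16 + 48). */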
+
+
+int wc_AesCbcEncrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz)
+{
+ word32 blocks;
+
+ WOLFSSL_ENTER("wc_AesCbcEncrypt");
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ blocks = sz / AES_BLOCK_SIZE;
+
+ if (blocks > 0) {
+ Buffer buf[4];
+ word32 arg[4];
+ word32 keySz;
+ int ret;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)aes->reg;
+ buf[1].Length = AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer;
+ buf[2].TheAddress = (Address)in;
+ buf[2].Length = blocks * AES_BLOCK_SIZE;
+
+ buf[3].BufferType = DataBuffer | LastBuffer;
+ buf[3].TheAddress = (Address)out;
+ buf[3].Length = blocks * AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_ENC;
+ arg[1] = keySz;
+ arg[2] = blocks * AES_BLOCK_SIZE;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESCBC)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES CBC encrypt");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+
+int wc_AesCbcDecrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz)
+{
+ word32 blocks;
+
+ WOLFSSL_ENTER("wc_AesCbcDecrypt");
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ blocks = sz / AES_BLOCK_SIZE;
+
+ if (blocks > 0) {
+ Buffer buf[4];
+ word32 arg[4];
+ word32 keySz;
+ int ret;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)aes->reg;
+ buf[1].Length = AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer;
+ buf[2].TheAddress = (Address)in;
+ buf[2].Length = blocks * AES_BLOCK_SIZE;
+
+ buf[3].BufferType = DataBuffer | LastBuffer;
+ buf[3].TheAddress = (Address)out;
+ buf[3].Length = blocks * AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_DEC;
+ arg[1] = keySz;
+ arg[2] = blocks * AES_BLOCK_SIZE;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESCBC)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES CBC decrypt");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+#if defined(HAVE_AES_ECB)
+/* It is assumed that the input size is a multiple of AES_BLOCK_SIZE */
+int wc_AesEcbEncrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz)
+{
+ word32 blocks;
+ Buffer buf[3];
+ word32 arg[4];
+ word32 keySz;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ blocks = sz / AES_BLOCK_SIZE;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)in;
+ buf[1].Length = blocks * AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer | LastBuffer;
+ buf[2].TheAddress = (Address)out;
+ buf[2].Length = blocks * AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_ENC;
+ arg[1] = keySz;
+ arg[2] = blocks * AES_BLOCK_SIZE;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESECB)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES ECB encrypt");
+ return ret;
+ }
+
+ return 0;
+}
+
+
+int wc_AesEcbDecrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz)
+{
+ word32 blocks;
+ Buffer buf[3];
+ word32 arg[4];
+ word32 keySz;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ blocks = sz / AES_BLOCK_SIZE;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)in;
+ buf[1].Length = blocks * AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer | LastBuffer;
+ buf[2].TheAddress = (Address)out;
+ buf[2].Length = blocks * AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_DEC;
+ arg[1] = keySz;
+ arg[2] = blocks * AES_BLOCK_SIZE;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESECB)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES ECB decrypt");
+ return ret;
+ }
+
+ return 0;
+}
+#endif
+
+/* AES-CTR */
+#ifdef WOLFSSL_AES_COUNTER
+/* Increment AES counter (from wolfcrypt/src/aes.c) */
+static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+{
+ /* in network byte order so start at end and work back */
+ int i;
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+}
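+/* e.g. a counter ending in {0x00, 0xFF} increments to {0x01, 0x00}: the
+ * last byte wraps to zero and the carry propagates one byte to the left */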
+
+
+int wc_AesCtrEncrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz)
+{
+ byte* tmp;
+ Buffer buf[4];
+ word32 arg[4];
+ word32 keySz;
+ int ret, blocks;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* consume any unused bytes left in aes->tmp */
+ tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+ while (aes->left && sz) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ sz--;
+ }
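+    /* e.g. two back-to-back 10-byte calls share one keystream block: the
+     * second call XORs the 6 cached bytes above before asking the CAAM for
+     * a new block */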
+
+ /* do full blocks to then get potential left over amount */
+ blocks = sz / AES_BLOCK_SIZE;
+ if (blocks > 0) {
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)aes->reg;
+ buf[1].Length = AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer;
+ buf[2].TheAddress = (Address)in;
+ buf[2].Length = blocks * AES_BLOCK_SIZE;
+
+ buf[3].BufferType = DataBuffer | LastBuffer;
+ buf[3].TheAddress = (Address)out;
+ buf[3].Length = blocks * AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_ENC;
+ arg[1] = keySz;
+ arg[2] = blocks * AES_BLOCK_SIZE;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESCTR)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES CTR encrypt");
+ return ret;
+ }
+
+ out += blocks * AES_BLOCK_SIZE;
+ in += blocks * AES_BLOCK_SIZE;
+ sz -= blocks * AES_BLOCK_SIZE;
+ }
+
+ if (sz) {
+ wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
+ IncrementAesCounter((byte*)aes->reg);
+
+ aes->left = AES_BLOCK_SIZE;
+ tmp = (byte*)aes->tmp;
+
+ while (sz--) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ }
+ }
+
+ return 0;
+}
+#endif
+
+
+/* AES-DIRECT */
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ Buffer buf[3];
+ word32 arg[4];
+ word32 keySz;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ /* return BAD_FUNC_ARG; */
+ return;
+ }
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ /* return BAD_FUNC_ARG; */
+ return;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)in;
+ buf[1].Length = AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer | LastBuffer;
+ buf[2].TheAddress = (Address)out;
+ buf[2].Length = AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_ENC;
+ arg[1] = keySz;
+ arg[2] = AES_BLOCK_SIZE;
+
+ if (wc_caamAddAndWait(buf, arg, CAAM_AESECB) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES direct encrypt");
+ }
+}
+
+
+void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ Buffer buf[3];
+ word32 arg[4];
+ word32 keySz;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ /* return BAD_FUNC_ARG; */
+ return;
+ }
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ /* return BAD_FUNC_ARG; */
+ return;
+ }
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)in;
+ buf[1].Length = AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer | LastBuffer;
+ buf[2].TheAddress = (Address)out;
+ buf[2].Length = AES_BLOCK_SIZE;
+
+ arg[0] = CAAM_DEC;
+ arg[1] = keySz;
+ arg[2] = AES_BLOCK_SIZE;
+
+ if (wc_caamAddAndWait(buf, arg, CAAM_AESECB) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES direct decrypt");
+ }
+}
+
+
+int wc_AesSetKeyDirect(Aes* aes, const byte* key, word32 len,
+ const byte* iv, int dir)
+{
+ return wc_AesSetKey(aes, key, len, iv, dir);
+}
+#endif
+
+#ifdef HAVE_AESCCM
+int wc_AesCcmEncrypt(Aes* aes, byte* out,
+ const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ Buffer buf[5];
+ word32 arg[4];
+ word32 keySz;
+ word32 i;
+ byte B0Ctr0[AES_BLOCK_SIZE + AES_BLOCK_SIZE];
+ int lenSz;
+ byte mask = 0xFF;
+ const word32 wordSz = (word32)sizeof(word32);
+ int ret;
+
+ /* sanity check on arguments */
+ if (aes == NULL || out == NULL || in == NULL || nonce == NULL
+ || authTag == NULL || nonceSz < 7 || nonceSz > 13 ||
+ authTagSz > AES_BLOCK_SIZE)
+ return BAD_FUNC_ARG;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* set up B0 and CTR0 similar to how wolfcrypt/src/aes.c does */
+ XMEMCPY(B0Ctr0+1, nonce, nonceSz);
+ XMEMCPY(B0Ctr0+AES_BLOCK_SIZE+1, nonce, nonceSz);
+ lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+ B0Ctr0[0] = (authInSz > 0 ? 64 : 0)
+ + (8 * (((byte)authTagSz - 2) / 2))
+ + (lenSz - 1);
+ for (i = 0; i < lenSz; i++) {
+ if (mask && i >= wordSz)
+ mask = 0x00;
+ B0Ctr0[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+ B0Ctr0[AES_BLOCK_SIZE + AES_BLOCK_SIZE - 1 - i] = 0;
+ }
+ B0Ctr0[AES_BLOCK_SIZE] = lenSz - 1;
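+    /* B0 flags byte per RFC 3610: 0x40 marks the presence of AAD, bits 3-5
+     * hold (authTagSz - 2) / 2, and bits 0-2 hold lenSz - 1; CTR0's flags
+     * byte (set just above) carries only lenSz - 1 */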
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)B0Ctr0;
+ buf[1].Length = AES_BLOCK_SIZE + AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer;
+ buf[2].TheAddress = (Address)authIn;
+ buf[2].Length = authInSz;
+
+ buf[3].BufferType = DataBuffer;
+ buf[3].TheAddress = (Address)in;
+ buf[3].Length = inSz;
+
+ buf[4].BufferType = DataBuffer | LastBuffer;
+ buf[4].TheAddress = (Address)out;
+ buf[4].Length = inSz;
+
+ arg[0] = CAAM_ENC;
+ arg[1] = keySz;
+ arg[2] = inSz;
+ arg[3] = authInSz;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESCCM)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES-CCM encrypt");
+ return ret;
+ }
+
+ XMEMCPY(authTag, B0Ctr0, authTagSz);
+ return 0;
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+int wc_AesCcmDecrypt(Aes* aes, byte* out,
+ const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ Buffer buf[5];
+ word32 arg[4];
+ word32 keySz;
+ word32 i;
+ byte B0Ctr0[AES_BLOCK_SIZE + AES_BLOCK_SIZE];
+ byte tag[AES_BLOCK_SIZE];
+ int lenSz;
+ byte mask = 0xFF;
+ const word32 wordSz = (word32)sizeof(word32);
+ int ret;
+
+ /* sanity check on arguments */
+ if (aes == NULL || out == NULL || in == NULL || nonce == NULL
+ || authTag == NULL || nonceSz < 7 || nonceSz > 13 ||
+ authTagSz > AES_BLOCK_SIZE)
+ return BAD_FUNC_ARG;
+
+ if (wc_AesGetKeySize(aes, &keySz) != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* set up B0 and CTR0 similar to how wolfcrypt/src/aes.c does */
+ XMEMCPY(B0Ctr0+1, nonce, nonceSz);
+ XMEMCPY(B0Ctr0+AES_BLOCK_SIZE+1, nonce, nonceSz);
+ lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+ B0Ctr0[0] = (authInSz > 0 ? 64 : 0)
+ + (8 * (((byte)authTagSz - 2) / 2))
+ + (lenSz - 1);
+ for (i = 0; i < lenSz; i++) {
+ if (mask && i >= wordSz)
+ mask = 0x00;
+ B0Ctr0[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+ B0Ctr0[AES_BLOCK_SIZE + AES_BLOCK_SIZE - 1 - i] = 0;
+ }
+ B0Ctr0[AES_BLOCK_SIZE] = lenSz - 1;
+ wc_AesEncryptDirect(aes, tag, B0Ctr0 + AES_BLOCK_SIZE);
+
+ /* Set buffers for key, cipher text, and plain text */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)aes->key;
+ buf[0].Length = keySz;
+
+ buf[1].BufferType = DataBuffer;
+ buf[1].TheAddress = (Address)B0Ctr0;
+ buf[1].Length = AES_BLOCK_SIZE + AES_BLOCK_SIZE;
+
+ buf[2].BufferType = DataBuffer;
+ buf[2].TheAddress = (Address)authIn;
+ buf[2].Length = authInSz;
+
+ buf[3].BufferType = DataBuffer;
+ buf[3].TheAddress = (Address)in;
+ buf[3].Length = inSz;
+
+ buf[4].BufferType = DataBuffer | LastBuffer;
+ buf[4].TheAddress = (Address)out;
+ buf[4].Length = inSz;
+
+ arg[0] = CAAM_DEC;
+ arg[1] = keySz;
+ arg[2] = inSz;
+ arg[3] = authInSz;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, CAAM_AESCCM)) != 0) {
+ WOLFSSL_MSG("Error with CAAM AES-CCM derypt");
+ return ret;
+ }
+
+ xorbuf(tag, B0Ctr0, authTagSz);
+ if (ConstantCompare(tag, authTag, authTagSz) != 0) {
+ /* If the authTag check fails, don't keep the decrypted data.
+ * Unfortunately, you need the decrypted data to calculate the
+ * check value. */
+ XMEMSET(out, 0, inSz);
+ ret = AES_CCM_AUTH_E;
+ }
+
+ ForceZero(tag, AES_BLOCK_SIZE);
+ ForceZero(B0Ctr0, AES_BLOCK_SIZE * 2);
+
+ return ret;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESCCM */
+
+#endif /* WOLFSSL_IMX6_CAAM && !NO_AES */
+
diff --git a/wolfcrypt/src/port/caam/caam_doc.pdf b/wolfcrypt/src/port/caam/caam_doc.pdf
new file mode 100644
index 0000000..8213634
--- /dev/null
+++ b/wolfcrypt/src/port/caam/caam_doc.pdf
Binary files differ
diff --git a/wolfcrypt/src/port/caam/caam_driver.c b/wolfcrypt/src/port/caam/caam_driver.c
new file mode 100644
index 0000000..5d44f2d
--- /dev/null
+++ b/wolfcrypt/src/port/caam/caam_driver.c
@@ -0,0 +1,1713 @@
+/* caam_driver.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#if defined(__INTEGRITY) || defined(INTEGRITY)
+
+/* build into Integrity kernel */
+#include <bsp.h>
+#include "wolfssl/wolfcrypt/port/caam/caam_driver.h"
+
+#define CAAM_READ(reg) *(volatile unsigned int*)(reg)
+#define CAAM_WRITE(reg, in) *(volatile unsigned int*)(reg) = (in);
+
+#define DESC_COUNT 1
+#define MAX_BUF 20
+#define BUFFER_COUNT (MAX_BUF * DESC_COUNT)
+
+/* CAAM descriptors can be at most 64 unsigned ints */
+#define MAX_DESC_SZ 64
+
+/* 64 byte buffer for when data crosses a page boundary */
+#define ALIGN_BUF 16
+
+/* MAX_CTX is 64 bytes (sha512 digest) + 8 bytes (CAAM length value) */
+#define MAX_CTX 18
+
+#define MIN_READ_REG 0xF2100000
+#define MAX_READ_REG 0xF2110000
+
+struct JobRing {
+ Address JobIn;
+ Address JobOut;
+ Address Desc;
+ Value page; /* page allocation for descriptor to use */
+};
+
+struct buffer {
+ Address data;
+ Address dataSz;
+};
+
+/* CAAM descriptor */
+struct DescStruct {
+ struct IORequestStruct TheIORequest;
+ struct CAAM_DEVICE* caam;
+ struct buffer buf[MAX_BUF]; /* buffers holding data input address */
+ UINT4 desc[MAX_DESC_SZ]; /* max size of 64 word32 */
+ UINT4 aadSzBuf[4]; /* Formatted AAD size for CCM */
+ UINT4 alignBuf[ALIGN_BUF]; /* 64 byte buffer for non page
+ align */
+ UINT4 iv[MAX_CTX]; /* AES IV and also hash state */
+ UINT4 ctxBuf[MAX_CTX]; /* key */
+ Address output; /* address to output buffer */
+ Address ctxOut; /* address to update buffer holding state */
+ Value alignIdx;/* index for align buffer */
+ Value idx; /* index for descriptor buffer */
+ Value headIdx; /* for first portion of descriptor buffer */
+ Value lastIdx; /* for last portion of descriptor buffer */
+ Value outputIdx; /* idx to output buffer in "buf" */
+ Value inputSz; /* size of input buffer */
+ Value ctxSz; /* size of CTX/Key buffer */
+ Value aadSz; /* AAD size for CCM */
+ Value lastFifo;
+ Value type;
+ Value state;
+ Value DescriptorCount;
+ Boolean running; /* True if building/running descriptor is
+ in process */
+};
+
+struct CAAM_DEVICE {
+ struct IODeviceVectorStruct caamVector;
+ struct IODescriptorStruct IODescriptorArray[BUFFER_COUNT];
+ struct DescStruct DescArray[DESC_COUNT];
+ volatile Value InterruptStatus;
+ CALL HandleInterruptCall;
+ struct JobRing ring;
+};
+
+#define DRIVER_NAME "wolfSSL_CAAM_Driver"
+
+static struct CAAM_DEVICE caam;
+
+/******************************************************************************
+ Internal CAAM Job Ring and partition functions
+ ****************************************************************************/
+
+/* flush job ring and reset */
+static Error caamReset(void)
+{
+ int t = 100000; /* time out counter for flushing job ring */
+
+ /* make sure interrupts are masked in JRCFGR0_LS register */
+ CAAM_WRITE(CAAM_BASE | 0x1054, CAAM_READ(CAAM_BASE | 0x1054) | 1);
+
+ /* flush and reset job rings using JRCR0 register */
+ CAAM_WRITE(CAAM_BASE | 0x106C, 1);
+
+ /* check register JRINTR for if halt is in progress */
+ while (t > 0 && ((CAAM_READ(CAAM_BASE | 0x104C) & 0x4) == 0x4)) t--;
+ if (t == 0) {
+        /* unrecoverable failure: the job ring is locked up, a hard reset is
+           needed */
+ return NotRestartable;
+ }
+
+ /* now that flush has been done restart the job ring */
+ t = 100000;
+ CAAM_WRITE(CAAM_BASE | 0x106C, 1);
+ while (t > 0 && ((CAAM_READ(CAAM_BASE | 0x106C) & 1) == 1)) t--;
+ if (t == 0) {
+        /* unrecoverable failure: the reset bit did not return to 0 */
+ return NotRestartable;
+ }
+
+ /* reset most registers and state machines in CAAM using MCFGR register
+ also reset DMA */
+ CAAM_WRITE(CAAM_BASE | 0x0004, 0x90000000);
+
+ return Success;
+}
+
+/* returns MemoryMapMayNotBeEmpty if page/par is already owned
+ * returns Success on success
+ * all other returns is an error state
+ */
+static Error caamCreatePartition(unsigned char page, unsigned char par)
+{
+ /* check ownership of partition */
+ if ((CAAM_READ(CAAM_BASE | 0x1FBC) & (0x3 << (par * 2))) > 0) {
+ return MemoryMapMayNotBeEmpty;
+ }
+
+ /* set generic all access permissions, gets reset later */
+ CAAM_WRITE(CAAM_BASE | (0x1108 + (par * 16)), 0xF);
+ CAAM_WRITE(CAAM_BASE | (0x110C + (par * 16)), 0xF);
+ CAAM_WRITE(CAAM_BASE | (0x1104 + (par * 16)), 0xFF);
+
+ /* check ownership of page */
+ CAAM_WRITE(CAAM_BASE | 0x10F4, (page << 16) | 0x5);
+ /* wait for inquiry cmd to complete */
+ while ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x0000C000) > 0 &&
+ (CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) == 0) {
+ }
+ if ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x000000C0) == 0xC0) {
+ /* owns the page can dealloc it */
+ CAAM_WRITE(CAAM_BASE | 0x10F4, (page << 16) | 0x2);
+ while ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x0000C000) > 0 &&
+ (CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) == 0) {}
+ if ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) > 0) {
+ /* error while deallocating page */
+ return MemoryMapMayNotBeEmpty; /* PSP set on page or is unavailable */
+ }
+ }
+ else {
+ /* check if owned by someone else */
+ if ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x000000C0) != 0) {
+ return MemoryMapMayNotBeEmpty;
+ }
+ }
+
+ /* allocate page to partition */
+ CAAM_WRITE(CAAM_BASE | 0x10F4, (page << 16) | (par << 8) | 0x1);
+ /* wait for alloc cmd to complete */
+ while ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x0000C000) > 0 &&
+ (CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) == 0) {
+ }
+
+ if ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) > 0) {
+ return MemoryOperationNotPerformed;
+ }
+
+ /* double check ownership now of page */
+ CAAM_WRITE(CAAM_BASE | 0x10F4, (page << 16) | 0x5);
+ /* wait for inquiry cmd to complete */
+ while ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x0000C000) > 0 &&
+ (CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) == 0) {
+ }
+ if ((CAAM_READ(CAAM_BASE | 0x10FC) & 0x0000000F) == 0 ||
+ (CAAM_READ(CAAM_BASE | 0x10FC) & 0x00003000) > 0) {
+ /* page not owned */
+ return MemoryOperationNotPerformed;
+ }
+
+ return Success;
+}
+
+
+/* Gets the status of a job.
+ * Returns Waiting if no output jobs are ready to be read.
+ * Returns NoActivityReady if a job is done but does not match desc.
+ * "status" holds the error value, if any. */
+static Error caamGetJob(struct CAAM_DEVICE* dev, UINT4* status)
+{
+ UINT4 reg = CAAM_READ(CAAM_BASE | 0x1044); /* JRSTAR0 status */
+ if (status) {
+ *status = 0;
+ }
+
+ /* check for DECO, CCB, and Job Ring error state JRSTAR0 register */
+ if (((reg & 0xF0000000) == 0x20000000) || /* CCB error */
+ ((reg & 0xF0000000) == 0x40000000)|| /* DECO error */
+ ((reg & 0xF0000000) == 0x60000000)) { /* Job Ring error */
+
+ if ((reg & 0x0000000F) > 0) {
+ *status = reg;
+ return Failure;
+ }
+ }
+
+ /* Check number of done jobs in output list */
+ reg = CAAM_READ(CAAM_BASE | 0x103C);
+ if ((reg & 0x000003FF) > 0) {
+ UINT4* out = (UINT4*)(dev->ring.JobOut);
+ if (status) {
+ *status = out[1];
+ }
+
+ if ((dev->ring.Desc ^ 0xF0000000) != out[0]) {
+ db_printf("CAAM job completed vs expected mismatch");
+ return NoActivityReady;
+ }
+
+ if (out[1] > 0) {
+ return Failure;
+ }
+
+ /* increment jobs removed */
+ CAAM_WRITE(CAAM_BASE | 0x1034, 1);
+ }
+ else {
+ /* check if the CAAM is idle and not processing any descriptors */
+ if ((CAAM_READ(CAAM_BASE | 0x0FD4) & 0x00000002) == 2 /* idle */
+ && (CAAM_READ(CAAM_BASE | 0x0FD4) & 0x00000001) == 0) {
+ return NoActivityReady;
+ }
+
+ return Waiting;
+ }
+
+ return Success;
+}
+
+
+/* Initialize CAAM RNG
+ * returns 0 on success */
+static int caamInitRng(struct CAAM_DEVICE* dev)
+{
+ UINT4 reg, status;
+ int ret = 0;
+
+ /* Set up use of the TRNG for seeding wolfSSL HASH-DRBG */
+ CAAM_WRITE(CAAM_RTMCTL, CAAM_PRGM);
+ CAAM_WRITE(CAAM_RTMCTL, CAAM_READ(CAAM_RTMCTL) | 0x40); /* reset */
+
+ /* Set up reading from TRNG */
+ CAAM_WRITE(CAAM_RTMCTL, CAAM_READ(CAAM_RTMCTL) | CAAM_TRNG);
+
+ /* Set up delay for TRNG @TODO Optimizations?
+ * Shift left with RTSDCTL because 0-15 is for sample number
+ * Also setting the max and min frequencies */
+ CAAM_WRITE(CAAM_RTSDCTL, (CAAM_ENT_DLY << 16) | 0x09C4);
+ CAAM_WRITE(CAAM_RTFRQMIN, CAAM_ENT_DLY >> 1); /* 1/2 */
+ CAAM_WRITE(CAAM_RTFRQMAX, CAAM_ENT_DLY << 3); /* up to 8x */
+
+ /* Set back to run mode and clear RTMCL error bit */
+ reg = CAAM_READ(CAAM_RTMCTL) ^ CAAM_PRGM;
+
+ CAAM_WRITE(CAAM_RTMCTL, reg);
+ reg = CAAM_READ(CAAM_RTMCTL);
+ reg |= CAAM_CTLERR;
+ CAAM_WRITE(CAAM_RTMCTL, reg);
+
+ /* check input slot is available and then add */
+ if (CAAM_READ(CAAM_BASE | 0x1014) > 0) {
+ UINT4* in = (UINT4*)dev->ring.JobIn;
+
+ memcpy((unsigned char*)dev->ring.Desc, (unsigned char*)wc_rng_start,
+ sizeof(wc_rng_start));
+
+ in[0] = dev->ring.Desc ^ 0xF0000000; /* physical address */
+ CAAM_WRITE(CAAM_IRJAR0, 0x00000001);
+ }
+ else {
+ return Waiting;
+ }
+
+ do {
+ ret = caamGetJob(dev, &status);
+ /* @TODO use a better way to chill out CPU. */
+ } while (ret == Waiting);
+
+ return ret;
+}
+
+
+static Error caamDoJob(struct DescStruct* desc)
+{
+ Error ret;
+ UINT4 status;
+
+ /* clear and set desc size */
+ desc->desc[0] &= 0xFFFFFF80;
+ desc->desc[0] += desc->idx;
+
+ /* check input slot is available and then add */
+ if (CAAM_READ(CAAM_BASE | 0x1014) > 0) {
+ UINT4* in = (UINT4*)desc->caam->ring.JobIn;
+
+ memcpy((unsigned char*)desc->caam->ring.Desc, (unsigned char*)desc->desc,
+ (desc->idx + 1) * sizeof(UINT4));
+
+ in[0] = desc->caam->ring.Desc ^ 0xF0000000; /* physical address */
+ CAAM_WRITE(CAAM_IRJAR0, 0x00000001);
+ }
+ else {
+ return Waiting;
+ }
+
+ do {
+ ret = caamGetJob(desc->caam, &status);
+ /* @TODO use a better way to chill out CPU. */
+ } while (ret == Waiting);
+
+ if (status != 0 || ret != Success) {
+ #if 0
+ /* Used during testing to print out descriptor */
+ {
+ char msg[2048];
+ char* pt = msg;
+ int z;
+
+ memset(msg, 0, sizeof(msg));
+ for (z = 0; z < desc->idx; z++) {
+ snprintf(pt, sizeof(msg) - (z * 21), "desc[%d] = 0x%8.8x, ",
+ z, desc->desc[z]);
+ pt += 21;
+ }
+ snprintf(pt, sizeof(msg) - (z * 21), "status = 0x%8.8x\n", status);
+ if (desc->buf[0].data != 0) { /* for testing */
+ memcpy((char*)desc->buf[0].data, msg, sizeof(msg));
+ }
+ }
+ #endif
+
+
+ /* try to reset after error */
+ caamReset();
+ return ret;
+ }
+
+ return Success;
+}
+
+
+/* handle input or output buffers
+ * NOTES: if sz == 0 then read all of the remaining buffers
+ *        when align == 1 there are no alignment constraints
+ *
+ * returns the data size in bytes on success; on failure a negative value is
+ * returned.
+ */
+static int caamAddIO(struct DescStruct* desc, UINT4 options, UINT4 sz,
+ UINT4 align, UINT4* idx)
+{
+ int i, outSz = 0;
+
+ if (align == 0) {
+ return -1; /* programming error */
+ }
+
+ for (i = *idx; i < desc->DescriptorCount; i++) {
+ /* input must be a multiple of "align" bytes */
+ struct buffer* buf = &desc->buf[i];
+ int blocks = buf->dataSz / align;
+ Address data = buf->data;
+ Address dataSz = buf->dataSz;
+
+ if (outSz >= sz && sz != 0) {
+ break;
+ }
+
+ if (dataSz % align > 0) {
+ /* store potential overlap */
+ int tmpSz = dataSz % align;
+ int add = (tmpSz < (align - desc->alignIdx)) ? tmpSz :
+ align - desc->alignIdx;
+ unsigned char* local = (unsigned char*)desc->alignBuf;
+
+ /* if already something in the buffer then add from front */
+ if (desc->alignIdx > 0) {
+ memcpy((unsigned char*)&local[desc->alignIdx],
+ (unsigned char*)data, add);
+ data += add;
+ }
+ else {
+ memcpy((unsigned char*)&local[desc->alignIdx],
+ (unsigned char*)data + (blocks * align), add);
+ }
+ dataSz -= add;
+ desc->alignIdx += add;
+ }
+
+ if (desc->alignIdx == align) {
+ desc->lastFifo = desc->idx;
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return -1;
+ }
+ desc->desc[desc->idx++] = options + desc->alignIdx;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->alignBuf);
+ ASP_FlushCaches((Address)desc->alignBuf, desc->alignIdx);
+ outSz += desc->alignIdx;
+ }
+
+ if (blocks > 0) {
+ desc->lastFifo = desc->idx;
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return -1;
+ }
+ desc->desc[desc->idx++] = options + (blocks * align);
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(data);
+ outSz += (blocks * align);
+
+ /* only one buffer available for align cases so exit here and make
+ a new descriptor after running current one */
+ if (desc->alignIdx == align) {
+ desc->alignIdx = 0;
+ i++; /* start at next buffer */
+ break;
+ }
+ }
+ }
+
+ *idx = i;
+ return outSz;
+}
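+/* Worked example of the alignment handling above, assuming align = 16 and a
+ * single 70-byte buffer: 4 whole blocks (64 bytes) are queued directly from
+ * the buffer, the 6-byte remainder is staged in alignBuf, and it is only
+ * queued once later buffers top it up to a full 16 bytes. */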
+
+
+/******************************************************************************
+ IODevice Register Read and Write
+ ****************************************************************************/
+
+static Error caamReadRegister(IODeviceVector ioCaam, Value reg, Value *out)
+{
+ if (reg < MIN_READ_REG || reg > MAX_READ_REG) {
+ return IllegalRegisterNumber;
+ }
+
+ switch (reg) {
+ case CAAM_STATUS:
+ case CAAM_VERSION_MS:
+ case CAAM_VERSION_LS:
+ case CAMM_SUPPORT_MS:
+ case CAMM_SUPPORT_LS:
+ case CAAM_RTMCTL:
+ *out = CAAM_READ(reg);
+ break;
+
+ default:
+ return IllegalRegisterNumber;
+ }
+
+ (void)ioCaam;
+ return Success;
+}
+
+
+static Error caamWriteRegister(IODeviceVector ioCaam, Value reg, Value in)
+{
+ /* Should be no need for writes */
+ return OperationNotAllowedOnTheUniversalIODevice;
+}
+
+
+/******************************************************************************
+ CAAM Blob Operations
+ ****************************************************************************/
+
+/* limited in size because a CAAM descriptor holds at most 64 word32s */
+static Error caamBlob(struct DescStruct* desc)
+{
+ Error err;
+ UINT4 keyType = 0x00000C08; /* default red */
+ UINT4 i = 0;
+ int sz = 0, ret;
+
+ if (desc->idx + 3 > MAX_DESC_SZ) {
+ return Failure;
+ }
+
+    /* default to red key type, with an offset of 12 and an 8-byte load to context 2 */
+ desc->desc[desc->idx++] = (CAAM_LOAD_CTX | CAAM_CLASS2 | CAAM_IMM | keyType);
+
+ /* add key modifier */
+ if (i < desc->DescriptorCount) {
+ UINT4* pt;
+ Address data = desc->buf[i].data;
+ Address dataSz = desc->buf[i].dataSz;
+
+ pt = (UINT4*)data;
+        if (dataSz < 8) { /* expecting 8 bytes for key modifier */
+ return TooManyBuffers;
+ }
+ desc->desc[desc->idx++] = pt[0];
+ desc->desc[desc->idx++] = pt[1];
+ }
+
+ /* add input */
+ while (sz < desc->inputSz && i < desc->DescriptorCount) {
+ ret = caamAddIO(desc, CAAM_SEQI, desc->inputSz - sz, 1, &i);
+ if (ret < 0) { /* handle error case */
+ return TooManyBuffers;
+ }
+ sz += ret;
+ }
+ desc->outputIdx = i;
+
+ /* add output */
+ if (caamAddIO(desc, CAAM_SEQO, 0, 1, &i) < 0) {
+ return TooManyBuffers;
+ }
+
+ if (desc->idx + 1 > MAX_DESC_SZ) {
+ return Failure;
+ }
+ desc->desc[desc->idx++] = CAAM_OP | CAAM_OPID_BLOB | desc->type;
+
+ if ((err = caamDoJob(desc)) != Success) {
+ return err;
+ }
+
+ /* flush output buffers */
+ for (i = desc->outputIdx; i < desc->DescriptorCount; i++) {
+ ASP_FlushCaches(desc->buf[i].data, desc->buf[i].dataSz);
+ }
+
+ return Success;
+}
+
+
+/******************************************************************************
+ CAAM AES Operations
+ ****************************************************************************/
+
+/* returns the amount written on success and a negative value on error.
+ * Differs from caamAddIO in that it adds a single input buffer
+ * rather than multiple ones.
+ */
+static int caamAesInput(struct DescStruct* desc, UINT4* idx, int align,
+ UINT4 totalSz)
+{
+ int sz;
+ UINT4 i = *idx;
+
+ /* handle alignment constraints on input */
+ if (desc->alignIdx > 0) {
+ sz = desc->alignIdx;
+
+ /* if there is more input buffers then add part of it */
+ if (i < desc->outputIdx && i < desc->DescriptorCount) {
+ sz = align - desc->alignIdx;
+ sz = (sz <= desc->buf[i].dataSz) ? sz : desc->buf[i].dataSz;
+ memcpy((unsigned char*)(desc->alignBuf) + desc->alignIdx,
+ (unsigned char*)(desc->buf[i].data), sz);
+
+ desc->buf[i].dataSz -= sz;
+ desc->buf[i].data += sz;
+ sz += desc->alignIdx;
+ }
+
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return -1;
+ }
+ ASP_FlushCaches((Address)desc->alignBuf, sz);
+ desc->desc[desc->idx++] = (CAAM_FIFO_L | FIFOL_TYPE_LC1 |
+ CAAM_CLASS1 | FIFOL_TYPE_MSG) + sz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->alignBuf);
+ desc->alignIdx = 0;
+ }
+ else {
+ sz = desc->buf[i].dataSz;
+ if ((totalSz + sz) == desc->inputSz) { /* not an issue on final */
+ align = 1;
+ }
+
+ desc->alignIdx = sz % align;
+ if (desc->alignIdx != 0) {
+ sz -= desc->alignIdx;
+ memcpy((unsigned char*)desc->alignBuf,
+ (unsigned char*)(desc->buf[i].data) + sz,
+ desc->alignIdx);
+ }
+
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return -1;
+ }
+ desc->desc[desc->idx++] = (CAAM_FIFO_L | FIFOL_TYPE_LC1 |
+ CAAM_CLASS1 | FIFOL_TYPE_MSG) + sz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->buf[i].data);
+ i++;
+ }
+
+ *idx = i;
+ return sz;
+}
+
+
+/* returns enum Success on success, all other return values should be
+ * considered an error.
+ *
+ * ofst is the amount of leftover buffer from previous calls
+ * inputSz is the amount of input in bytes that is being matched to output
+ */
+static Error caamAesOutput(struct DescStruct* desc, int* ofst, UINT4 inputSz)
+{
+ int offset = *ofst;
+
+ if (desc->output != 0 && offset > 0 && inputSz > 0) {
+ UINT4 addSz;
+
+ /* handle potential leftovers */
+ addSz = (inputSz >= offset) ? offset : inputSz;
+
+ inputSz -= addSz;
+ desc->desc[desc->idx++] = CAAM_FIFO_S | FIFOS_TYPE_MSG + addSz;
+ if (inputSz > 0) { /* check if expecting more output */
+ desc->desc[desc->idx - 1] |= CAAM_FIFOS_CONT;
+ }
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->output);
+
+ if (addSz == offset) {
+ /* reset */
+ desc->output = 0;
+ offset = 0;
+ }
+ else {
+ offset -= addSz;
+ desc->output += addSz;
+
+ if (offset < 0) {
+ return TransferFailed;
+ }
+ }
+ }
+
+ for (; desc->lastIdx < desc->DescriptorCount; desc->lastIdx++) {
+ struct buffer* buf = &desc->buf[desc->lastIdx];
+
+ if (inputSz > 0) {
+ int tmp;
+
+ if (buf->dataSz <= inputSz) {
+ tmp = buf->dataSz;
+ }
+ else {
+ offset = buf->dataSz - inputSz;
+ tmp = inputSz;
+ desc->output = buf->data + tmp;
+ }
+ inputSz -= tmp;
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = CAAM_FIFO_S | FIFOS_TYPE_MSG + tmp;
+ if (inputSz > 0) { /* check if expecting more output */
+ desc->desc[desc->idx - 1] |= CAAM_FIFOS_CONT;
+ }
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(buf->data);
+ }
+ else {
+ break;
+ }
+ }
+
+ *ofst = offset;
+ return Success;
+}
+
+
+/* check size of output and get starting buffer for it */
+static Error caamAesOutSz(struct DescStruct* desc, UINT4 i)
+{
+ int sz = 0;
+
+ for (desc->outputIdx = i; desc->outputIdx < desc->DescriptorCount &&
+ sz < desc->inputSz; desc->outputIdx++) {
+ sz += desc->buf[desc->outputIdx].dataSz;
+ }
+ desc->lastIdx = desc->outputIdx;
+
+ /* make certain that output size is same as input */
+ sz = 0;
+ for (; desc->lastIdx < desc->DescriptorCount; desc->lastIdx++) {
+ sz += desc->buf[desc->lastIdx].dataSz;
+ }
+ if (sz != desc->inputSz) {
+ return SizeIsTooLarge;
+ }
+ desc->lastIdx = desc->outputIdx;
+
+ return Success;
+}
+
+
+/* AES operations follow the buffer sequence of KEY -> (IV) -> Input -> Output
+ */
+static Error caamAes(struct DescStruct* desc)
+{
+ struct buffer* ctx[3];
+ struct buffer* iv[3];
+ Value ofst = 0;
+ Error err;
+ UINT4 i, totalSz = 0;
+ int ctxIdx = 0;
+ int ivIdx = 0;
+ int offset = 0;
+ int align = 1;
+ int sz = 0;
+
+ int ctxSz = desc->ctxSz;
+
+ if (desc->state != CAAM_ENC && desc->state != CAAM_DEC) {
+ return IllegalStatusNumber;
+ }
+
+ if (ctxSz != 16 && ctxSz != 24 && ctxSz != 32) {
+ return ArgumentError;
+ }
+
+ /* get key */
+ for (i = 0; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)desc->ctxBuf;
+
+ if (sz < ctxSz && sz < (MAX_CTX * sizeof(UINT4))) {
+ ctx[ctxIdx] = buf;
+ sz += buf->dataSz;
+
+ memcpy((unsigned char*)&local[offset],
+ (unsigned char*)ctx[ctxIdx]->data, ctx[ctxIdx]->dataSz);
+ offset += ctx[ctxIdx]->dataSz;
+ ctxIdx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* sanity checks on size of key */
+ if (sz > ctxSz) {
+ return SizeIsTooLarge;
+ }
+ if (ctxSz > (MAX_CTX * sizeof(UINT4)) - 16) {
+ return ArgumentError;
+ }
+
+ /* Flush cache of ctx buffer then :
+ Add KEY Load command 0x0220000X
+ Add address to read key from 0xXXXXXXXX */
+ ASP_FlushCaches((Address)desc->ctxBuf, ctxSz);
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = (CAAM_KEY | CAAM_CLASS1 | CAAM_NWB) + ctxSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->ctxBuf);
+
+ /* get IV if needed by algorithm */
+ switch (desc->type) {
+ case CAAM_AESECB:
+ break;
+
+ case CAAM_AESCTR:
+ ofst = 0x00001000;
+ /* fall through because states are the same only the offset changes */
+
+ case CAAM_AESCBC:
+ {
+ int maxSz = 16; /* default to CBC/CTR max size */
+
+ sz = 0;
+ offset = 0;
+ for (; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)desc->iv;
+
+ if (sz < maxSz) {
+ iv[ivIdx] = buf;
+
+ if (buf->dataSz + sz > maxSz) {
+ return SizeIsTooLarge;
+ }
+
+ sz += buf->dataSz;
+ memcpy((unsigned char*)&local[offset],
+ (unsigned char*)iv[ivIdx]->data, iv[ivIdx]->dataSz);
+ offset += iv[ivIdx]->dataSz;
+ ivIdx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ if (sz != maxSz) {
+ /* invalid IV size */
+ return SizeIsTooLarge;
+ }
+
+ ASP_FlushCaches((Address)desc->iv, maxSz);
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = (CAAM_LOAD_CTX | CAAM_CLASS1 | ofst) + maxSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->iv);
+ }
+ break;
+
+ default:
+ return OperationNotImplemented;
+ }
+
+ /* write operation */
+ if (desc->idx + 1 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = CAAM_OP | CAAM_CLASS1 | desc->type |
+ CAAM_ALG_UPDATE | desc->state;
+
+ /* find output buffers */
+ if (caamAesOutSz(desc, i) != Success) {
+ return SizeIsTooLarge;
+ }
+
+ /* set alignment constraints */
+ if (desc->type == CAAM_AESCBC || desc->type == CAAM_AESECB) {
+ align = 16;
+ }
+
+ /* indefinite loop for input/output buffers */
+ desc->headIdx = desc->idx;
+ desc->output = 0;
+ offset = 0; /* store left over amount for output buffer */
+ do {
+ desc->idx = desc->headIdx; /* reset for each loop */
+
+        /* add a single input buffer (multiple ones were giving DECO watchdog
+         * timeout errors on the FIFO load of 1c).
+         * @TODO this could be a place for optimization if more data could be
+         * loaded in at one time */
+ if ((sz = caamAesInput(desc, &i, align, totalSz)) < 0) {
+ return TransferFailed;
+ }
+ totalSz += sz;
+
+ if (caamAesOutput(desc, &offset, sz) != Success) {
+ return TransferFailed;
+ }
+
+ /* store updated IV */
+ if (ivIdx > 0) {
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = CAAM_STORE_CTX | CAAM_CLASS1 | ofst | 16;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical((Address)desc->iv);
+ }
+
+ if ((err = caamDoJob(desc)) != Success) {
+ return err;
+ }
+ ASP_FlushCaches((Address)desc->iv, 16);
+ } while (desc->lastIdx < desc->DescriptorCount || offset > 0);
+
+ /* flush output buffers */
+ for (i = desc->outputIdx; i < desc->lastIdx; i++) {
+ ASP_FlushCaches(desc->buf[i].data, desc->buf[i].dataSz);
+ }
+
+ /* handle case with IV */
+ if (ivIdx > 0) {
+ unsigned char* pt = (unsigned char*)desc->iv;
+ ASP_FlushCaches((Address)pt, 16);
+ for (i = 0; i < ivIdx; i++) {
+ memcpy((unsigned char*)iv[i]->data, pt, iv[i]->dataSz);
+ pt += iv[i]->dataSz;
+ ASP_FlushCaches(iv[i]->data, iv[i]->dataSz);
+ }
+ }
+
+ return Success;
+}
+
+
+/******************************************************************************
+ CAAM AEAD Operations
+ ****************************************************************************/
+
+/* AEAD operations follow the buffer sequence of KEY -> (IV or B0 | CTR0) -> (AD)
+ * -> Input -> Output
+ *
+ */
+static Error caamAead(struct DescStruct* desc)
+{
+ struct buffer* ctx[3];
+ struct buffer* iv[3];
+ Value ofst = 0;
+ UINT4 state = CAAM_ALG_INIT;
+ UINT4 totalSz = 0;
+ Error err;
+ UINT4 i;
+ int ctxIdx = 0;
+ int ivIdx = 0;
+ int offset = 0;
+ int sz = 0;
+ int ivSz = 32; /* size of B0 | CTR0 for CCM mode */
+ int ctxSz = desc->ctxSz;
+ int align = 16; /* input should be multiples of 16 bytes unless is final */
+ int opIdx;
+
+ if (desc->state != CAAM_ENC && desc->state != CAAM_DEC) {
+ return IllegalStatusNumber;
+ }
+
+    /* sanity check that ctxSz is a valid AES key size */
+ if (ctxSz != 16 && ctxSz != 24 && ctxSz != 32) {
+ return ArgumentError;
+ }
+
+ /* get key */
+ for (i = 0; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)desc->ctxBuf;
+
+ if (sz < ctxSz && sz < (MAX_CTX * sizeof(UINT4))) {
+ ctx[ctxIdx] = buf;
+ sz += buf->dataSz;
+
+ memcpy((unsigned char*)&local[offset],
+ (unsigned char*)ctx[ctxIdx]->data, ctx[ctxIdx]->dataSz);
+ offset += ctx[ctxIdx]->dataSz;
+ ctxIdx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* sanity checks on size of key */
+ if (sz > ctxSz) {
+ return SizeIsTooLarge;
+ }
+
+    /* Flush cache of ctx buffer, then:
+       add KEY load command         0x0220000X
+       add address to read key from 0xXXXXXXXX */
+ ASP_FlushCaches((Address)desc->ctxBuf, ctxSz);
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = (CAAM_KEY | CAAM_CLASS1 | CAAM_NWB) + ctxSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->ctxBuf);
+
+ desc->headIdx = desc->idx;
+ desc->output = 0;
+ offset = 0; /* store left over amount for output buffer */
+ do {
+ desc->idx = desc->headIdx; /* reset for each loop */
+
+ /* write operation */
+ if (desc->idx + 1 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ opIdx = desc->idx;
+ desc->desc[desc->idx++] = CAAM_OP | CAAM_CLASS1 | state | desc->type |
+ desc->state;
+
+ /* get IV if needed by algorithm */
+ switch (desc->type) {
+ case CAAM_AESCCM:
+ if ((state & CAAM_ALG_INIT) == CAAM_ALG_INIT) {
+ sz = 0;
+ offset = 0;
+ for (; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)desc->iv;
+
+ if (sz < ivSz) {
+ iv[ivIdx] = buf;
+
+ if (buf->dataSz + sz > ivSz) {
+ return SizeIsTooLarge;
+ }
+
+ sz += buf->dataSz;
+ memcpy((unsigned char*)&local[offset],
+ (unsigned char*)iv[ivIdx]->data, iv[ivIdx]->dataSz);
+ offset += iv[ivIdx]->dataSz;
+ ivIdx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ if (sz != ivSz) {
+ /* invalid IV size */
+ return SizeIsTooLarge;
+ }
+ offset = 0;
+ }
+
+ ASP_FlushCaches((Address)desc->iv, ivSz);
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = (CAAM_LOAD_CTX | CAAM_CLASS1 | ofst)
+ + ivSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->iv);
+ break;
+
+ default:
+ return OperationNotImplemented;
+ }
+
+
+        /********* handle AAD -- only done with Init **************************/
+        if ((state & CAAM_ALG_INIT) == CAAM_ALG_INIT) {
+            if ((desc->type == CAAM_AESCCM) && (desc->aadSz > 0)) {
+                /* set formatted AAD buffer size for CCM */
+                ASP_FlushCaches((Address)desc->aadSzBuf, sizeof(desc->aadSzBuf));
+                if (desc->idx + 2 > MAX_DESC_SZ) {
+                    return TransferFailed;
+                }
+                desc->desc[desc->idx++] = CAAM_FIFO_L | CAAM_CLASS1 |
+                    FIFOL_TYPE_AAD + desc->aadSz;
+                desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->aadSzBuf);
+
+ /* now set aadSz to unformatted version for getting buffers */
+ if (desc->aadSz == 2) {
+ unsigned char* pt = (unsigned char*)desc->aadSzBuf;
+ desc->aadSz = (((UINT4)pt[0] & 0xFF) << 8) |
+ ((UINT4)pt[1] & 0xFF);
+ }
+ else {
+ unsigned char* pt = (unsigned char*)desc->aadSzBuf;
+ desc->aadSz = (((UINT4)pt[2] & 0xFF) << 24) |
+ (((UINT4)pt[3] & 0xFF) << 16) |
+ (((UINT4)pt[4] & 0xFF) << 8) |
+ ((UINT4)pt[5] & 0xFF);
+ }
+ }
+
+ /* get additional data buffers */
+ if (desc->aadSz > 0) {
+ sz = 0;
+ for (; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ if (sz < desc->aadSz) {
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->lastFifo = desc->idx;
+ desc->desc[desc->idx++] = CAAM_FIFO_L | CAAM_CLASS1 |
+ FIFOL_TYPE_AAD + buf->dataSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(buf->data);
+ sz += buf->dataSz;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* flush AAD from FIFO and pad it to 16 byte block */
+ desc->desc[desc->lastFifo] |= FIFOL_TYPE_FC1;
+ }
+
+ /* find output buffers */
+ if (caamAesOutSz(desc, i) != Success) {
+ return SizeIsTooLarge;
+ }
+ }
+
+ /* handle alignment constraints on input */
+ if ((sz = caamAesInput(desc, &i, align, totalSz)) < 0) {
+ return TransferFailed;
+ }
+ totalSz += sz;
+
+ /* handle output buffers */
+ if (caamAesOutput(desc, &offset, sz) != Success) {
+ return TransferFailed;
+ }
+
+        /* store updated IV; if this is the last block, set offset and final
+         * for MAC */
+ if ((desc->lastIdx == desc->DescriptorCount) && (offset == 0)) {
+ ivSz = 16;
+ if (desc->state == CAAM_ENC) {
+ ofst = 32 << 8; /* offset is in 15-8 bits */
+ }
+ else {
+ ofst = 0;
+ }
+ desc->desc[opIdx] |= CAAM_ALG_FINAL;
+ }
+ else {
+ /* if not final then store and use ctr and encrypted ctr from
+ context dword 2,3 and 4,5. Also store MAC and AAD info from
+ context dword 6. */
+ ivSz = 56;
+ ofst = 0;
+ }
+
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = CAAM_STORE_CTX | CAAM_CLASS1 | ofst | ivSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical((Address)desc->iv);
+
+ if ((err = caamDoJob(desc)) != Success) {
+ return err;
+ }
+ state = CAAM_ALG_UPDATE;
+ } while (desc->lastIdx < desc->DescriptorCount || offset > 0);
+
+ /* flush output buffers */
+ for (i = desc->outputIdx; i < desc->lastIdx; i++) {
+ ASP_FlushCaches(desc->buf[i].data, desc->buf[i].dataSz);
+ }
+
+ /* handle case with IV (This is also the output of MAC with AES-CCM) */
+ if (ivIdx > 0) {
+ unsigned char* pt = (unsigned char*)desc->iv;
+ ASP_FlushCaches((Address)pt, ivSz);
+ for (i = 0; i < ivIdx; i++) {
+ memcpy((unsigned char*)iv[i]->data, pt, iv[i]->dataSz);
+ pt += iv[i]->dataSz;
+ ASP_FlushCaches(iv[i]->data, iv[i]->dataSz);
+ }
+ }
+
+ return Success;
+}
+
+
+/******************************************************************************
+ CAAM SHA Operations
+ ****************************************************************************/
+static int shaSize(struct DescStruct* desc)
+{
+ /* sanity check on dataSz for context */
+ switch (desc->type) {
+ case CAAM_MD5:
+ return CAAM_MD5_CTXSZ;
+
+ case CAAM_SHA:
+ return CAAM_SHA_CTXSZ;
+
+ case CAAM_SHA224:
+ return CAAM_SHA224_CTXSZ;
+
+ case CAAM_SHA256:
+ return CAAM_SHA256_CTXSZ;
+
+ case CAAM_SHA384:
+ return CAAM_SHA384_CTXSZ;
+
+ case CAAM_SHA512:
+ return CAAM_SHA512_CTXSZ;
+
+ default:
+ return 0;
+ }
+}
+
+/* SHA operations
+ * start: the index to start traversing through buffers. It's needed to allow
+ * for HMAC to reuse this code.
+ *
+ * return Success on success. All other return values are considered a fail
+ * case.
+ */
+static Error caamSha(struct DescStruct* desc, int start)
+{
+ struct buffer* ctx[3];
+ Error err;
+ UINT4 i;
+ int sz = 0;
+ int ctxIdx = 0;
+ int offset = 0;
+
+ int ctxSz = shaSize(desc);
+
+ /* get context */
+ for (i = start; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)desc->iv;
+
+ if (sz < ctxSz && sz < (MAX_CTX * sizeof(UINT4))) {
+ ctx[ctxIdx] = buf;
+ sz += buf->dataSz;
+
+ if (ctx[ctxIdx]->dataSz + offset > (MAX_CTX * sizeof(UINT4))) {
+ return SizeIsTooLarge;
+ }
+ memcpy((unsigned char*)&local[offset], (unsigned char*)ctx[ctxIdx]->data,
+ ctx[ctxIdx]->dataSz);
+ offset += ctx[ctxIdx]->dataSz;
+ ctxIdx++;
+ }
+ else {
+ break;
+ }
+ }
+ if (sz > ctxSz || ctxSz > (MAX_CTX * sizeof(UINT4))) {
+ return SizeIsTooLarge;
+ }
+
+ ASP_FlushCaches((Address)desc->iv, ctxSz);
+    /* Manage context (current digest + 8-byte running message length) */
+    if ((desc->state & CAAM_ALG_INIT) != CAAM_ALG_INIT) {
+        /* don't load into the class 2 context register on init.
+           Loading it in was found to cause the context to not get set. */
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = (CAAM_LOAD_CTX | CAAM_CLASS2) + ctxSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical((Address)desc->iv);
+ }
+
+    /* add operation command */
+    if (desc->idx + 1 > MAX_DESC_SZ) {
+        return TransferFailed;
+    }
+    desc->desc[desc->idx++] = CAAM_OP | CAAM_CLASS2 | desc->state |
+                              desc->type;
+
+ /* Check case where there is no input.
+ In all cases the FIFO Load should be flushed. */
+ if (i == desc->DescriptorCount) {
+ desc->lastFifo = desc->idx;
+ if (desc->idx + 1 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+ desc->desc[desc->idx++] = CAAM_FIFO_L | CAAM_CLASS2 |
+ FIFOL_TYPE_MSG | CAAM_IMM;
+ }
+
+ /* save index for looping over input */
+ desc->headIdx = desc->idx;
+ do {
+ desc->idx = desc->headIdx; /* reset for each loop */
+ if (i < desc->DescriptorCount) {
+ /* input must be a multiple of 64 bytes unless in final call */
+ if (((desc->state & CAAM_ALG_FINAL) == CAAM_ALG_FINAL)) {
+ if (caamAddIO(desc, (CAAM_FIFO_L | CAAM_CLASS2 |
+ FIFOL_TYPE_MSG), 0, 1, &i) < 0) {
+ return TooManyBuffers;
+ }
+ }
+ else {
+ if (caamAddIO(desc, (CAAM_FIFO_L | CAAM_CLASS2 |
+ FIFOL_TYPE_MSG), 0, 64, &i) < 0) {
+ return TooManyBuffers;
+ }
+ }
+ }
+
+ desc->desc[desc->lastFifo] |= FIFOL_TYPE_LC2;
+
+ /* set context out */
+ if (desc->idx + 2 > MAX_DESC_SZ) {
+ return TransferFailed;
+ }
+        desc->desc[desc->idx++] = (CAAM_STORE_CTX | CAAM_CLASS2) + ctxSz;
+ desc->desc[desc->idx++] = BSP_VirtualToPhysical(desc->iv);
+
+ if ((err = caamDoJob(desc)) != Success) {
+ return err;
+ }
+ /* flush context output for each loop */
+ ASP_FlushCaches((Address)desc->iv, ctxSz);
+ } while (i < desc->DescriptorCount);
+
+ /* store context to buffers */
+ {
+ unsigned char* pt = (unsigned char*)desc->iv;
+ for (i = 0; i < ctxIdx; i++) {
+ memcpy((unsigned char*)ctx[i]->data, pt, ctx[i]->dataSz);
+ pt += ctx[i]->dataSz;
+ ASP_FlushCaches(ctx[i]->data, ctx[i]->dataSz);
+ }
+ }
+
+ return Success;
+}
+
+
+/******************************************************************************
+ CAAM TRNG Operations
+ ****************************************************************************/
+
+/* If Entropy is not ready then return Waiting */
+static Error caamRng(struct DescStruct* desc)
+{
+ int sz = 0;
+ int i;
+
+ Address reg; /* RTENT reg to read */
+ int ofst = sizeof(UINT4);
+
+
+ /* Check ENT_VAL bit to make sure entropy is ready */
+ if ((CAAM_READ(CAAM_RTMCTL) & CAAM_ENTVAL) !=
+ CAAM_ENTVAL) {
+ return Waiting;
+ }
+
+ /* check state of TRNG */
+ if ((CAAM_READ(CAAM_RTSTATUS) & 0x0000FFFF) > 0) {
+ return Failure;
+ }
+
+ /* read entropy from RTENT registers */
+ reg = CAAM_RTENT0;
+
+ for (i = 0; i < desc->DescriptorCount; i++) {
+ struct buffer* buf = &desc->buf[i];
+ unsigned char* local = (unsigned char*)buf->data;
+ sz = buf->dataSz;
+
+ while (sz > 3 && reg <= CAAM_RTENT11) {
+ *((UINT4*)local) = CAAM_READ(reg);
+ reg += ofst;
+ local += ofst;
+ sz -= ofst;
+ }
+
+ if (reg > CAAM_RTENT11 && sz > 0) {
+ return SizeIsTooLarge;
+ }
+
+        /* handle any remaining bytes smaller than a word32 */
+ if (sz > 0) {
+ UINT4 tmp = CAAM_READ(reg);
+ memcpy(local, (unsigned char*)&tmp, sz);
+ }
+
+ ASP_FlushCaches(buf->data, buf->dataSz);
+ }
+
+
+ /* read RTENT11 to trigger new entropy generation */
+ if (reg != CAAM_RTENT11) {
+ CAAM_READ(CAAM_RTENT11);
+ }
+
+ return Success;
+}
+
+
+/******************************************************************************
+ IODevice Start, Transfer and Finish Buffer
+ ****************************************************************************/
+/* args[0] holds the state such as encrypt/decrypt or init/update/final
+ * args[1] holds the ctx/key size
+ * args[2] holds the input size
+ * args[3] dependent on algo (such as AAD size with AES-CCM) */
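+/* For example (a hedged sketch; the values follow the checks below), an
+ * AES-CBC encrypt of 32 bytes with a 16-byte key would be started with:
+ *   args[0] = CAAM_ENC;  args[1] = 16;  args[2] = 32;  args[3] = 0;
+ */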
+static Error caamTransferStart(IODeviceVector ioCaam,
+ Value type, const volatile Value args[4])
+{
+ struct CAAM_DEVICE* local = (struct CAAM_DEVICE*)ioCaam;
+ struct DescStruct* desc;
+
+ /* currently only one desc is available for use */
+ desc = &local->DescArray[0];
+
+ /* check if the desc is idle before using */
+ if (GetIORequestStatus((IORequest)desc) != IdleIORequest) {
+ return ResourceNotAvailable;
+ }
+
+ desc->idx = 0;
+ desc->output = 0;
+ desc->ctxOut = 0;
+ desc->outputIdx = 0;
+ desc->alignIdx = 0;
+ desc->lastFifo = 0;
+ desc->state = args[0];
+ desc->ctxSz = args[1];
+ desc->inputSz = args[2];
+ desc->aadSz = 0;
+    desc->desc[desc->idx++] = CAAM_HEAD; /* size is set in the header later */
+
+ switch (type) {
+ case CAAM_AESECB:
+ case CAAM_AESCBC:
+ if (desc->inputSz % 16 != 0) {
+ return ArgumentError;
+ }
+ /* fall through to break */
+ case CAAM_AESCTR:
+ break;
+
+ case CAAM_AESCCM:
+ memset((unsigned char*)desc->aadSzBuf, 0, sizeof(desc->aadSzBuf));
+ if (args[3] > 0) {
+ /* encode the length in */
+ if (args[3] <= 0xFEFF) {
+ unsigned char* pt = (unsigned char*)desc->aadSzBuf;
+ desc->aadSz = 2;
+ pt[0] = ((args[3] & 0xFF00) >> 8);
+ pt[1] = (args[3] & 0x00FF);
+ }
+ else if (args[3] <= 0xFFFFFFFF) {
+ unsigned char* pt = (unsigned char*)desc->aadSzBuf;
+ desc->aadSz = 6;
+ pt[0] = 0xFF; pt[1] = 0xFE;
+ pt[2] = ((args[3] & 0xFF000000) >> 24);
+ pt[3] = ((args[3] & 0x00FF0000) >> 16);
+ pt[4] = ((args[3] & 0x0000FF00) >> 8);
+ pt[5] = (args[3] & 0x000000FF);
+ }
+ }
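+            /* e.g. (sketch): args[3] = 0x12345 (> 0xFEFF) is encoded as the
+             * six bytes FF FE 00 01 23 45 per the CCM AAD length format */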
+ break;
+
+ case CAAM_MD5:
+ case CAAM_SHA:
+ case CAAM_SHA224:
+ case CAAM_SHA256:
+ case CAAM_SHA384:
+ case CAAM_SHA512:
+ break;
+
+ case CAAM_BLOB_ENCAP:
+ case CAAM_BLOB_DECAP:
+ break;
+
+ case CAAM_ENTROPY:
+ break;
+
+ default:
+ /* unknown type */
+ return UsageNotSupported;
+ }
+
+ desc->DescriptorCount = 0;
+ desc->type = type;
+ desc->running = true;
+ StartIORequest((IORequest)desc);
+
+ /* For now only require READ permissions */
+ SetIORequestBufferPermissions((IORequest)desc, MEMORY_READ);
+ return Success;
+}
+
+
+static Error caamTransferBuffer(IODeviceVector TheIODeviceVector,
+ IORequest req, IODescriptor NewIODescriptor,
+ Address data, Address dataSz)
+{
+ struct DescStruct* desc = (struct DescStruct*)req;
+ Error err;
+
+ switch (desc->type) {
+ case CAAM_AESECB:
+ case CAAM_AESCTR:
+ case CAAM_AESCBC:
+ case CAAM_AESCCM:
+
+ case CAAM_MD5:
+ case CAAM_SHA:
+ case CAAM_SHA224:
+ case CAAM_SHA256:
+ case CAAM_SHA384:
+ case CAAM_SHA512:
+
+ case CAAM_BLOB_ENCAP:
+ case CAAM_BLOB_DECAP:
+ case CAAM_ENTROPY:
+ { /* set buffer for transfer finish */
+ struct buffer* buf;
+ if (desc->DescriptorCount >= MAX_BUF) {
+ return TooManyBuffers;
+ }
+ buf = &desc->buf[desc->DescriptorCount];
+ buf->data = data;
+ buf->dataSz = dataSz;
+ }
+ err = Success;
+ break;
+
+ default:
+ err = UsageNotSupported;
+ }
+
+ if (err != Success) {
+ desc->running = false;
+ DismissIORequest(req);
+ return err;
+ }
+
+ desc->DescriptorCount++;
+ return Success;
+}
+
+
+static Error caamTransferFinish(IODeviceVector ioCaam, IORequest req)
+{
+ struct DescStruct* desc = (struct DescStruct*)req;
+ Error ret;
+
+ /* construct desc */
+ switch (desc->type) {
+ case CAAM_AESECB:
+ case CAAM_AESCTR:
+ case CAAM_AESCBC:
+ ret = caamAes(desc);
+ break;
+
+ case CAAM_AESCCM:
+ ret = caamAead(desc);
+ break;
+
+ case CAAM_MD5:
+ case CAAM_SHA:
+ case CAAM_SHA224:
+ case CAAM_SHA256:
+ case CAAM_SHA384:
+ case CAAM_SHA512:
+ ret = caamSha(desc, 0);
+ break;
+
+ case CAAM_ENTROPY:
+ ret = caamRng(desc);
+ break;
+
+ case CAAM_BLOB_ENCAP:
+ case CAAM_BLOB_DECAP:
+ ret = caamBlob(desc);
+ break;
+
+ default:
+ ret = UsageNotSupported;
+ }
+
+ desc->running = false;
+ DismissIORequest(req);
+ return ret;
+}
+
+
+/******************************************************************************
+ IODevice Interrupt and Init
+ ****************************************************************************/
+
+static Error caamTransferWrite(IODeviceVector ioCaam,
+ IORequest req, Value dataSz, const volatile Value *data)
+{
+ DismissIORequest(req);
+ return UsageNotSupported;
+}
+
+
+static void caamTransferAbort(IODeviceVector ioCaam, IORequest req)
+{
+ DismissIORequest(req);
+}
+
+
+static void caamTransferRecall(IODeviceVector ioCaam, IODescriptor req)
+{
+
+}
+
+
+static void HandleInterrupt(Address id)
+{
+ struct CAAM_DEVICE* local = (struct CAAM_DEVICE*)id;
+ Value InterruptStatus = INTERRUPT_AtomicWrite(&local->InterruptStatus, 0);
+ int i;
+
+ /* Loop through descriptors and try to dismiss them */
+ for (i = 0; i < DESC_COUNT; i++) {
+ struct DescStruct* desc = &local->DescArray[i];
+ if (InterruptStatus & (1 << i)) {
+ desc->running = false;
+ if (GetIORequestStatus((IORequest)desc) == IORequestSuspended) {
+ ContinueIORequest((IORequest)desc);
+ }
+ else {
+ DismissIORequest((IORequest)desc);
+ }
+ }
+ }
+}
+
+
+static Error caamCreate(IODeviceVector ioCaam)
+{
+ return Success;
+}
+
+
+void InitCAAM(void)
+{
+ /* get IO vector and set it up */
+ IODeviceVector ioCaam = &caam.caamVector;
+ unsigned int reg;
+ int i;
+ Error ret;
+
+
+ ioCaam->Create = &caamCreate;
+ ioCaam->ReadRegister = &caamReadRegister;
+ ioCaam->WriteRegister = &caamWriteRegister;
+
+ ioCaam->TransferStart = &caamTransferStart;
+ ioCaam->TransferBuffer = &caamTransferBuffer;
+ ioCaam->TransferWrite = &caamTransferWrite;
+ ioCaam->TransferFinish = &caamTransferFinish;
+ ioCaam->TransferAbort = &caamTransferAbort;
+ ioCaam->TransferRecall = &caamTransferRecall;
+#ifdef HARDWARE_CACHE_COHERENCY
+ ioCaam->IOSynchronizationNotRequired = 1;
+#endif
+
+ RegisterIODeviceVector(ioCaam, DRIVER_NAME);
+ RequestIOTerminationTask(ioCaam, 10);
+
+    /* Initialize IO descriptors */
+ for (i = 0; i < BUFFER_COUNT; i++) {
+ InitializeIODescriptor(ioCaam, &caam.IODescriptorArray[i]);
+ }
+
+ /* Initialize Descriptors */
+ for (i = 0; i < DESC_COUNT; i++) {
+ InitializeIORequest(ioCaam, &caam.DescArray[i].TheIORequest,
+ IOREQUEST_STANDARD);
+ caam.DescArray[i].running = false;
+ caam.DescArray[i].caam = &caam;
+ }
+
+
+ /* call interrupt to make IORequests available */
+ caam.InterruptStatus = 0;
+ INTERRUPT_InitCall(&caam.HandleInterruptCall,
+ &HandleInterrupt, "Start up CAAM IORequest");
+
+ /* set clock speed for CAAM. Setting it here to allow for restricting
+ access */
+ #define REGS_CCM_BASE (0xf20c4000)
+ #define HW_CCM_CCGR0_ADDR (0xf20c4068)
+ #define CG(x) (3 << (x*2))
+
+ reg = CG(6) | CG(5) | CG(4);
+ *(volatile unsigned int*)HW_CCM_CCGR0_ADDR =
+ *(volatile unsigned int*)HW_CCM_CCGR0_ADDR | reg;
+
+ /* set up job ring */
+
+ /* @TODO create partition in physical memory for job rings
+ current partition security is set to the default */
+ for (i = 1; i < CAAM_PAGE_MAX; i++) {
+ ret = caamCreatePartition(i, i);
+ if (ret == 0) {
+ break;
+ }
+
+ if (ret != MemoryMapMayNotBeEmpty) {
+ INTERRUPT_Panic();
+ }
+ }
+
+ if (ret != 0) {
+ INTERRUPT_Panic();
+ }
+
+ caam.ring.page = i;
+ caam.ring.JobIn = (CAAM_PAGE + (i << 12));
+ caam.ring.JobOut = caam.ring.JobIn + 16;
+ caam.ring.Desc = caam.ring.JobOut + 16;
+
+ /* set physical address of job rings */
+ CAAM_WRITE(CAAM_IRBAR0, caam.ring.JobIn ^ 0xF0000000);
+ CAAM_WRITE(CAAM_ORBAR0, caam.ring.JobOut ^ 0xF0000000);
+
+ /* Initialize job ring sizes to 1 */
+ CAAM_WRITE(CAAM_IRSR0, 1);
+ CAAM_WRITE(CAAM_ORSR0, 1);
+
+ /* set DECO watchdog to time out and flush jobs that cause the DECO to hang */
+ CAAM_WRITE((CAAM_BASE | 0x0004), CAAM_READ(CAAM_BASE | 0x0004) | 0x40000000);
+
+ /* start up RNG if not already started */
+ if (caamInitRng(&caam) != 0) {
+ INTERRUPT_Panic();
+ }
+}
+
+void (*__ghsentry_bspuserinit_InitCAAM)(void) = &InitCAAM;
+
+#endif /* INTEGRITY */
diff --git a/wolfcrypt/src/port/caam/caam_init.c b/wolfcrypt/src/port/caam/caam_init.c
new file mode 100644
index 0000000..014341c
--- /dev/null
+++ b/wolfcrypt/src/port/caam/caam_init.c
@@ -0,0 +1,289 @@
+/* caam_init.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_IMX6_CAAM) || defined(WOLFSSL_IMX6_CAAM_RNG) || \
+ defined(WOLFSSL_IMX6_CAAM_BLOB)
+
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/port/caam/wolfcaam.h>
+
+#define WC_CAAM_BLOB_SZ 48
+
+#ifndef WC_CAAM_PASSWORD
+ #define WC_CAAM_PASSWORD "!systempassword"
+#endif
+
+#if defined(__INTEGRITY) || defined(INTEGRITY)
+ #include <INTEGRITY.h>
+ #include <wolfssl/wolfcrypt/port/caam/caam_driver.h>
+ static IODevice caam = NULLIODevice;
+#endif
+
+#if defined(WOLFSSL_CAAM_PRINT) || defined(WOLFSSL_CAAM_DEBUG)
+#include <stdio.h>
+#include <wolfssl/version.h>
+
+static void wc_caamBanner(void)
+{
+ printf("********* wolfSSL Version %s : Printing Out CAAM Information ********\n",
+ LIBWOLFSSL_VERSION_STRING);
+ printf("CAAM Status [0x%8.8x] = 0x%8.8x\n",
+ CAAM_STATUS, WC_CAAM_READ(CAAM_STATUS));
+ printf("CAAM Version MS Register [0x%8.8x] = 0x%8.8x\n",
+ CAAM_VERSION_MS, WC_CAAM_READ(CAAM_VERSION_MS));
+ printf("CAAM Version LS Register [0x%8.8x] = 0x%8.8x\n",
+ CAAM_VERSION_LS, WC_CAAM_READ(CAAM_VERSION_LS));
+ printf("CAAM Support MS Register [0x%8.8x] = 0x%8.8x\n",
+ CAMM_SUPPORT_MS, WC_CAAM_READ(CAMM_SUPPORT_MS));
+ printf("CAAM Support LS [0x%8.8x] = 0x%8.8x\n",
+ CAMM_SUPPORT_LS, WC_CAAM_READ(CAMM_SUPPORT_LS));
+ printf("********************************************************************\n\n");
+}
+#endif
+
+
+/* Allow runtime setting for CAAM IODevice in case user wants to use password
+ * at run time.
+ *
+ * returns 0 on success
+ *
+ * NOTE this is how IODevice is defined in INTEGRITY "typedef struct
+ * IODeviceStruct *IODevice;"
+ */
+int wc_caamSetResource(IODevice ioDev)
+{
+ WOLFSSL_MSG("Setting CAAM driver");
+ caam = ioDev;
+
+ return 0;
+}
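+/* Example use (a minimal sketch; the resource name and password follow
+ * wc_caamInit below and are shown only for illustration):
+ *
+ *   IODevice dev;
+ *   if (RequestResource((Object*)&dev, "wolfSSL_CAAM_Driver",
+ *                       WC_CAAM_PASSWORD) == Success) {
+ *       wc_caamSetResource(dev);
+ *   }
+ */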
+
+/* Check hardware support
+ *
+ * returns 0 on success
+ */
+int wc_caamInit(void)
+{
+ int ret;
+ word32 reg;
+
+ /* get the driver up */
+ if (caam == NULLIODevice) {
+ WOLFSSL_MSG("Starting CAAM driver");
+ if ((ret = (int)RequestResource((Object *)&caam, "wolfSSL_CAAM_Driver",
+ WC_CAAM_PASSWORD)) != (int)Success) {
+ WOLFSSL_MSG("Unable to get the CAAM IODevice, check password?");
+ WOLFSSL_LEAVE("wc_caamInit: error from driver = ", ret);
+ ret = 0; /* not a hard failure because user can set resource */
+ }
+ }
+
+#if defined(WOLFSSL_CAAM_PRINT) || defined(WOLFSSL_CAAM_DEBUG)
+ /* print out CAAM version/info and wolfSSL version */
+ wc_caamBanner();
+#endif
+
+    /* check for implemented modules
+     * bits 0-3 AES, 4-7 DES, 12-15 hashing, 16-19 RNG */
+ reg = WC_CAAM_READ(CAMM_SUPPORT_LS);
+
+ #ifndef WC_NO_RNG
+ if (((reg & 0x000F0000) >> 16) > 0) {
+ WOLFSSL_MSG("Found CAAM RNG hardware module");
+ if ((WC_CAAM_READ(CAAM_RTMCTL) & 0x40000001) != 0x40000001) {
+ WOLFSSL_MSG("Error CAAM RNG has not been set up");
+ }
+ }
+ #endif
+
+ #ifndef NO_SHA256
+ if ((reg & 0x0000F000) > 0) {
+ WOLFSSL_MSG("Found CAAM MDHA module");
+ }
+ else {
+ WOLFSSL_MSG("Hashing not supported by CAAM");
+ return WC_HW_E;
+ }
+ #endif
+
+ #ifndef NO_AES
+ if ((reg & 0x0000000F) > 0) {
+ WOLFSSL_MSG("Found CAAM AES module");
+ }
+ else {
+ WOLFSSL_MSG("AES not supported by CAAM");
+ return WC_HW_E;
+ }
+ #endif
+
+ (void)ret;
+ return 0;
+}
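+/* Start-up sketch: wc_caamInit is expected to be called once before any
+ * CAAM-backed operations, e.g.
+ *
+ *   if (wc_caamInit() != 0) {
+ *       // CAAM hardware missing or unsupported
+ *   }
+ */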
+
+
+int wc_caamFree(void)
+{
+ return 0;
+}
+
+
+word32 wc_caamReadRegister(word32 reg)
+{
+ Value out = 0;
+
+ if (caam == NULLIODevice) {
+ WOLFSSL_MSG("Error CAAM IODevice not found! Bad password?");
+ return 0;
+ }
+
+ if (ReadIODeviceRegister(caam, reg, &out) != Success) {
+ WOLFSSL_MSG("Error reading register\n");
+ }
+
+ return (word32)out;
+}
+
+void wc_caamWriteRegister(word32 reg, word32 value)
+{
+ if (caam == NULLIODevice) {
+ WOLFSSL_MSG("Error CAAM IODevice not found! Bad password?");
+ return;
+ }
+
+ if (WriteIODeviceRegister(caam, reg, value) != Success) {
+ WOLFSSL_MSG("Error writing to register\n");
+ }
+}
+
+
+/* return 0 on success and WC_HW_E on failure. Can also return WC_HW_WAIT_E
+ * in the case that the driver is waiting for a resource or RAN_BLOCK_E if
+ * waiting for entropy. */
+int wc_caamAddAndWait(Buffer* buf, word32 arg[4], word32 type)
+{
+ int ret;
+ if (caam == NULLIODevice) {
+ WOLFSSL_MSG("Error CAAM IODevice not found! Bad password?");
+ return WC_HW_E;
+ }
+
+ if ((ret = SynchronousSendIORequest(caam, type, (const Value*)arg, buf))
+ != Success) {
+ #if defined(WOLFSSL_CAAM_PRINT) || defined(WOLFSSL_CAAM_DEBUG)
+ printf("ret of SynchronousSendIORequest = %d type = %d\n", ret, type);
+ #endif
+
+ /* if waiting for resource or RNG return waiting */
+ if (ret == Waiting) {
+ WOLFSSL_MSG("Waiting on entropy from driver");
+ return RAN_BLOCK_E;
+ }
+
+ if (ret == ResourceNotAvailable) {
+ WOLFSSL_MSG("Waiting on CAAM driver");
+ return WC_HW_WAIT_E;
+ }
+
+ return WC_HW_E;
+ }
+
+ (void)ret;
+ return 0;
+}
+
+
+int wc_caamCreateBlob(byte* data, word32 dataSz, byte* out, word32* outSz)
+{
+ Buffer in[3];
+ word32 arg[4];
+ int ret;
+ word32 local[2] = {0,0};
+
+ if (data == NULL || out == NULL || outSz == NULL ||
+ *outSz < dataSz + WC_CAAM_BLOB_SZ) {
+ return BAD_FUNC_ARG;
+ }
+
+ in[0].BufferType = DataBuffer;
+ in[0].TheAddress = (Address)local;
+ in[0].Length = sizeof(local);
+
+ in[1].BufferType = DataBuffer;
+ in[1].TheAddress = (Address)data;
+ in[1].Length = dataSz;
+
+ in[2].BufferType = DataBuffer | LastBuffer;
+ in[2].TheAddress = (Address)out;
+ in[2].Length = dataSz + WC_CAAM_BLOB_SZ;
+
+ arg[2] = dataSz;
+
+ if ((ret = wc_caamAddAndWait(in, arg, CAAM_BLOB_ENCAP)) != 0) {
+ WOLFSSL_MSG("Error with CAAM blob create");
+ return ret;
+ }
+
+ *outSz = dataSz + WC_CAAM_BLOB_SZ;
+ return 0;
+}
+
+
+int wc_caamOpenBlob(byte* data, word32 dataSz, byte* out, word32* outSz)
+{
+ Buffer in[3];
+ word32 arg[4];
+ int ret;
+ word32 local[2] = {0,0};
+
+ if (data == NULL || out == NULL || outSz == NULL ||
+ *outSz < dataSz - WC_CAAM_BLOB_SZ) {
+ return BAD_FUNC_ARG;
+ }
+
+ in[0].BufferType = DataBuffer;
+ in[0].TheAddress = (Address)local;
+ in[0].Length = sizeof(local);
+
+    in[1].BufferType = DataBuffer;
+    in[1].TheAddress = (Address)data;
+    in[1].Length = dataSz;
+
+    in[2].BufferType = DataBuffer | LastBuffer;
+    in[2].TheAddress = (Address)out;
+    in[2].Length = dataSz - WC_CAAM_BLOB_SZ;
+
+ arg[2] = dataSz;
+
+ if ((ret = wc_caamAddAndWait(in, arg, CAAM_BLOB_DECAP)) != 0) {
+        WOLFSSL_MSG("Error with CAAM blob open");
+ return ret;
+ }
+
+ *outSz = dataSz - WC_CAAM_BLOB_SZ;
+ return 0;
+}
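+/* Round-trip sketch (buffer sizes are illustrative; a blob is the data plus
+ * WC_CAAM_BLOB_SZ bytes of overhead):
+ *
+ *   byte secret[16], blob[16 + WC_CAAM_BLOB_SZ], opened[16];
+ *   word32 blobSz = sizeof(blob), openedSz = sizeof(opened);
+ *   if (wc_caamCreateBlob(secret, sizeof(secret), blob, &blobSz) == 0) {
+ *       wc_caamOpenBlob(blob, blobSz, opened, &openedSz);
+ *   }
+ */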
+
+#endif /* WOLFSSL_IMX6_CAAM */
+
diff --git a/wolfcrypt/src/port/caam/caam_sha.c b/wolfcrypt/src/port/caam/caam_sha.c
new file mode 100644
index 0000000..74d62fb
--- /dev/null
+++ b/wolfcrypt/src/port/caam/caam_sha.c
@@ -0,0 +1,397 @@
+/* caam_sha.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
+
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+
+#include <INTEGRITY.h>
+#include <wolfssl/wolfcrypt/port/caam/caam_driver.h>
+#include <wolfssl/wolfcrypt/port/caam/wolfcaam.h>
+
+#if defined(WOLFSSL_CAAM_DEBUG) || defined(WOLFSSL_CAAM_PRINT)
+#include <stdio.h>
+#endif
+
+#ifndef NO_SHA
+#include <wolfssl/wolfcrypt/sha.h>
+#endif
+
+#if !defined(NO_SHA256) || defined(WOLFSSL_SHA224)
+#include <wolfssl/wolfcrypt/sha256.h>
+#endif
+
+#if defined(WOLFSSL_SHA384) || defined(WOLFSSL_SHA512)
+#include <wolfssl/wolfcrypt/sha512.h>
+#endif
+
+#ifndef NO_MD5
+#include <wolfssl/wolfcrypt/md5.h>
+#endif
+
+/******************************************************************************
+ Common Code Between SHA Functions
+ ****************************************************************************/
+
+static int _InitSha(wc_Sha* sha, void* heap, int devId, word32 digestSz,
+ word32 type)
+{
+ Buffer buf[1];
+ word32 arg[4];
+ int ret;
+
+ (void)heap;
+ (void)devId;
+
+ if (sha == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+    XMEMSET(sha, 0, sizeof(wc_Sha));
+
+ /* Set buffer for context */
+ buf[0].BufferType = DataBuffer | LastBuffer;
+ buf[0].TheAddress = (Address)sha->ctx;
+ buf[0].Length = digestSz + WC_CAAM_CTXLEN;
+ buf[0].Transferred = 0;
+
+ arg[0] = CAAM_ALG_INIT;
+ arg[1] = digestSz + WC_CAAM_CTXLEN;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, type)) != 0) {
+ WOLFSSL_MSG("Error with CAAM SHA init");
+ return ret;
+ }
+
+ return 0;
+}
+
+
+static int _ShaUpdate(wc_Sha* sha, const byte* data, word32 len, word32 digestSz,
+ word32 type)
+{
+ Buffer buf[2];
+ word32 arg[4];
+ int ret;
+ byte* local;
+
+    if (sha == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (len == 0) return 0; /* nothing to do */
+
+ local = (byte*)sha->buffer;
+ /* check for filling out existing buffer */
+ if (sha->buffLen > 0) {
+ word32 add = min(len, WC_CAAM_HASH_BLOCK - sha->buffLen);
+ XMEMCPY(&local[sha->buffLen], data, add);
+
+ sha->buffLen += add;
+ data += add;
+ len -= add;
+
+ if (sha->buffLen == WC_CAAM_HASH_BLOCK) {
+ /* Set buffer for context */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)sha->ctx;
+ buf[0].Length = digestSz + WC_CAAM_CTXLEN;
+ buf[0].Transferred = 0;
+
+ /* data to update with */
+ buf[1].BufferType = DataBuffer | LastBuffer;
+ buf[1].TheAddress = (Address)sha->buffer;
+ buf[1].Length = sha->buffLen;
+ buf[1].Transferred = 0;
+
+ arg[0] = CAAM_ALG_UPDATE;
+ arg[1] = digestSz + WC_CAAM_CTXLEN;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, type)) != 0) {
+ WOLFSSL_MSG("Error with CAAM SHA update");
+ return ret;
+ }
+ sha->buffLen = 0; /* cleared out buffer */
+ }
+ }
+
+ /* check if multiple full blocks can be done */
+ if (len >= WC_CAAM_HASH_BLOCK) {
+ word32 sz = len / WC_CAAM_HASH_BLOCK;
+ sz = sz * WC_CAAM_HASH_BLOCK;
+
+ /* Set buffer for context */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)sha->ctx;
+ buf[0].Length = digestSz + WC_CAAM_CTXLEN;
+ buf[0].Transferred = 0;
+
+ /* data to update with */
+ buf[1].BufferType = DataBuffer | LastBuffer;
+ buf[1].TheAddress = (Address)data;
+ buf[1].Length = sz;
+ buf[1].Transferred = 0;
+
+ arg[0] = CAAM_ALG_UPDATE;
+ arg[1] = digestSz + WC_CAAM_CTXLEN;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, type)) != 0) {
+ WOLFSSL_MSG("Error with CAAM SHA update");
+ return ret;
+ }
+
+ len -= sz;
+ data += sz;
+ }
+
+ /* check for left overs */
+ if (len > 0) {
+ word32 add = min(len, WC_CAAM_HASH_BLOCK - sha->buffLen);
+ XMEMCPY(&local[sha->buffLen], data, add);
+ sha->buffLen += add;
+ }
+
+ return 0;
+}
+
+
+static int _ShaFinal(wc_Sha* sha, byte* out, word32 digestSz,
+ word32 type)
+{
+ Buffer buf[2];
+ word32 arg[4];
+ int ret;
+
+ if (sha == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* Set buffer for context */
+ buf[0].BufferType = DataBuffer;
+ buf[0].TheAddress = (Address)sha->ctx;
+ buf[0].Length = digestSz + WC_CAAM_CTXLEN;
+ buf[0].Transferred = 0;
+
+ /* add any potential left overs */
+ buf[1].BufferType = DataBuffer | LastBuffer;
+ buf[1].TheAddress = (Address)sha->buffer;
+ buf[1].Length = sha->buffLen;
+ buf[1].Transferred = 0;
+
+ arg[0] = CAAM_ALG_FINAL;
+ arg[1] = digestSz + WC_CAAM_CTXLEN;
+
+ if ((ret = wc_caamAddAndWait(buf, arg, type)) != 0) {
+        WOLFSSL_MSG("Error with CAAM SHA final");
+ return ret;
+ }
+
+ return 0;
+}
+
+/******************************************************************************
+ MD5
+ ****************************************************************************/
+#if !defined(NO_MD5)
+int wc_InitMd5_ex(wc_Md5* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, MD5_DIGEST_SIZE, CAAM_MD5);
+}
+
+
+int wc_Md5Update(wc_Md5* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, MD5_DIGEST_SIZE, CAAM_MD5);
+}
+
+
+int wc_Md5Final(wc_Md5* sha, byte* hash)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, hash, MD5_DIGEST_SIZE, CAAM_MD5)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(hash, (byte*)sha->ctx, MD5_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, MD5_DIGEST_SIZE, CAAM_MD5);
+}
+#endif /* !NO_MD5 */
+
+
+/******************************************************************************
+ SHA 1
+ ****************************************************************************/
+#if !defined(NO_SHA)
+int wc_InitSha_ex(wc_Sha* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, SHA_DIGEST_SIZE, CAAM_SHA);
+}
+
+
+int wc_ShaUpdate(wc_Sha* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, SHA_DIGEST_SIZE, CAAM_SHA);
+}
+
+
+int wc_ShaFinal(wc_Sha* sha, byte* out)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, out, SHA_DIGEST_SIZE, CAAM_SHA)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(out, (byte*)sha->ctx, SHA_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, SHA_DIGEST_SIZE, CAAM_SHA);
+}
+#endif /* !NO_SHA */
+
+
+/******************************************************************************
+ SHA 224
+ ****************************************************************************/
+#ifdef WOLFSSL_SHA224
+int wc_InitSha224_ex(wc_Sha224* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, SHA256_DIGEST_SIZE, CAAM_SHA224);
+}
+
+
+int wc_Sha224Update(wc_Sha224* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, SHA256_DIGEST_SIZE, CAAM_SHA224);
+}
+
+
+int wc_Sha224Final(wc_Sha224* sha, byte* out)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, out, SHA256_DIGEST_SIZE, CAAM_SHA224)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(out, (byte*)sha->ctx, SHA224_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, SHA256_DIGEST_SIZE, CAAM_SHA224);
+}
+#endif /* WOLFSSL_SHA224 */
+
+
+/******************************************************************************
+ SHA 256
+ ****************************************************************************/
+#if !defined(NO_SHA256)
+int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, SHA256_DIGEST_SIZE, CAAM_SHA256);
+}
+
+
+int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, SHA256_DIGEST_SIZE, CAAM_SHA256);
+}
+
+
+int wc_Sha256Final(wc_Sha256* sha, byte* out)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, out, SHA256_DIGEST_SIZE, CAAM_SHA256)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(out, (byte*)sha->ctx, SHA256_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, SHA256_DIGEST_SIZE, CAAM_SHA256);
+}
+#endif /* !NO_SHA256 */
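+/* Typical use of the CAAM-backed hash through the standard wolfCrypt API
+ * (a sketch; data/dataSz are placeholders):
+ *
+ *   wc_Sha256 sha;
+ *   byte digest[SHA256_DIGEST_SIZE];
+ *   wc_InitSha256_ex(&sha, NULL, 0);
+ *   wc_Sha256Update(&sha, data, dataSz);
+ *   wc_Sha256Final(&sha, digest);
+ */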
+
+
+/******************************************************************************
+ SHA 384
+ ****************************************************************************/
+#ifdef WOLFSSL_SHA384
+int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, SHA512_DIGEST_SIZE, CAAM_SHA384);
+}
+
+
+int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, SHA512_DIGEST_SIZE, CAAM_SHA384);
+}
+
+
+int wc_Sha384Final(wc_Sha384* sha, byte* out)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, out, SHA512_DIGEST_SIZE, CAAM_SHA384)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(out, (byte*)sha->ctx, SHA384_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, SHA512_DIGEST_SIZE, CAAM_SHA384);
+}
+#endif /* WOLFSSL_SHA384 */
+
+
+
+/******************************************************************************
+ SHA 512
+ ****************************************************************************/
+#ifdef WOLFSSL_SHA512
+int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId)
+{
+ return _InitSha(sha, heap, devId, SHA512_DIGEST_SIZE, CAAM_SHA512);
+}
+
+
+int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len)
+{
+ return _ShaUpdate(sha, data, len, SHA512_DIGEST_SIZE, CAAM_SHA512);
+}
+
+
+int wc_Sha512Final(wc_Sha512* sha, byte* out)
+{
+ int ret;
+ if ((ret = _ShaFinal(sha, out, SHA512_DIGEST_SIZE, CAAM_SHA512)) != 0) {
+ return ret;
+ }
+
+ XMEMCPY(out, (byte*)sha->ctx, SHA512_DIGEST_SIZE);
+ return _InitSha(sha, NULL, 0, SHA512_DIGEST_SIZE, CAAM_SHA512);
+}
+#endif /* WOLFSSL_SHA512 */
+
+#endif /* WOLFSSL_IMX6_CAAM */
+
diff --git a/wolfcrypt/src/port/cavium/README.md b/wolfcrypt/src/port/cavium/README.md
new file mode 100644
index 0000000..b98d866
--- /dev/null
+++ b/wolfcrypt/src/port/cavium/README.md
@@ -0,0 +1,3 @@
+# Cavium Nitrox III/V Support
+
+Please contact wolfSSL at [email protected] to request an evaluation.
diff --git a/wolfcrypt/src/port/cavium/README_Octeon.md b/wolfcrypt/src/port/cavium/README_Octeon.md
new file mode 100644
index 0000000..b2670d0
--- /dev/null
+++ b/wolfcrypt/src/port/cavium/README_Octeon.md
@@ -0,0 +1,3 @@
+# Cavium Octeon III CN7300
+
+Please contact wolfSSL at [email protected] to request an evaluation.
diff --git a/wolfcrypt/src/port/cavium/cavium_nitrox.c b/wolfcrypt/src/port/cavium/cavium_nitrox.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/wolfcrypt/src/port/cavium/cavium_nitrox.c
diff --git a/wolfcrypt/src/port/cavium/cavium_octeon_sync.c b/wolfcrypt/src/port/cavium/cavium_octeon_sync.c
new file mode 100644
index 0000000..078e8cb
--- /dev/null
+++ b/wolfcrypt/src/port/cavium/cavium_octeon_sync.c
@@ -0,0 +1,879 @@
+/* cavium_octeon_sync.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL. (formerly known as CyaSSL)
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/wc_port.h>
+
+#ifdef HAVE_CAVIUM_OCTEON_SYNC
+
+/* Setting NO_MAIN_DRIVER here because this file ends up building
+ * in the library sources which doesn't have NO_MAIN_DRIVER set,
+ * as the library expects main to be somewhere else. */
+#undef NO_MAIN_DRIVER
+#define NO_MAIN_DRIVER
+
+#include <wolfssl/wolfcrypt/port/cavium/cavium_octeon_sync.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "cvmx.h"
+#include "cvmx-asm.h"
+#include "cvmx-key.h"
+#include "cvmx-swap.h"
+
+#ifndef NO_DES3
+ #include <wolfssl/wolfcrypt/des3.h>
+#endif
+#ifndef NO_AES
+ #include <wolfssl/wolfcrypt/aes.h>
+#endif
+
+#define NOOPT __attribute__((optimize("O0")))
+
+static int devId = 1234;
+
+#ifndef NO_DES3
+static int Octeon_Des3_CbcEncrypt(Des3* des3,
+ uint64_t *inp64, uint64_t *outp64, size_t inl)
+{
+ register uint64_t i0, r0;
+ uint64_t *key, *iv;
+
+ if (des3 == NULL || inp64 == NULL || outp64 == NULL)
+ return BAD_FUNC_ARG;
+
+ /* expects 64-bit aligned value */
+ key = (uint64_t*)des3->devKey;
+ CVMX_MT_3DES_KEY(key[0], 0);
+ CVMX_MT_3DES_KEY(key[1], 1);
+ CVMX_MT_3DES_KEY(key[2], 2);
+ iv = (uint64_t*)des3->reg;
+ CVMX_MT_3DES_IV(iv[0]);
+
+ CVMX_PREFETCH0(inp64);
+
+ i0 = *inp64;
+
+ /* DES3 assembly can handle 16-byte chunks */
+ if (inl >= 16) {
+ CVMX_MT_3DES_ENC_CBC(i0);
+ inl -= 8;
+ inp64++;
+ outp64++;
+
+ if (inl >= 8) {
+ i0 = inp64[0];
+ CVMX_MF_3DES_RESULT(r0);
+ CVMX_MT_3DES_ENC_CBC(i0);
+
+ for (;;) {
+ outp64[-1] = r0;
+ inl -= 8;
+ inp64++;
+ outp64++;
+ i0 = *inp64;
+
+ if (inl < 8)
+ break;
+
+ CVMX_PREFETCH(inp64, 64);
+ CVMX_MF_3DES_RESULT(r0);
+ CVMX_MT_3DES_ENC_CBC(i0);
+ }
+ }
+ CVMX_MF_3DES_RESULT(r0);
+ outp64[-1] = r0;
+ }
+ /* remainder */
+ if (inl > 0) {
+ uint64_t r = 0;
+ if (inl <= 8) {
+ XMEMCPY(&r, inp64, inl);
+ CVMX_MT_3DES_ENC_CBC(r);
+ CVMX_MF_3DES_RESULT(*outp64);
+ }
+        else {
+            i0 = *inp64;
+            CVMX_MT_3DES_ENC_CBC(i0);
+            CVMX_MF_3DES_RESULT(*outp64);
+            inp64++, outp64++;
+            inl -= 8; /* account for the block just processed */
+
+            XMEMCPY(&r, inp64, inl);
+            CVMX_MT_3DES_ENC_CBC(r);
+            CVMX_MF_3DES_RESULT(*outp64);
+ }
+ }
+
+ CVMX_MT_3DES_IV(iv[0]);
+
+ return 0;
+}
+
+static int Octeon_Des3_CbcDecrypt(Des3* des3,
+ uint64_t *inp64, uint64_t *outp64, size_t inl)
+{
+ register uint64_t i0, r0;
+ uint64_t *key, *iv;
+
+ if (des3 == NULL || inp64 == NULL || outp64 == NULL)
+ return BAD_FUNC_ARG;
+
+ /* expects 64-bit aligned value */
+ key = (uint64_t*)des3->devKey;
+ CVMX_MT_3DES_KEY(key[0], 0);
+ CVMX_MT_3DES_KEY(key[1], 1);
+ CVMX_MT_3DES_KEY(key[2], 2);
+
+ iv = (uint64_t*)des3->reg;
+ CVMX_MT_3DES_IV(iv[0]);
+
+ CVMX_PREFETCH0(inp64);
+
+ i0 = *inp64;
+
+ /* DES3 assembly can handle 16-byte chunks */
+ if (inl >= 16) {
+ CVMX_MT_3DES_DEC_CBC(i0);
+ inl -= 8;
+ inp64++;
+ outp64++;
+
+ if (inl >= 8) {
+ i0 = inp64[0];
+ CVMX_MF_3DES_RESULT(r0);
+ CVMX_MT_3DES_DEC_CBC(i0);
+
+ for (;;) {
+ outp64[-1] = r0;
+ inl -= 8;
+ inp64++;
+ outp64++;
+ i0 = *inp64;
+
+ if (inl < 8)
+ break;
+
+ CVMX_PREFETCH(inp64, 64);
+ CVMX_MF_3DES_RESULT(r0);
+ CVMX_MT_3DES_DEC_CBC(i0);
+ }
+ }
+
+ CVMX_MF_3DES_RESULT(r0);
+ outp64[-1] = r0;
+ }
+ /* remainder */
+ if (inl > 0) {
+ if (inl <= 8) {
+ uint64_t r = 0;
+ XMEMCPY(&r, inp64, inl);
+ CVMX_MT_3DES_DEC_CBC(r);
+ CVMX_MF_3DES_RESULT(*outp64);
+ }
+        else {
+            uint64_t r = 0;
+            i0 = *inp64;
+            CVMX_MT_3DES_DEC_CBC(i0);
+            CVMX_MF_3DES_RESULT(*outp64);
+            inp64++, outp64++;
+            inl -= 8; /* account for the block just processed */
+
+            XMEMCPY(&r, inp64, inl);
+            CVMX_MT_3DES_DEC_CBC(r);
+            CVMX_MF_3DES_RESULT(*outp64);
+ }
+ }
+
+ CVMX_MT_3DES_IV(iv[0]);
+
+ return 0;
+}
+#endif /* !NO_DES3 */
+
+
+#ifndef NO_AES
+
+#ifdef HAVE_AES_CBC
+static int Octeon_AesCbc_Encrypt(Aes *aes,
+ uint64_t *inp64, uint64_t *outp64, size_t inl)
+{
+ register uint64_t i0, i1, r0, r1;
+ uint64_t *key, *iv;
+
+ if (aes == NULL || inp64 == NULL || outp64 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ iv = (uint64_t*)aes->reg;
+ CVMX_MT_AES_IV(iv[0], 0);
+ CVMX_MT_AES_IV(iv[1], 1);
+
+ key = (uint64_t*)aes->devKey;
+ CVMX_MT_AES_KEY(key[0], 0);
+ CVMX_MT_AES_KEY(key[1], 1);
+ CVMX_MT_AES_KEY(key[2], 2);
+ CVMX_MT_AES_KEY(key[3], 3);
+
+ CVMX_MT_AES_KEYLENGTH(aes->keylen/8 - 1);
+
+ CVMX_PREFETCH0(inp64);
+
+ i0 = inp64[0];
+ i1 = inp64[1];
+
+ /* AES assembly can handle 32-byte chunks */
+ if (inl >= 32) {
+ CVMX_MT_AES_ENC_CBC0(i0);
+ CVMX_MT_AES_ENC_CBC1(i1);
+ inl -= 16;
+ inp64 += 2;
+ outp64 += 2;
+
+ if (inl >= 16) {
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ i0 = inp64[0];
+ i1 = inp64[1];
+ CVMX_MT_AES_ENC_CBC0(i0);
+ CVMX_MT_AES_ENC_CBC1(i1);
+
+ for (;;) {
+ outp64[-2] = r0;
+ outp64[-1] = r1;
+ outp64 += 2;
+ inp64 += 2;
+ inl -= 16;
+ i0 = inp64[0];
+ i1 = inp64[1];
+
+ if (inl < 16)
+ break;
+
+ CVMX_PREFETCH(inp64, 64);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ CVMX_MT_AES_ENC_CBC0(i0);
+ CVMX_MT_AES_ENC_CBC1(i1);
+ }
+ }
+
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ outp64[-2] = r0;
+ outp64[-1] = r1;
+ }
+ /* remainder */
+ if (inl > 0) {
+ uint64_t in64[2] = { 0, 0 };
+ if (inl <= 16) {
+ XMEMCPY(in64, inp64, inl);
+ CVMX_MT_AES_ENC_CBC0(in64[0]);
+ CVMX_MT_AES_ENC_CBC1(in64[1]);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ outp64[0] = r0;
+ outp64[1] = r1;
+ }
+ else {
+ CVMX_MT_AES_ENC_CBC0(i0);
+ CVMX_MT_AES_ENC_CBC1(i1);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ inl -= 16;
+ outp64[0] = r0;
+ outp64[1] = r1;
+ inp64 += 2;
+ outp64 += 2;
+ XMEMCPY(in64, inp64, inl);
+ CVMX_MT_AES_ENC_CBC0(in64[0]);
+ CVMX_MT_AES_ENC_CBC1(in64[1]);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ outp64[0] = r0;
+ outp64[1] = r1;
+ }
+ }
+
+ CVMX_MF_AES_IV(iv[0], 0);
+ CVMX_MF_AES_IV(iv[1], 1);
+
+ return 0;
+}
+
+static int Octeon_AesCbc_Decrypt(Aes *aes,
+ uint64_t *inp64, uint64_t *outp64, size_t inl)
+{
+ register uint64_t i0, i1, r0, r1;
+ uint64_t *key, *iv;
+
+ if (aes == NULL || inp64 == NULL || outp64 == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ iv = (uint64_t*)aes->reg;
+ key = (uint64_t*)aes->devKey;
+
+ CVMX_MT_AES_IV(iv[0], 0);
+ CVMX_MT_AES_IV(iv[1], 1);
+
+ CVMX_MT_AES_KEY(key[0], 0);
+ CVMX_MT_AES_KEY(key[1], 1);
+ CVMX_MT_AES_KEY(key[2], 2);
+ CVMX_MT_AES_KEY(key[3], 3);
+ CVMX_MT_AES_KEYLENGTH(aes->keylen/8 - 1);
+
+ CVMX_PREFETCH0(inp64);
+
+ i0 = inp64[0];
+ i1 = inp64[1];
+
+ /* AES assembly can handle 32-byte chunks */
+ if (inl >= 32) {
+ CVMX_MT_AES_DEC_CBC0(i0);
+ CVMX_MT_AES_DEC_CBC1(i1);
+ inp64 += 2;
+ outp64 += 2;
+ inl -= 16;
+
+ if (inl >= 16) {
+ i0 = inp64[0];
+ i1 = inp64[1];
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ CVMX_MT_AES_DEC_CBC0(i0);
+ CVMX_MT_AES_DEC_CBC1(i1);
+
+ for (;;) {
+ outp64[-2] = r0;
+ outp64[-1] = r1;
+ outp64 += 2;
+ inp64 += 2;
+ inl -= 16;
+ i0 = inp64[0];
+ i1 = inp64[1];
+
+ if (inl < 16)
+ break;
+
+ CVMX_PREFETCH(inp64, 64);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ CVMX_MT_AES_DEC_CBC0(i0);
+ CVMX_MT_AES_DEC_CBC1(i1);
+ }
+ }
+
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ outp64[-2] = r0;
+ outp64[-1] = r1;
+ }
+ /* remainder */
+ if (inl > 0) {
+ uint64_t in64[2] = { 0, 0 };
+ XMEMCPY(in64, inp64, inl);
+ CVMX_MT_AES_DEC_CBC0(in64[0]);
+ CVMX_MT_AES_DEC_CBC1(in64[1]);
+ CVMX_MF_AES_RESULT(r0, 0);
+ CVMX_MF_AES_RESULT(r1, 1);
+ outp64[0] = r0;
+ outp64[1] = r1;
+ }
+
+ CVMX_MF_AES_IV(iv[0], 0);
+ CVMX_MF_AES_IV(iv[1], 1);
+
+ return 0;
+}
+#endif /* HAVE_AES_CBC */
+
+
+#ifdef HAVE_AESGCM
+
+#define CVM_AES_RD_RESULT_WR_DATA(in1, in2, out1, out2) \
+ asm volatile(\
+ ".set noreorder \n" \
+ "dmfc2 %[r1],0x0100\n" \
+ "dmfc2 %[r2],0x0101\n" \
+ "dmtc2 %[r3],0x010a\n" \
+ "dmtc2 %[r4],0x310b\n" \
+ ".set reorder \n" \
+ : [r1] "=&d"(in1) , [r2] "=&d"(in2) \
+ : [r3] "d"(out1), [r4] "d"(out2))
+
+static NOOPT void Octeon_GHASH_Restore(word16 poly, byte* h)
+{
+ word64* bigH = (word64*)h;
+ CVMX_MT_GFM_POLY((word64)poly);
+ CVMX_MT_GFM_MUL(bigH[0], 0);
+ CVMX_MT_GFM_MUL(bigH[1], 1);
+}
+
+
+static NOOPT void Octeon_GHASH_Init(word16 poly, byte* h)
+{
+ Octeon_GHASH_Restore(poly, h);
+ CVMX_MT_GFM_RESINP(0, 0);
+ CVMX_MT_GFM_RESINP(0, 1);
+}
+
+
+static NOOPT void Octeon_GHASH_Update(byte* in)
+{
+ word64* bigIn = (word64*)in;
+ CVMX_MT_GFM_XOR0(bigIn[0]);
+ CVMX_MT_GFM_XORMUL1(bigIn[1]);
+}
+
+
+static NOOPT void Octeon_GHASH_Final(byte* out, word64 authInSz, word64 inSz)
+{
+ word64* bigOut = (word64*)out;
+
+ CVMX_MT_GFM_XOR0(authInSz * 8);
+ CVMX_MT_GFM_XORMUL1(inSz * 8);
+ CVMX_MF_GFM_RESINP(bigOut[0], 0);
+ CVMX_MF_GFM_RESINP(bigOut[1], 1);
+}
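+/* GHASH of a single 16-byte block using the helpers above (a sketch; 0xe100
+ * is the GCM reduction polynomial used elsewhere in this file):
+ *
+ *   Octeon_GHASH_Init(0xe100, h);      // h = E(K, 0^128)
+ *   Octeon_GHASH_Update(block);        // absorb one 16-byte block
+ *   Octeon_GHASH_Final(digest, 0, 16); // 0 AAD bytes, 16 input bytes
+ */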
+
+
+/* Sets the Octeon key with the key found in the Aes record. */
+static NOOPT int Octeon_AesGcm_SetKey(Aes* aes)
+{
+ int ret = 0;
+
+ if (aes == NULL)
+ ret = BAD_FUNC_ARG;
+
+ if (ret == 0) {
+ uint64_t* key = (uint64_t*)aes->devKey;
+
+ CVMX_MT_AES_KEY(key[0], 0);
+ CVMX_MT_AES_KEY(key[1], 1);
+ CVMX_MT_AES_KEY(key[2], 2);
+ CVMX_MT_AES_KEY(key[3], 3);
+ CVMX_MT_AES_KEYLENGTH((aes->keylen / 8) - 1);
+
+ if (!aes->keySet) {
+ uint64_t* bigH = (uint64_t*)aes->H;
+ CVMX_MT_AES_ENC0(0);
+ CVMX_MT_AES_ENC1(0);
+ CVMX_MF_AES_RESULT(bigH[0], 0);
+ CVMX_MF_AES_RESULT(bigH[1], 1);
+ aes->keySet = 1;
+ }
+ }
+
+ return ret;
+}
+
+
+static NOOPT int Octeon_AesGcm_SetIV(Aes* aes, byte* iv, word32 ivSz)
+{
+ int ret = 0;
+
+ if (aes == NULL || iv == NULL)
+ ret = BAD_FUNC_ARG;
+
+ if (ret == 0) {
+ if (ivSz == GCM_NONCE_MID_SZ) {
+ XMEMSET((byte*)aes->reg, 0, sizeof(aes->reg));
+ XMEMCPY((byte*)aes->reg, iv, ivSz);
+ aes->reg[3] = 1;
+ }
+ else {
+ int blocks, remainder, i;
+ byte aesBlock[AES_BLOCK_SIZE];
+
+ blocks = ivSz / AES_BLOCK_SIZE;
+ remainder = ivSz % AES_BLOCK_SIZE;
+
+ for (i = 0; i < blocks; i++, iv += AES_BLOCK_SIZE)
+ Octeon_GHASH_Update(iv);
+
+ XMEMSET(aesBlock, 0, sizeof(aesBlock));
+ for (i = 0; i < remainder; i++)
+ aesBlock[i] = iv[i];
+ Octeon_GHASH_Update(aesBlock);
+
+ Octeon_GHASH_Final((byte*)aes->reg, 0, ivSz);
+ }
+
+ aes->y0 = aes->reg[3];
+ aes->reg[3]++;
+
+ Octeon_GHASH_Init(0xe100, aes->H);
+ }
+
+ return ret;
+}
+
+
+static NOOPT int Octeon_AesGcm_SetAAD(Aes* aes, byte* aad, word32 aadSz)
+{
+ word64* p;
+ ALIGN16 byte aesBlock[AES_BLOCK_SIZE];
+ int blocks, remainder, i;
+
+ if (aes == NULL || (aadSz != 0 && aad == NULL))
+ return BAD_FUNC_ARG;
+
+ if (aadSz == 0)
+ return 0;
+
+ blocks = aadSz / AES_BLOCK_SIZE;
+ remainder = aadSz % AES_BLOCK_SIZE;
+
+ Octeon_GHASH_Restore(0xe100, aes->H);
+
+ p = (word64*)aesBlock;
+
+ for (i = 0; i < blocks; i++, aad += AES_BLOCK_SIZE) {
+ CVMX_LOADUNA_INT64(p[0], aad, 0);
+ CVMX_LOADUNA_INT64(p[1], aad, 8);
+ CVMX_MT_GFM_XOR0(p[0]);
+ CVMX_MT_GFM_XORMUL1(p[1]);
+ }
+
+    /* only hash a final partial block if one exists; padding out and hashing
+     * an extra all-zero block when aadSz is block aligned would corrupt the
+     * GHASH state */
+    if (remainder > 0) {
+        XMEMSET(aesBlock, 0, sizeof(aesBlock));
+
+        for (i = 0; i < remainder; i++)
+            aesBlock[i] = aad[i];
+
+        CVMX_MT_GFM_XOR0(p[0]);
+        CVMX_MT_GFM_XORMUL1(p[1]);
+    }
+
+ return 0;
+}
+
+
+static int Octeon_AesGcm_SetEncrypt(Aes* aes, byte* in, byte* out, word32 inSz,
+ int encrypt)
+{
+ word32 i, blocks, remainder;
+ ALIGN16 byte aesBlockIn[AES_BLOCK_SIZE];
+ ALIGN16 byte aesBlockOut[AES_BLOCK_SIZE];
+ word64* pIn;
+ word64* pOut;
+ word64* pIv;
+
+ if (aes == NULL || in == NULL || out == NULL)
+ return BAD_FUNC_ARG;
+
+ pIn = (word64*)aesBlockIn;
+ pOut = (word64*)aesBlockOut;
+ pIv = (word64*)aes->reg;
+
+ CVMX_PREFETCH0(in);
+
+ CVMX_MT_AES_ENC0(pIv[0]);
+ CVMX_MT_AES_ENC1(pIv[1]);
+
+ blocks = inSz / AES_BLOCK_SIZE;
+ remainder = inSz % AES_BLOCK_SIZE;
+
+ for (i = 0; i < blocks;
+ i++, in += AES_BLOCK_SIZE, out += AES_BLOCK_SIZE) {
+ CVMX_PREFETCH128(in);
+ aes->reg[3]++;
+
+ CVMX_LOADUNA_INT64(pIn[0], in, 0);
+ CVMX_LOADUNA_INT64(pIn[1], in, 8);
+
+ CVM_AES_RD_RESULT_WR_DATA(pOut[0], pOut[1], pIv[0], pIv[1]);
+
+ if (encrypt) {
+ pOut[0] ^= pIn[0];
+ pOut[1] ^= pIn[1];
+ CVMX_MT_GFM_XOR0(pOut[0]);
+ CVMX_MT_GFM_XORMUL1(pOut[1]);
+ }
+ else {
+ CVMX_MT_GFM_XOR0(pIn[0]);
+ CVMX_MT_GFM_XORMUL1(pIn[1]);
+ pOut[0] ^= pIn[0];
+ pOut[1] ^= pIn[1];
+ }
+
+ CVMX_STOREUNA_INT64(pOut[0], out, 0);
+ CVMX_STOREUNA_INT64(pOut[1], out, 8);
+ }
+
+ if (remainder > 0) {
+ ALIGN16 byte aesBlockMask[AES_BLOCK_SIZE];
+ word64* pMask = (word64*)aesBlockMask;
+
+ XMEMSET(aesBlockOut, 0, sizeof(aesBlockOut));
+ XMEMSET(aesBlockMask, 0, sizeof(aesBlockMask));
+ for (i = 0; i < remainder; i++) {
+ aesBlockIn[i] = in[i];
+ aesBlockMask[i] = 0xFF;
+ }
+
+ if (encrypt) {
+ CVMX_MF_AES_RESULT(pOut[0], 0);
+ CVMX_MF_AES_RESULT(pOut[1], 1);
+
+ pOut[0] ^= pIn[0];
+ pOut[1] ^= pIn[1];
+
+ pOut[0] &= pMask[0];
+ pOut[1] &= pMask[1];
+
+ CVMX_MT_GFM_XOR0(pOut[0]);
+ CVMX_MT_GFM_XORMUL1(pOut[1]);
+ }
+ else {
+ CVMX_MT_GFM_XOR0(pIn[0]);
+ CVMX_MT_GFM_XORMUL1(pIn[1]);
+
+ CVMX_MF_AES_RESULT(pOut[0], 0);
+ CVMX_MF_AES_RESULT(pOut[1], 1);
+
+ pOut[0] ^= pIn[0];
+ pOut[1] ^= pIn[1];
+
+ pOut[0] &= pMask[0];
+ pOut[1] &= pMask[1];
+ }
+
+ for (i = 0; i < remainder; i++)
+ out[i] = aesBlockOut[i];
+ }
+
+ return 0;
+}
+
+
+static NOOPT int Octeon_AesGcm_Finalize(Aes* aes, word32 inSz, word32 aadSz,
+ byte* tag)
+{
+ word64 bigSz;
+ word64* pIv;
+ word64* pIn;
+ word64* pOut;
+ uint32_t countSave;
+ ALIGN16 byte aesBlockIn[AES_BLOCK_SIZE];
+ ALIGN16 byte aesBlockOut[AES_BLOCK_SIZE];
+
+ countSave = aes->reg[3];
+ aes->reg[3] = aes->y0;
+
+ pIv = (word64*)aes->reg;
+ CVMX_MT_AES_ENC0(pIv[0]);
+ CVMX_MT_AES_ENC1(pIv[1]);
+
+ bigSz = (word64)aadSz * 8;
+ CVMX_MT_GFM_XOR0(bigSz);
+ bigSz = (word64)inSz * 8;
+ CVMX_MT_GFM_XORMUL1(bigSz);
+
+ aes->reg[3] = countSave;
+
+ pIn = (word64*)aesBlockIn;
+ CVMX_MF_AES_RESULT(pIn[0], 0);
+ CVMX_MF_AES_RESULT(pIn[1], 1);
+
+ pOut = (word64*)aesBlockOut;
+ CVMX_MF_GFM_RESINP(pOut[0], 0);
+ CVMX_MF_GFM_RESINP(pOut[1], 1);
+
+ pOut[0] ^= pIn[0];
+ pOut[1] ^= pIn[1];
+
+ CVMX_STOREUNA_INT64(pOut[0], tag, 0);
+ CVMX_STOREUNA_INT64(pOut[1], tag, 8);
+
+ return 0;
+}
+
+
+static int Octeon_AesGcm_Encrypt(Aes* aes, byte* in, byte* out, word32 inSz,
+ byte* iv, word32 ivSz, byte* aad, word32 aadSz, byte* tag)
+{
+ int ret = 0;
+
+ if (aes == NULL)
+ ret = BAD_FUNC_ARG;
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetKey(aes);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetIV(aes, iv, ivSz);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetAAD(aes, aad, aadSz);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetEncrypt(aes, in, out, inSz, 1);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_Finalize(aes, inSz, aadSz, tag);
+
+ return ret;
+}
+
+
+static int Octeon_AesGcm_Decrypt(Aes* aes, byte* in, byte* out, word32 inSz,
+ byte* iv, word32 ivSz, byte* aad, word32 aadSz, byte* tag)
+{
+ int ret = 0;
+
+ if (aes == NULL)
+ ret = BAD_FUNC_ARG;
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetKey(aes);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetIV(aes, iv, ivSz);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetAAD(aes, aad, aadSz);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_SetEncrypt(aes, in, out, inSz, 0);
+
+ if (ret == 0)
+ ret = Octeon_AesGcm_Finalize(aes, inSz, aadSz, tag);
+
+ return ret;
+}
+
+#endif /* HAVE_AESGCM */
+
+#endif /* !NO_AES */
+
+#ifdef WOLF_CRYPTO_CB
+
+#include <wolfssl/wolfcrypt/cryptocb.h>
+
+
+static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx)
+{
+ int ret = NOT_COMPILED_IN; /* return this to bypass HW and use SW */
+
+ if (info == NULL)
+ return BAD_FUNC_ARG;
+
+#ifdef DEBUG_WOLFSSL
+ printf("CryptoDevCb: Algo Type %d\n", info->algo_type);
+#endif
+
+ if (info->algo_type == WC_ALGO_TYPE_CIPHER) {
+#if !defined(NO_AES) || !defined(NO_DES3)
+ #ifdef HAVE_AESGCM
+ if (info->cipher.type == WC_CIPHER_AES_GCM) {
+ if (info->cipher.enc) {
+ ret = Octeon_AesGcm_Encrypt(
+ info->cipher.aesgcm_enc.aes,
+ (byte*)info->cipher.aesgcm_enc.in,
+ (byte*)info->cipher.aesgcm_enc.out,
+ info->cipher.aesgcm_enc.sz,
+ (byte*)info->cipher.aesgcm_enc.iv,
+ info->cipher.aesgcm_enc.ivSz,
+ (byte*)info->cipher.aesgcm_enc.authIn,
+ info->cipher.aesgcm_enc.authInSz,
+ (byte*)info->cipher.aesgcm_enc.authTag);
+ }
+ else {
+ ret = Octeon_AesGcm_Decrypt(
+ info->cipher.aesgcm_dec.aes,
+ (byte*)info->cipher.aesgcm_dec.in,
+ (byte*)info->cipher.aesgcm_dec.out,
+ info->cipher.aesgcm_dec.sz,
+ (byte*)info->cipher.aesgcm_dec.iv,
+ info->cipher.aesgcm_dec.ivSz,
+ (byte*)info->cipher.aesgcm_dec.authIn,
+ info->cipher.aesgcm_dec.authInSz,
+ (byte*)info->cipher.aesgcm_dec.authTag);
+ }
+ }
+ #endif /* HAVE_AESGCM */
+ #ifdef HAVE_AES_CBC
+ if (info->cipher.type == WC_CIPHER_AES_CBC) {
+ if (info->cipher.enc) {
+ ret = Octeon_AesCbc_Encrypt(
+ info->cipher.aescbc.aes,
+ (word64*)info->cipher.aescbc.in,
+ (word64*)info->cipher.aescbc.out,
+ info->cipher.aescbc.sz);
+ }
+ else {
+ ret = Octeon_AesCbc_Decrypt(
+ info->cipher.aescbc.aes,
+ (word64*)info->cipher.aescbc.in,
+ (word64*)info->cipher.aescbc.out,
+ info->cipher.aescbc.sz);
+ }
+ }
+ #endif /* HAVE_AES_CBC */
+ #ifndef NO_DES3
+ if (info->cipher.type == WC_CIPHER_DES3) {
+ if (info->cipher.enc) {
+ ret = Octeon_Des3_CbcEncrypt(
+ info->cipher.des3.des,
+ (word64*)info->cipher.des3.in,
+ (word64*)info->cipher.des3.out,
+ info->cipher.des3.sz);
+ }
+ else {
+ ret = Octeon_Des3_CbcDecrypt(
+ info->cipher.des3.des,
+ (word64*)info->cipher.des3.in,
+ (word64*)info->cipher.des3.out,
+ info->cipher.des3.sz);
+ }
+ }
+ #endif /* !NO_DES3 */
+#endif /* !NO_AES || !NO_DES3 */
+ }
+
+ (void)devIdArg;
+ (void)ctx;
+
+ return ret;
+}
+
+int wc_CryptoCb_InitOcteon(void)
+{
+ if (wc_CryptoCb_RegisterDevice(devId, myCryptoDevCb, NULL) < 0) {
+ return INVALID_DEVID;
+ }
+
+ return devId;
+}
+
+void wc_CryptoCb_CleanupOcteon(int* id)
+{
+ wc_CryptoCb_UnRegisterDevice(*id);
+ *id = INVALID_DEVID;
+}
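+/* Example wiring (a sketch using the standard wolfCrypt crypto callback
+ * flow; error handling omitted):
+ *
+ *   Aes aes;
+ *   int octeonDevId = wc_CryptoCb_InitOcteon();
+ *   wc_AesInit(&aes, NULL, octeonDevId); // AES ops now route to myCryptoDevCb
+ *   ...
+ *   wc_CryptoCb_CleanupOcteon(&octeonDevId);
+ */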
+
+#endif /* WOLF_CRYPTO_CB */
+
+#endif /* HAVE_CAVIUM_OCTEON_SYNC */
diff --git a/wolfcrypt/src/port/devcrypto/README.md b/wolfcrypt/src/port/devcrypto/README.md
new file mode 100644
index 0000000..7844dca
--- /dev/null
+++ b/wolfcrypt/src/port/devcrypto/README.md
@@ -0,0 +1,43 @@
+# Description
+
+Support for building wolfSSL against the cryptodev-linux module on Linux.
+
+# Quick Start
+
+## Installing cryptodev module
+
+If not already installed, the cryptodev-linux module will need to be installed.
+
+```
+git clone https://github.com/cryptodev-linux/cryptodev-linux.git
+cd cryptodev-linux
+make
+sudo make install
+modprobe cryptodev
+```
+
+
+## Options for building wolfSSL
+
+For a default build with all supported features, use:
+
+```
+./configure --enable-cryptodev
+```
+
+Or, for finer control over which features are enabled:
+
+```
+./configure --enable-devcrypto=cbc
+./configure --enable-devcrypto=hash
+./configure --enable-devcrypto=aes
+./configure --enable-devcrypto=all
+```
+
+Then build the wolfSSL library with:
+
+```
+make
+sudo make install
+./wolfcrypt/test/testwolfcrypt
+```
diff --git a/wolfcrypt/src/port/devcrypto/devcrypto_aes.c b/wolfcrypt/src/port/devcrypto/devcrypto_aes.c
new file mode 100644
index 0000000..1f6d09d
--- /dev/null
+++ b/wolfcrypt/src/port/devcrypto/devcrypto_aes.c
@@ -0,0 +1,384 @@
+/* devcrypto_aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#if !defined(NO_AES) && defined(WOLFSSL_DEVCRYPTO)
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+
+#if defined(HAVE_AES_CBC) && defined(WOLFSSL_DEVCRYPTO_CBC)
+int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ struct crypt_op crt;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+    /* only encrypt complete AES blocks of data */
+ sz = sz - (sz % AES_BLOCK_SIZE);
+ if (aes->ctx.cfd == -1) {
+ ret = wc_DevCryptoCreate(&aes->ctx, CRYPTO_AES_CBC,
+ (byte*)aes->devKey, aes->keylen);
+ if (ret != 0)
+ return ret;
+ }
+ wc_SetupCryptSym(&crt, &aes->ctx, (byte*)in, sz, out, (byte*)aes->reg,
+ COP_ENCRYPT);
+ ret = ioctl(aes->ctx.cfd, CIOCCRYPT, &crt);
+ if (ret != 0) {
+ return WC_DEVCRYPTO_E;
+ }
+
+ /* store iv for next call */
+ XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+#ifdef HAVE_AES_DECRYPT
+int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ struct crypt_op crt;
+ int ret;
+
+ if (aes == NULL || out == NULL || in == NULL || sz % AES_BLOCK_SIZE != 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ if (aes->ctx.cfd == -1) {
+ ret = wc_DevCryptoCreate(&aes->ctx, CRYPTO_AES_CBC,
+ (byte*)aes->devKey, aes->keylen);
+ if (ret != 0)
+ return ret;
+ }
+ wc_SetupCryptSym(&crt, &aes->ctx, (byte*)in, sz, out, (byte*)aes->reg,
+ COP_DECRYPT);
+ ret = ioctl(aes->ctx.cfd, CIOCCRYPT, &crt);
+ if (ret != 0) {
+ return WC_DEVCRYPTO_E;
+ }
+
+ XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+ return 0;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AES_CBC && WOLFSSL_DEVCRYPTO_CBC */
+
+
+#ifdef WOLFSSL_DEVCRYPTO_AES /* all AES algorithms supported */
+int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+{
+#if defined(AES_MAX_KEY_SIZE)
+ const word32 max_key_len = (AES_MAX_KEY_SIZE / 8);
+#endif
+
+ if (aes == NULL ||
+ !((keylen == 16) || (keylen == 24) || (keylen == 32))) {
+ return BAD_FUNC_ARG;
+ }
+
+#if defined(AES_MAX_KEY_SIZE)
+ /* Check key length */
+ if (keylen > max_key_len) {
+ return BAD_FUNC_ARG;
+ }
+#endif
+ aes->keylen = keylen;
+ aes->rounds = keylen/4 + 6;
+
+#ifdef WOLFSSL_AES_COUNTER
+ aes->left = 0;
+#endif
+ aes->ctx.cfd = -1;
+ XMEMCPY(aes->devKey, userKey, keylen);
+
+ (void)dir;
+ return wc_AesSetIV(aes, iv);
+}
+
+
+/* AES-DIRECT */
+#if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AES_ECB)
+
+/* common code between ECB encrypt and decrypt
+ * returns 0 on success */
+static int wc_DevCrypto_AesDirect(Aes* aes, byte* out, const byte* in,
+ word32 sz, int dir)
+{
+ int ret;
+ struct crypt_op crt;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (aes->ctx.cfd == -1) {
+ ret = wc_DevCryptoCreate(&aes->ctx, CRYPTO_AES_ECB, (byte*)aes->devKey,
+ aes->keylen);
+ if (ret != 0)
+ return ret;
+ }
+
+ wc_SetupCryptSym(&crt, &aes->ctx, (byte*)in, sz, out, NULL, dir);
+ ret = ioctl(aes->ctx.cfd, CIOCCRYPT, &crt);
+ if (ret != 0) {
+ return WC_DEVCRYPTO_E;
+ }
+ return 0;
+}
+#endif
+
+
+#if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESCCM)
+void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ wc_DevCrypto_AesDirect(aes, out, in, AES_BLOCK_SIZE, COP_ENCRYPT);
+}
+
+
+void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ wc_DevCrypto_AesDirect(aes, out, in, AES_BLOCK_SIZE, COP_DECRYPT);
+}
+
+
+int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+ const byte* iv, int dir)
+{
+ return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+}
+#endif
+
+
+/* AES-CTR */
+#if defined(WOLFSSL_AES_COUNTER)
+
+/* Increment AES counter */
+static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+{
+ /* in network byte order so start at end and work back */
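+    /* e.g. ...00 FF FF increments to ...01 00 00: the carry propagates left */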
+ int i;
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++inOutCtr[i]) /* we're done unless we overflow */
+ return;
+ }
+}
+
+int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ int ret;
+ struct crypt_op crt;
+ byte* tmp;
+
+ if (aes == NULL || out == NULL || in == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* consume any unused bytes left in aes->tmp */
+ tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+ while (aes->left && sz) {
+ *(out++) = *(in++) ^ *(tmp++);
+ aes->left--;
+ sz--;
+ }
+
+ if (aes->ctx.cfd == -1) {
+ ret = wc_DevCryptoCreate(&aes->ctx, CRYPTO_AES_CTR, (byte*)aes->devKey,
+ aes->keylen);
+ if (ret != 0)
+ return ret;
+ }
+
+ if (sz > 0) {
+ /* clear previously leftover data */
+ tmp = (byte*)aes->tmp;
+ XMEMSET(tmp, 0, AES_BLOCK_SIZE);
+
+ /* update IV */
+ wc_SetupCryptSym(&crt, &aes->ctx, (byte*)in, sz, out, (byte*)aes->reg,
+ COP_ENCRYPT);
+ ret = ioctl(aes->ctx.cfd, CIOCCRYPT, &crt);
+ if (ret != 0) {
+ return WC_DEVCRYPTO_E;
+ }
+
+ /* adjust counter after call to hardware */
+ while (sz >= AES_BLOCK_SIZE) {
+ IncrementAesCounter((byte*)aes->reg);
+ sz -= AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ in += AES_BLOCK_SIZE;
+ }
+ }
+
+ /* create key stream for later if needed */
+ if (sz > 0) {
+ Aes tmpAes;
+ wc_AesSetKey(&tmpAes, (byte*)aes->devKey, aes->keylen, (byte*)aes->reg,
+ AES_ENCRYPTION);
+ wc_AesEncryptDirect(&tmpAes, (byte*)aes->tmp, (const byte*)aes->reg);
+ wc_AesFree(&tmpAes);
+ IncrementAesCounter((byte*)aes->reg);
+
+ aes->left = AES_BLOCK_SIZE - (sz % AES_BLOCK_SIZE);
+ }
+
+ return 0;
+}
+#endif /* WOLFSSL_AES_COUNTER */
+
+
+#ifdef HAVE_AESGCM
+
+int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
+{
+ return wc_AesSetKey(aes, key, len, NULL, AES_ENCRYPTION);
+}
+
+
+
+/* common code for AES-GCM encrypt/decrypt */
+static int wc_DevCrypto_AesGcm(Aes* aes, byte* out, byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz,
+ int dir)
+{
+ struct crypt_auth_op crt = {0};
+ int ret;
+ byte scratch[AES_BLOCK_SIZE];
+
+ /* argument checks */
+ if (aes == NULL || authTagSz > AES_BLOCK_SIZE) {
+ return BAD_FUNC_ARG;
+ }
+
+    /* Account for NULL in/out buffers; up to the tag size is still written
+     * into the in/out buffers, so substitute a local scratch block */
+ if (out == NULL)
+ out = scratch;
+ if (in == NULL)
+ in = scratch;
+
+ XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+ if (aes->ctx.cfd == -1) {
+ ret = wc_DevCryptoCreate(&aes->ctx, CRYPTO_AES_GCM, (byte*)aes->devKey,
+ aes->keylen);
+ if (ret != 0)
+ return ret;
+ }
+
+ /* if decrypting then the tag is expected to be at the end of "in" buffer */
+ if (dir == COP_DECRYPT) {
+ XMEMCPY(in + sz, authTag, authTagSz);
+ sz += authTagSz;
+ }
+    else {
+ /* get full tag from hardware */
+ authTagSz = AES_BLOCK_SIZE;
+ }
+ wc_SetupCryptAead(&crt, &aes->ctx, (byte*)in, sz, out, (byte*)iv, ivSz,
+ dir, (byte*)authIn, authInSz, authTag, authTagSz);
+ ret = ioctl(aes->ctx.cfd, CIOCAUTHCRYPT, &crt);
+ if (ret != 0) {
+ if (dir == COP_DECRYPT) {
+ return AES_GCM_AUTH_E;
+ }
+ else {
+ return WC_DEVCRYPTO_E;
+ }
+ }
+
+ /* after encryption the tag has been placed at the end of "out" buffer */
+ if (dir == COP_ENCRYPT) {
+ XMEMCPY(authTag, out + sz, authTagSz);
+ }
+ return 0;
+}
+
+
+/* it is assumed that "out" buffer has enough room for cipher text + tag */
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
+ WOLFSSL_MSG("GcmEncrypt authTagSz too small error");
+ return BAD_FUNC_ARG;
+ }
+
+ return wc_DevCrypto_AesGcm(aes, out, (byte*)in, sz, iv, ivSz,
+ authTag, authTagSz, authIn, authInSz,
+ COP_ENCRYPT);
+}
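+
+#if 0
+/* Illustrative usage sketch (not compiled): per the note above, the hardware
+ * appends the tag after the cipher text, so callers should size "out" for
+ * the data plus AES_BLOCK_SIZE. The function and buffer names here are
+ * placeholders, and a 12-byte (GCM_NONCE_MID_SZ) IV is assumed. */
+static int gcm_buffer_example(Aes* aes, const byte* msg, word32 msgSz,
+                              const byte* iv, byte* tag)
+{
+    int ret = MEMORY_E;
+    /* room for the cipher text plus the full hardware tag */
+    byte* out = (byte*)XMALLOC(msgSz + AES_BLOCK_SIZE, NULL,
+                               DYNAMIC_TYPE_TMP_BUFFER);
+    if (out != NULL) {
+        ret = wc_AesGcmEncrypt(aes, out, msg, msgSz, iv, GCM_NONCE_MID_SZ,
+                               tag, AES_BLOCK_SIZE, NULL, 0);
+        XFREE(out, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    }
+    return ret;
+}
+#endif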
+
+#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AESGCM_DECRYPT)
+/* it is assumed that "in" buffer has enough room for cipher text + tag */
+int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return wc_DevCrypto_AesGcm(aes, out, (byte*)in, sz, iv, ivSz,
+ (byte*)authTag, authTagSz, authIn, authInSz,
+ COP_DECRYPT);
+}
+#endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+
+#ifdef HAVE_AES_ECB
+int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return wc_DevCrypto_AesDirect(aes, out, in, sz, COP_ENCRYPT);
+}
+
+
+int wc_AesEcbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return wc_DevCrypto_AesDirect(aes, out, in, sz, COP_DECRYPT);
+}
+#endif /* HAVE_AES_ECB */
+#endif /* WOLFSSL_DEVCRYPTO_AES */
+#endif /* !NO_AES && WOLFSSL_DEVCRYPTO */
+
diff --git a/wolfcrypt/src/port/devcrypto/devcrypto_hash.c b/wolfcrypt/src/port/devcrypto/devcrypto_hash.c
new file mode 100644
index 0000000..f73224d
--- /dev/null
+++ b/wolfcrypt/src/port/devcrypto/devcrypto_hash.c
@@ -0,0 +1,248 @@
+/* devcrypto_hash.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_DEVCRYPTO_HASH)
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h>
+
+#if !defined(NO_SHA256)
+#include <wolfssl/wolfcrypt/sha256.h>
+#endif
+
+/* dereference structure based on type to get the cryptodev context pointer;
+ * returns NULL on failure */
+static WC_CRYPTODEV* GetHashContext(void* ctx, int type)
+{
+ switch (type) {
+ case CRYPTO_SHA2_256:
+ return &((wc_Sha256*)ctx)->ctx;
+
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+
+/* generic hash initialization
+ * key is the HMAC key and keySz is the size of the key buffer;
+ * key should be NULL for non-HMAC algorithms
+ * returns 0 on success */
+static int HashInit(void* ctx, int type, byte* key, word32 keySz)
+{
+ WC_CRYPTODEV* cdev;
+
+ if ((cdev = GetHashContext(ctx, type)) == NULL) {
+ WOLFSSL_MSG("Unsupported hash type");
+ return BAD_FUNC_ARG;
+ }
+
+ return wc_DevCryptoCreate(cdev, type, key, keySz);
+}
+
+
+/* generic function for updating the hash structure
+ * returns 0 on success */
+static int HashUpdate(void* ctx, int type, const byte* input, word32 inputSz)
+{
+ WC_CRYPTODEV* dev;
+ struct crypt_op crt;
+ byte digest[64];
+
+ if (inputSz == 0) {
+ return 0;
+ }
+
+ if ((dev = GetHashContext(ctx, type)) == NULL) {
+ WOLFSSL_MSG("Unsupported hash type");
+ return BAD_FUNC_ARG;
+ }
+
+ wc_SetupCrypt(&crt, dev, (byte*)input, inputSz, NULL, digest, COP_FLAG_UPDATE);
+ if (ioctl(dev->cfd, CIOCCRYPT, &crt)) {
+ WOLFSSL_MSG("Error with call to ioctl");
+ return WC_DEVCRYPTO_E;
+ }
+
+ return 0;
+}
+
+
+/* generic function for getting final digest value */
+static int GetDigest(void* ctx, int type, byte* out)
+{
+ WC_CRYPTODEV* dev;
+ struct crypt_op crt;
+
+ if ((dev = GetHashContext(ctx, type)) == NULL) {
+ WOLFSSL_MSG("Unsupported hash type");
+ return BAD_FUNC_ARG;
+ }
+
+ wc_SetupCrypt(&crt, dev, NULL, 0, NULL, out, COP_FLAG_FINAL);
+ if (ioctl(dev->cfd, CIOCCRYPT, &crt)) {
+ WOLFSSL_MSG("Error with call to ioctl");
+ return WC_DEVCRYPTO_E;
+ }
+
+ return 0;
+}
+
+#if !defined(NO_SHA256)
+
+int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
+{
+ if (sha == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ (void)devId; /* no async for now */
+ XMEMSET(sha, 0, sizeof(wc_Sha256));
+ sha->heap = heap;
+
+ return HashInit((void*)sha, CRYPTO_SHA2_256, NULL, 0);
+}
+
+
+int wc_Sha256Update(wc_Sha256* sha, const byte* in, word32 sz)
+{
+ if (sha == NULL || (sz > 0 && in == NULL)) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef WOLFSSL_DEVCRYPTO_HASH_KEEP
+ /* keep full message to hash at end instead of incremental updates */
+ if (sha->len < sha->used + sz) {
+ if (sha->msg == NULL) {
+ sha->msg = (byte*)XMALLOC(sha->used + sz, sha->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ } else {
+ byte* pt = (byte*)XREALLOC(sha->msg, sha->used + sz, sha->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ if (pt == NULL) {
+ return MEMORY_E;
+ }
+ sha->msg = pt;
+ }
+ if (sha->msg == NULL) {
+ return MEMORY_E;
+ }
+ sha->len = sha->used + sz;
+ }
+ XMEMCPY(sha->msg + sha->used, in, sz);
+ sha->used += sz;
+ return 0;
+#else
+ return HashUpdate(sha, CRYPTO_SHA2_256, in, sz);
+#endif
+}
+
+
+int wc_Sha256Final(wc_Sha256* sha, byte* hash)
+{
+ int ret;
+
+ if (sha == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* help static analysis tools out */
+ XMEMSET(hash, 0, WC_SHA256_DIGEST_SIZE);
+#ifdef WOLFSSL_DEVCRYPTO_HASH_KEEP
+ /* keep full message to hash at end instead of incremental updates */
+ if ((ret = HashUpdate(sha, CRYPTO_SHA2_256, sha->msg, sha->used)) < 0) {
+ return ret;
+ }
+ XFREE(sha->msg, sha->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ sha->msg = NULL;
+#endif
+ ret = GetDigest(sha, CRYPTO_SHA2_256, hash);
+ if (ret != 0) {
+ return ret;
+ }
+
+ wc_Sha256Free(sha);
+ return wc_InitSha256_ex(sha, sha->heap, 0);
+}
+
+
+int wc_Sha256GetHash(wc_Sha256* sha, byte* hash)
+{
+ if (sha == NULL || hash == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef WOLFSSL_DEVCRYPTO_HASH_KEEP
+ {
+ int ret;
+ wc_Sha256 cpy;
+ wc_Sha256Copy(sha, &cpy);
+
+ if ((ret = HashUpdate(&cpy, CRYPTO_SHA2_256, cpy.msg, cpy.used)) == 0) {
+ /* help static analysis tools out */
+ XMEMSET(hash, 0, WC_SHA256_DIGEST_SIZE);
+ ret = GetDigest(&cpy, CRYPTO_SHA2_256, hash);
+ }
+ wc_Sha256Free(&cpy);
+ return ret;
+ }
+#else
+ (void)sha;
+ (void)hash;
+
+ WOLFSSL_MSG("Compile with WOLFSSL_DEVCRYPTO_HASH_KEEP for this feature");
+ return NOT_COMPILED_IN;
+#endif
+}
+
+int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
+{
+ if (src == NULL || dst == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ wc_InitSha256_ex(dst, src->heap, 0);
+#ifdef WOLFSSL_DEVCRYPTO_HASH_KEEP
+ dst->len = src->len;
+ dst->used = src->used;
+    if (src->len > 0) {
+        dst->msg = (byte*)XMALLOC(src->len, dst->heap,
+                                  DYNAMIC_TYPE_TMP_BUFFER);
+        if (dst->msg == NULL) {
+            return MEMORY_E;
+        }
+        XMEMCPY(dst->msg, src->msg, src->len);
+    }
+#endif
+
+ return 0;
+}
+
+#endif /* !NO_SHA256 */
+
+#endif /* WOLFSSL_DEVCRYPTO_HASH */
diff --git a/wolfcrypt/src/port/devcrypto/wc_devcrypto.c b/wolfcrypt/src/port/devcrypto/wc_devcrypto.c
new file mode 100644
index 0000000..2c80518
--- /dev/null
+++ b/wolfcrypt/src/port/devcrypto/wc_devcrypto.c
@@ -0,0 +1,167 @@
+/* wc_devcrypto.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_DEVCRYPTO)
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h>
+
+/* sets up a context for talking to /dev/crypto
+ * return 0 on success */
+int wc_DevCryptoCreate(WC_CRYPTODEV* ctx, int type, byte* key, word32 keySz)
+{
+ int fd;
+ int isHash = 0; /* flag for if hashing algorithm */
+
+ if (ctx == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* sanity check on session type before creating descriptor */
+ XMEMSET(ctx, 0, sizeof(WC_CRYPTODEV));
+ switch (type) {
+ case CRYPTO_SHA1:
+ case CRYPTO_SHA2_256:
+ isHash = 1;
+ break;
+
+ #ifndef NO_AES
+ case CRYPTO_AES_CTR:
+ case CRYPTO_AES_ECB:
+ case CRYPTO_AES_GCM:
+ case CRYPTO_AES_CBC:
+ isHash = 0;
+ break;
+ #endif
+
+ default:
+ WOLFSSL_MSG("Unknown / Unimplemented algorithm type");
+ return BAD_FUNC_ARG;
+ }
+
+ /* create descriptor */
+ if ((fd = open("/dev/crypto", O_RDWR, 0)) < 0) {
+ WOLFSSL_MSG("Error opening /dev/crypto is cryptodev module loaded?");
+ return WC_DEVCRYPTO_E;
+ }
+ if (fcntl(fd, F_SETFD, 1) == -1) {
+ WOLFSSL_MSG("Error setting F_SETFD with fcntl");
+ close(fd);
+ return WC_DEVCRYPTO_E;
+ }
+
+ /* set up session */
+ ctx->cfd = fd;
+
+ if (isHash) {
+ ctx->sess.mac = type;
+ }
+ else {
+ ctx->sess.cipher = type;
+ ctx->sess.key = (void*)key;
+ ctx->sess.keylen = keySz;
+ }
+
+ if (ioctl(ctx->cfd, CIOCGSESSION, &ctx->sess)) {
+ close(fd);
+ WOLFSSL_MSG("Error starting cryptodev session");
+ return WC_DEVCRYPTO_E;
+ }
+
+ (void)key;
+ (void)keySz;
+
+ return 0;
+}
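+
+#if 0
+/* Illustrative usage sketch (not compiled): the typical lifecycle of a
+ * standalone cryptodev session created with the helper above. The function
+ * name and key bytes are placeholders. */
+static void devcrypto_session_example(void)
+{
+    WC_CRYPTODEV ctx;
+    byte key[16] = {0};
+
+    if (wc_DevCryptoCreate(&ctx, CRYPTO_AES_CBC, key, sizeof(key)) == 0) {
+        /* ... issue CIOCCRYPT requests against ctx.cfd ... */
+        wc_DevCryptoFree(&ctx); /* ends the session and closes the fd */
+    }
+}
+#endif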
+
+
+/* free up descriptor and session used with ctx */
+void wc_DevCryptoFree(WC_CRYPTODEV* ctx)
+{
+ if (ctx != NULL && ctx->cfd >= 0) {
+ if (ioctl(ctx->cfd, CIOCFSESSION, &ctx->sess.ses)) {
+ WOLFSSL_MSG("Error stopping cryptodev session");
+ }
+ close(ctx->cfd);
+ }
+}
+
+
+/* setup crypt_op structure */
+void wc_SetupCrypt(struct crypt_op* crt, WC_CRYPTODEV* dev,
+ byte* src, int srcSz, byte* dst, byte* dig, int flag)
+
+{
+ XMEMSET(crt, 0, sizeof(struct crypt_op));
+ crt->ses = dev->sess.ses;
+ crt->src = src;
+ crt->len = srcSz;
+ crt->dst = dst;
+ crt->mac = dig;
+ crt->flags = flag;
+}
+
+
+/* setup crypt_op structure for symmetric key operations */
+void wc_SetupCryptSym(struct crypt_op* crt, WC_CRYPTODEV* dev,
+ byte* src, word32 srcSz, byte* dst, byte* iv, int flag)
+
+{
+ XMEMSET(crt, 0, sizeof(struct crypt_op));
+ crt->ses = dev->sess.ses;
+ crt->src = src;
+ crt->len = srcSz;
+ crt->dst = dst;
+ crt->iv = iv;
+ crt->op = flag;
+}
+
+
+/* setup crypt_auth_op structure for aead operations */
+void wc_SetupCryptAead(struct crypt_auth_op* crt, WC_CRYPTODEV* dev,
+ byte* src, word32 srcSz, byte* dst, byte* iv, word32 ivSz, int flag,
+ byte* authIn, word32 authInSz, byte* authTag, word32 authTagSz)
+{
+    XMEMSET(crt, 0, sizeof(struct crypt_auth_op));
+ crt->ses = dev->sess.ses;
+ crt->src = src;
+ crt->len = srcSz;
+ crt->dst = dst;
+ crt->iv = iv;
+ crt->iv_len = ivSz;
+ crt->op = flag;
+
+ /* also set auth in and tag */
+ crt->auth_src = authIn;
+ crt->auth_len = authInSz;
+ crt->tag = authTag;
+ crt->tag_len = authTagSz;
+}
+#endif /* WOLFSSL_DEVCRYPTO */
+
diff --git a/wolfcrypt/src/port/intel/README.md b/wolfcrypt/src/port/intel/README.md
new file mode 100644
index 0000000..4b5d971
--- /dev/null
+++ b/wolfcrypt/src/port/intel/README.md
@@ -0,0 +1,3 @@
+# Intel QuickAssist Adapter Asynchronous Support
+
+Please contact wolfSSL at [email protected] to request an evaluation.
diff --git a/wolfcrypt/src/port/intel/quickassist.c b/wolfcrypt/src/port/intel/quickassist.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/wolfcrypt/src/port/intel/quickassist.c
diff --git a/wolfcrypt/src/port/intel/quickassist_mem.c b/wolfcrypt/src/port/intel/quickassist_mem.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/wolfcrypt/src/port/intel/quickassist_mem.c
diff --git a/wolfcrypt/src/port/intel/quickassist_sync.c b/wolfcrypt/src/port/intel/quickassist_sync.c
new file mode 100644
index 0000000..e03bca9
--- /dev/null
+++ b/wolfcrypt/src/port/intel/quickassist_sync.c
@@ -0,0 +1,2004 @@
+/* quickassist_sync.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL. (formerly known as CyaSSL)
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef HAVE_INTEL_QA_SYNC
+
+#ifdef QAT_DEMO_MAIN
+ #define QAT_DEBUG
+#endif
+
+
+#include <wolfssl/internal.h>
+#include <wolfssl/error-ssl.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#ifndef NO_AES
+ #include <wolfssl/wolfcrypt/aes.h>
+#endif
+
+#include <wolfssl/wolfcrypt/cryptocb.h>
+#include <wolfssl/wolfcrypt/port/intel/quickassist_sync.h>
+
+#include "cpa.h"
+#include "cpa_cy_im.h"
+#include "cpa_cy_sym.h"
+#include "cpa_cy_rsa.h"
+#include "cpa_cy_ln.h"
+#include "cpa_cy_ecdh.h"
+#include "cpa_cy_ecdsa.h"
+#include "cpa_cy_dh.h"
+#include "cpa_cy_drbg.h"
+#include "cpa_cy_nrbg.h"
+#include "cpa_cy_prime.h"
+
+#include "icp_sal_user.h"
+#include "icp_sal_poll.h"
+
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+/* User space utils */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#if 0
+ /* Optional feature for partial QAT hashing support */
+ /* This will process updates through hardware instead of caching them */
+ #define QAT_HASH_ENABLE_PARTIAL
+#endif
+#ifdef QAT_HASH_ENABLE_PARTIAL
+ #define MAX_QAT_HASH_BUFFERS 2
+#endif
+
+/* Detect QAT driver version */
+#if defined(CPA_CY_API_VERSION_NUM_MAJOR) && CPA_CY_API_VERSION_NUM_MAJOR > 1
+ #define QAT_V2
+#endif
+
+#ifdef QAT_V2
+ /* quickassist/utilities/libusdm_drv/qae_mem.h */
+    /* Provides user-space APIs for accessing NUMA-allocated memory through usdm_drv */
+    #include "qae_mem.h"
+    #include "linux/include/qae_mem_utils.h"
+#endif
+
+#ifdef QAT_USE_POLLING_THREAD
+ #include <pthread.h>
+#endif
+
+/* Tunable parameters */
+#ifndef QAT_PROCESS_NAME
+ #define QAT_PROCESS_NAME "SSL"
+#endif
+#ifndef QAT_LIMIT_DEV_ACCESS
+ #define QAT_LIMIT_DEV_ACCESS CPA_FALSE
+#endif
+#ifndef QAT_MAX_DEVICES
+ #define QAT_MAX_DEVICES (1) /* maximum number of QAT cards */
+#endif
+
+#ifndef QAT_RETRY_LIMIT
+ #define QAT_RETRY_LIMIT (100)
+#endif
+#ifndef QAT_POLL_RESP_QUOTA
+ #define QAT_POLL_RESP_QUOTA (0) /* all pending */
+#endif
+
+#if !defined(NO_AES) || !defined(NO_DES3)
+ #define QAT_ENABLE_CRYPTO
+#endif
+
+/* Pre-declarations */
+struct IntelQaDev;
+struct wc_CryptoInfo;
+struct WC_BIGINT;
+struct WC_RNG;
+
+
+#if defined(QAT_ENABLE_HASH) || defined(QAT_ENABLE_CRYPTO)
+/* symmetric context */
+typedef struct IntelQaSymCtx {
+ CpaCySymOpData opData;
+ CpaCySymSessionCtx symCtxSrc;
+ CpaCySymSessionCtx symCtx;
+ word32 symCtxSize;
+
+ /* flags */
+ word32 isOpen:1;
+ word32 isCopy:1;
+} IntelQaSymCtx;
+#endif
+
+typedef void (*IntelQaFreeFunc)(struct IntelQaDev*);
+
+
+/* QuickAssist device */
+typedef struct IntelQaDev {
+ CpaInstanceHandle handle;
+ int devId;
+ void* heap;
+
+ /* callback return info */
+ int ret;
+ byte* out;
+ union {
+ word32* outLenPtr;
+ word32 outLen;
+ };
+
+ /* operations */
+ IntelQaFreeFunc freeFunc;
+ union {
+ #ifdef QAT_ENABLE_CRYPTO
+ struct {
+ IntelQaSymCtx ctx;
+ CpaBufferList bufferList;
+ CpaFlatBuffer flatBuffer;
+ byte* authTag;
+ word32 authTagSz;
+ } cipher;
+ #endif
+ } op;
+
+#ifdef QAT_USE_POLLING_THREAD
+ pthread_t pollingThread;
+ byte pollingCy;
+#endif
+} IntelQaDev;
+
+
+/* Interface */
+static int IntelQaHardwareStart(const char*, int);
+static void IntelQaHardwareStop(void);
+static int IntelQaInit(void*);
+static void IntelQaDeInit(int);
+static int IntelQaNumInstances(void);
+static int IntelQaOpen(IntelQaDev*, int);
+static void IntelQaClose(IntelQaDev*);
+static int IntelQaDevCopy(IntelQaDev*, IntelQaDev*);
+static int IntelQaPoll(IntelQaDev*);
+static int IntelQaGetCyInstanceCount(void);
+
+#ifndef NO_AES
+ #ifdef HAVE_AES_CBC
+ static int IntelQaSymAesCbcEncrypt(IntelQaDev*, byte*,
+ const byte*, word32, const byte*, word32, const byte*, word32);
+ #ifdef HAVE_AES_DECRYPT
+ static int IntelQaSymAesCbcDecrypt(IntelQaDev*, byte*,
+ const byte*, word32, const byte*, word32, const byte*, word32);
+ #endif /* HAVE_AES_DECRYPT */
+ #endif /* HAVE_AES_CBC */
+
+ #ifdef HAVE_AESGCM
+ static int IntelQaSymAesGcmEncrypt(IntelQaDev*, byte*,
+ const byte*, word32, const byte*, word32, const byte*, word32,
+ byte*, word32, const byte*, word32);
+ #ifdef HAVE_AES_DECRYPT
+ static int IntelQaSymAesGcmDecrypt(IntelQaDev*, byte*,
+ const byte*, word32, const byte*, word32, const byte*, word32,
+ const byte*, word32, const byte*, word32);
+ #endif /* HAVE_AES_DECRYPT */
+ #endif /* HAVE_AESGCM */
+#endif /* !NO_AES */
+
+#ifndef NO_DES3
+ static int IntelQaSymDes3CbcEncrypt(IntelQaDev*, byte*,
+ const byte*, word32, const byte*, word32, const byte* iv, word32);
+ static int IntelQaSymDes3CbcDecrypt(IntelQaDev* dev, byte*,
+ const byte*, word32, const byte*, word32, const byte* iv, word32);
+#endif /* !NO_DES3 */
+
+#ifdef WOLF_CRYPTO_CB
+ static int IntelQaSymSync_CryptoDevCb(int, struct wc_CryptoInfo*,
+ void*);
+#endif /* WOLF_CRYPTO_CB */
+
+
+#ifdef QAT_DEBUG
+ #define QLOG(...) do { printf(__VA_ARGS__); } while (0)
+#else
+ #define QLOG(...)
+#endif
+
+
+#define OS_HOST_TO_NW_32(uData) ByteReverseWord32(uData)
+
+
+static CpaInstanceHandle* g_cyInstances = NULL;
+static CpaInstanceInfo2* g_cyInstanceInfo = NULL;
+static Cpa32U* g_cyInstMap = NULL;
+static Cpa16U g_numInstances = 0;
+static Cpa16U g_instCounter = 0;
+static CpaBoolean g_cyServiceStarted = CPA_FALSE;
+#ifdef QAT_USE_POLLING_CHECK
+ static CpaBoolean* g_cyPolling = NULL;
+ static pthread_mutex_t* g_PollLock;
+#endif
+static volatile int g_initCount = 0;
+static pthread_mutex_t g_Hwlock = PTHREAD_MUTEX_INITIALIZER;
+
+
+typedef struct qatCapabilities {
+ /* capabilities */
+ word32 supPartial:1;
+ word32 supSha3:1;
+} qatCapabilities_t;
+static qatCapabilities_t g_qatCapabilities = {0};
+
+
+#if defined(QAT_ENABLE_CRYPTO)
+ static int IntelQaSymClose(IntelQaDev* dev, int doFree);
+#endif
+
+
+extern Cpa32U osalLogLevelSet(Cpa32U level);
+
+
+static IntelQaDev qaDev;
+
+
+/* -------------------------------------------------------------------------- */
+/* Polling */
+/* -------------------------------------------------------------------------- */
+
+static WC_INLINE int SyncSleep(word32 ms)
+{
+ int ret = 0;
+ struct timespec resTime, remTime;
+ resTime.tv_sec = ms/1000;
+ resTime.tv_nsec = (ms%1000)*1000000;
+ do {
+ ret = nanosleep(&resTime, &remTime);
+ resTime = remTime;
+ } while ((ret!=0) && (errno == EINTR));
+
+ if (ret != 0) {
+ QLOG("nanoSleep failed with code %d\n", ret);
+ return BAD_FUNC_ARG;
+ }
+
+ return ret;
+}
+
+#ifdef QAT_USE_POLLING_THREAD
+static void* IntelQaPollingThread(void* context)
+{
+ IntelQaDev* dev = (IntelQaDev*)context;
+
+ QLOG("Polling Thread Start\n");
+ while (dev->pollingCy) {
+ icp_sal_CyPollInstance(dev->handle, QAT_POLL_RESP_QUOTA);
+ SyncSleep(10);
+ }
+ QLOG("Polling Thread Exit\n");
+ pthread_exit(NULL);
+}
+
+static CpaStatus IntelQaStartPollingThread(IntelQaDev* dev)
+{
+ if (dev->pollingCy == 0) {
+ dev->pollingCy = 1;
+
+ QLOG("Polling Thread Created\n");
+
+ if (pthread_create(&dev->pollingThread, NULL, IntelQaPollingThread,
+ (void*)dev) != 0) {
+ QLOG("Failed create polling thread!\n");
+ return CPA_STATUS_FAIL;
+ }
+ }
+ return CPA_STATUS_SUCCESS;
+}
+
+static void IntelQaStopPollingThread(IntelQaDev* dev)
+{
+ dev->pollingCy = 0;
+ pthread_join(dev->pollingThread, 0);
+}
+#endif /* QAT_USE_POLLING_THREAD */
+
+
+/* -------------------------------------------------------------------------- */
+/* Device */
+/* -------------------------------------------------------------------------- */
+void IntelQaHardwareStop(void)
+{
+ int i;
+ CpaStatus status;
+
+ g_initCount--; /* track de-init count */
+ if (g_initCount != 0) {
+ return;
+ }
+
+ if (g_cyServiceStarted == CPA_TRUE) {
+ g_cyServiceStarted = CPA_FALSE;
+ for (i=0; i<g_numInstances; i++) {
+ status = cpaCyStopInstance(g_cyInstances[i]);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Could not stop instance: %d\n"
+ "\tInternal error has occur which probably can only be"
+ "fixed by a reboot\n", i);
+ }
+ }
+ }
+
+ status = icp_sal_userStop();
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Could not stop sal for user space (status %d)\n",
+ status);
+ }
+
+ if (g_cyInstMap) {
+ XFREE(g_cyInstMap, NULL, DYNAMIC_TYPE_ASYNC);
+ g_cyInstMap = NULL;
+ }
+
+ if (g_cyInstanceInfo) {
+ XFREE(g_cyInstanceInfo, NULL, DYNAMIC_TYPE_ASYNC);
+ g_cyInstanceInfo = NULL;
+ }
+
+#ifdef QAT_USE_POLLING_CHECK
+ if (g_cyPolling) {
+ XFREE(g_cyPolling, NULL, DYNAMIC_TYPE_ASYNC);
+ g_cyPolling = NULL;
+ }
+ if (g_PollLock) {
+ for (i=0; i<g_numInstances; i++) {
+ pthread_mutex_destroy(&g_PollLock[i]);
+ }
+ XFREE(g_PollLock, NULL, DYNAMIC_TYPE_ASYNC);
+ g_PollLock = NULL;
+ }
+#endif
+
+ if (g_cyInstances) {
+ XFREE(g_cyInstances, NULL, DYNAMIC_TYPE_ASYNC);
+ g_cyInstances = NULL;
+ g_numInstances = 0;
+ }
+
+ qaeMemDestroy();
+
+ QLOG("IntelQA: Stop\n");
+}
+
+
+int IntelQaHardwareStart(const char* process_name, int limitDevAccess)
+{
+ int ret = 0, i;
+ CpaStatus status;
+
+ g_initCount++;
+ if (g_initCount > 1) {
+ return 0;
+ }
+
+ status = qaeMemInit();
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Could not start qae mem for user space (status %d)\n"
+ "\tHas the qaeMemDrv.ko module been loaded?\n",
+ status);
+ return ASYNC_INIT_E;
+ }
+
+ status = icp_sal_userStartMultiProcess(process_name,
+ limitDevAccess ? CPA_TRUE : CPA_FALSE);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Could not start sal for user space! status %d\n",
+ status);
+ ret = ASYNC_INIT_E; goto error;
+ }
+
+#ifdef QAT_DEBUG
+ /* optionally enable debugging */
+    /* osalLogLevelSet(8); */
+#endif
+
+ status = cpaCyGetNumInstances(&g_numInstances);
+ if (status != CPA_STATUS_SUCCESS || g_numInstances == 0) {
+ QLOG("IntelQA: Failed to get num of instances! status %d\n", status);
+ ret = INVALID_DEVID; goto error;
+ }
+
+ /* Get handles / info */
+ g_cyInstances = (CpaInstanceHandle*)XMALLOC(
+ sizeof(CpaInstanceHandle) * g_numInstances, NULL, DYNAMIC_TYPE_ASYNC);
+ if (g_cyInstances == NULL) {
+ QLOG("IntelQA: Failed to allocate instances\n");
+ ret = INVALID_DEVID; goto error;
+ }
+
+#ifdef QAT_USE_POLLING_CHECK
+ g_cyPolling = (CpaBoolean*)XMALLOC(sizeof(CpaBoolean) * g_numInstances, NULL,
+ DYNAMIC_TYPE_ASYNC);
+ if (g_cyPolling == NULL) {
+ QLOG("IntelQA: Failed to allocate polling status\n");
+ ret = INVALID_DEVID; goto error;
+ }
+ g_PollLock = (pthread_mutex_t*)XMALLOC(sizeof(pthread_mutex_t) *
+ g_numInstances, NULL, DYNAMIC_TYPE_ASYNC);
+ if (g_PollLock == NULL) {
+ QLOG("IntelQA: Failed to allocate polling locks\n");
+ ret = INVALID_DEVID; goto error;
+ }
+ for (i=0; i<g_numInstances; i++) {
+ pthread_mutex_init(&g_PollLock[i], NULL);
+ }
+#endif
+
+ g_cyInstanceInfo = (CpaInstanceInfo2*)XMALLOC(
+ sizeof(CpaInstanceInfo2) * g_numInstances, NULL, DYNAMIC_TYPE_ASYNC);
+ if (g_cyInstanceInfo == NULL) {
+ QLOG("IntelQA: Failed to allocate instance info\n");
+ ret = INVALID_DEVID; goto error;
+ }
+
+ g_cyInstMap = (Cpa32U*)XMALLOC(
+ sizeof(Cpa32U) * g_numInstances, NULL, DYNAMIC_TYPE_ASYNC);
+ if (g_cyInstMap == NULL) {
+ QLOG("IntelQA: Failed to allocate instance map\n");
+ ret = INVALID_DEVID; goto error;
+ }
+
+ status = cpaCyGetInstances(g_numInstances, g_cyInstances);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Failed to get IntelQA instances\n");
+ ret = INVALID_DEVID; goto error;
+ }
+
+ /* start all instances */
+ g_cyServiceStarted = CPA_TRUE;
+ for (i=0; i<g_numInstances; i++) {
+ Cpa32U coreAffinity = 0;
+ CpaCySymCapabilitiesInfo capabilities;
+ int j;
+ XMEMSET(&capabilities, 0, sizeof(capabilities));
+
+ status = cpaCyInstanceGetInfo2(g_cyInstances[i],
+ &g_cyInstanceInfo[i]);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Error getting instance info for %d\n", i);
+ ret = INVALID_DEVID; goto error;
+ }
+
+        /* loop over the instanceInfo coreAffinity bitmask to find the core */
+        for (j=0; j<CPA_MAX_CORES; j++) {
+            if (CPA_BITMAP_BIT_TEST(g_cyInstanceInfo[i].coreAffinity, j)) {
+                coreAffinity = j;
+                break;
+            }
+        }
+ g_cyInstMap[i] = coreAffinity;
+
+ /* capabilities */
+ status = cpaCySymQueryCapabilities(g_cyInstances[i], &capabilities);
+ if (status == CPA_STATUS_SUCCESS) {
+ g_qatCapabilities.supPartial = capabilities.partialPacketSupported;
+ if (capabilities.partialPacketSupported != CPA_TRUE) {
+ QLOG("Warning: QAT does not support partial packets!\n");
+ }
+ }
+
+ QLOG("Inst %d, Node: %d, Affin: %u, Dev: %u, Accel %u, "
+ "EE %u, BDF %02X:%02X:%02X, isPolled %d\n",
+ i, g_cyInstanceInfo[i].nodeAffinity, coreAffinity,
+ g_cyInstanceInfo[i].physInstId.packageId,
+ g_cyInstanceInfo[i].physInstId.acceleratorId,
+ g_cyInstanceInfo[i].physInstId.executionEngineId,
+ (Cpa8U)((g_cyInstanceInfo[i].physInstId.busAddress) >> 8),
+ (Cpa8U)((g_cyInstanceInfo[i].physInstId.busAddress)
+ & 0xFF) >> 3,
+ (Cpa8U)((g_cyInstanceInfo[i].physInstId.busAddress) & 3),
+ g_cyInstanceInfo[i].isPolled);
+
+ status = cpaCySetAddressTranslation(g_cyInstances[i],
+ qaeVirtToPhysNUMA);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Error setting memory config for inst %d\n", i);
+ ret = INVALID_DEVID; goto error;
+ }
+
+ status = cpaCyStartInstance(g_cyInstances[i]);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("IntelQA: Error starting crypto instance %d\n", i);
+ ret = INVALID_DEVID; goto error;
+ }
+ }
+
+ QLOG("IntelQA: Instances %d\n", g_numInstances);
+ return ret;
+
+error:
+ IntelQaHardwareStop();
+ return ret;
+}
+
+
+int IntelQaInit(void* threadId)
+{
+ int ret;
+ int devId;
+ (void)threadId;
+
+ ret = pthread_mutex_lock(&g_Hwlock);
+ if (ret != 0) {
+ QLOG("IntelQaInit: mutex lock failed! %d\n", ret);
+ return BAD_MUTEX_E;
+ }
+
+ ret = IntelQaHardwareStart(QAT_PROCESS_NAME, QAT_LIMIT_DEV_ACCESS);
+ if (ret != 0) {
+ pthread_mutex_unlock(&g_Hwlock);
+ return ret;
+ }
+
+ if (g_numInstances <= 0) {
+ pthread_mutex_unlock(&g_Hwlock);
+ return ASYNC_INIT_E;
+ }
+
+ /* assign device id */
+ devId = (g_instCounter % g_numInstances);
+ g_instCounter++;
+
+ pthread_mutex_unlock(&g_Hwlock);
+
+ return devId;
+}
+
+
+int IntelQaNumInstances(void)
+{
+ return g_numInstances;
+}
+
+
+int IntelQaOpen(IntelQaDev* dev, int devId)
+{
+ if (dev == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* clear device info */
+ XMEMSET(dev, 0, sizeof(IntelQaDev));
+
+ if (g_cyInstances == NULL) {
+ QLOG("IntelQA not initialized\n");
+ return ASYNC_INIT_E;
+ }
+
+ dev->devId = devId;
+ dev->handle = g_cyInstances[devId];
+
+#ifdef QAT_USE_POLLING_THREAD
+ /* start polling thread */
+ IntelQaStartPollingThread(dev);
+#endif
+
+ return 0;
+}
+
+
+#if defined(QAT_ENABLE_CRYPTO)
+
+static IntelQaSymCtx* IntelQaGetSymCtx(IntelQaDev* dev)
+{
+ return &dev->op.cipher.ctx;
+}
+
+#endif
+
+
+void IntelQaClose(IntelQaDev* dev)
+{
+ if (dev) {
+ QLOG("IntelQaClose %p\n", dev);
+ /* close any active session */
+ IntelQaSymClose(dev, 1);
+
+ #ifdef QAT_USE_POLLING_THREAD
+ IntelQaStopPollingThread(dev);
+ #endif
+
+ dev->handle = NULL;
+ }
+}
+
+void IntelQaDeInit(int devId)
+{
+ (void)devId;
+
+ if (pthread_mutex_lock(&g_Hwlock) == 0) {
+ IntelQaHardwareStop();
+ pthread_mutex_unlock(&g_Hwlock);
+ }
+}
+
+int IntelQaPoll(IntelQaDev* dev)
+{
+ int ret = 0;
+ CpaStatus status;
+
+#ifdef QAT_USE_POLLING_CHECK
+    pthread_mutex_t* lock = &g_PollLock[dev->devId];
+    if (pthread_mutex_lock(lock) == 0) {
+        /* test if any other threads are polling */
+        if (g_cyPolling[dev->devId]) {
+            pthread_mutex_unlock(lock);
+
+            /* return success even though it's busy; the caller will treat
+             * this as WC_PENDING_E */
+            return 0;
+        }
+
+        g_cyPolling[dev->devId] = 1;
+        pthread_mutex_unlock(lock);
+ }
+#endif
+
+ status = icp_sal_CyPollInstance(dev->handle, QAT_POLL_RESP_QUOTA);
+ if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_RETRY) {
+ QLOG("IntelQa: Poll failure %d\n", status);
+ ret = -1;
+ }
+
+    if (dev->ret != WC_PENDING_E) {
+        /* perform cleanup */
+        IntelQaFreeFunc freeFunc = dev->freeFunc;
+        QLOG("IntelQaOpFree: Dev %p, FreeFunc %p\n", dev, freeFunc);
+        if (freeFunc) {
+            dev->freeFunc = NULL;
+            freeFunc(dev);
+        }
+    }
+
+#ifdef QAT_USE_POLLING_CHECK
+ /* indicate we are done polling */
+ if (pthread_mutex_lock(lock) == 0) {
+        g_cyPolling[dev->devId] = 0;
+ pthread_mutex_unlock(lock);
+ }
+#endif
+
+ return ret;
+}
+
+static int IntelQaPollBlockRet(IntelQaDev* dev, int ret_wait)
+{
+ int ret;
+
+ do {
+ ret = IntelQaPoll(dev);
+
+ if (dev->ret != ret_wait) {
+ break;
+ }
+ } while (1);
+ ret = dev->ret;
+
+ return ret;
+}
+
+int IntelQaGetCyInstanceCount(void)
+{
+ return g_numInstances;
+}
+
+static WC_INLINE int IntelQaHandleCpaStatus(IntelQaDev* dev, CpaStatus status,
+ int* ret, byte isAsync, void* callback, int* retryCount)
+{
+ int retry = 0;
+
+ if (status == CPA_STATUS_SUCCESS) {
+ if (isAsync && callback) {
+ *ret = WC_PENDING_E;
+ }
+ else {
+ *ret = IntelQaPollBlockRet(dev, WC_PENDING_E);
+ }
+ }
+ else if (status == CPA_STATUS_RETRY) {
+ (*retryCount)++;
+ if ((*retryCount % (QAT_RETRY_LIMIT + 1)) == QAT_RETRY_LIMIT) {
+ SyncSleep(10);
+ }
+ retry = 1;
+ }
+ else {
+ *ret = ASYNC_OP_E;
+ }
+
+ return retry;
+}
+
+static WC_INLINE void IntelQaOpInit(IntelQaDev* dev, IntelQaFreeFunc freeFunc)
+{
+ dev->ret = WC_PENDING_E;
+ dev->freeFunc = freeFunc;
+}
+
+
+/* -------------------------------------------------------------------------- */
+/* Symmetric Algos */
+/* -------------------------------------------------------------------------- */
+
+#if defined(QAT_ENABLE_CRYPTO)
+
+static int IntelQaSymOpen(IntelQaDev* dev, CpaCySymSessionSetupData* setup,
+ CpaCySymCbFunc callback)
+{
+ int ret = 0;
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U sessionCtxSize = 0;
+ IntelQaSymCtx* ctx;
+
+ /* arg check */
+ if (dev == NULL || setup == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ctx = IntelQaGetSymCtx(dev);
+
+ /* Determine size of session context to allocate - use max size */
+ status = cpaCySymSessionCtxGetSize(dev->handle, setup, &sessionCtxSize);
+
+ if (ctx->symCtxSize > 0 && ctx->symCtxSize > sessionCtxSize) {
+ QLOG("Symmetric context size error! Buf %d, Exp %d\n",
+ ctx->symCtxSize, sessionCtxSize);
+ return ASYNC_OP_E;
+ }
+
+ /* make sure session context is allocated */
+ if (ctx->symCtx == NULL) {
+ /* Allocate session context */
+ ctx->symCtx = XMALLOC(sessionCtxSize, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA64);
+ if (ctx->symCtx == NULL) {
+ return MEMORY_E;
+ }
+ }
+ ctx->symCtxSize = sessionCtxSize;
+
+ if (!ctx->isOpen) {
+ ctx->isOpen = 1;
+
+ QLOG("IntelQaSymOpen: InitSession dev %p, symCtx %p\n",
+ dev, ctx->symCtx);
+
+ /* open symmetric session */
+ status = cpaCySymInitSession(dev->handle, callback, setup, ctx->symCtx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QLOG("cpaCySymInitSession failed! dev %p, status %d\n",
+ dev, status);
+ XFREE(ctx->symCtx, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA64);
+ ctx->symCtx = NULL;
+ return ASYNC_INIT_E;
+ }
+ }
+
+ if (ctx->symCtxSrc == NULL) {
+ ctx->symCtxSrc = ctx->symCtx;
+ }
+
+ QLOG("IntelQaSymOpen: dev %p, symCtx %p (src %p), "
+ "symCtxSize %d, isCopy %d, isOpen %d\n",
+ dev, ctx->symCtx, ctx->symCtxSrc, ctx->symCtxSize,
+ ctx->isCopy, ctx->isOpen);
+
+ return ret;
+}
+
+static int IntelQaSymClose(IntelQaDev* dev, int doFree)
+{
+ int ret = 0;
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ IntelQaSymCtx* ctx;
+
+ if (dev == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ ctx = IntelQaGetSymCtx(dev);
+
+ QLOG("IntelQaSymClose: dev %p, ctx %p, symCtx %p (src %p), "
+ "symCtxSize %d, isCopy %d, isOpen %d, doFree %d\n",
+ dev, ctx, ctx->symCtx, ctx->symCtxSrc, ctx->symCtxSize,
+ ctx->isCopy, ctx->isOpen, doFree);
+
+ if (ctx->symCtx == ctx->symCtxSrc && ctx->symCtx != NULL) {
+ if (ctx->isOpen) {
+ ctx->isOpen = 0;
+ QLOG("IntelQaSymClose: RemoveSession dev %p, symCtx %p\n",
+ dev, ctx->symCtx);
+ status = cpaCySymRemoveSession(dev->handle, ctx->symCtx);
+ if (status == CPA_STATUS_RETRY) {
+ QLOG("cpaCySymRemoveSession retry!\n");
+ /* treat this as error, since session should not be active */
+ ret = ASYNC_OP_E;
+ }
+ else if (status != CPA_STATUS_SUCCESS) {
+ QLOG("cpaCySymRemoveSession failed! status %d\n", status);
+ ret = ASYNC_OP_E;
+ }
+ }
+ }
+
+ if (doFree) {
+ XFREE(ctx->symCtx, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA64);
+ ctx->symCtx = NULL;
+ ctx->symCtxSrc = NULL;
+ ctx->symCtxSize = 0;
+ }
+
+ return ret;
+}
+
+#endif /* QAT_ENABLE_CRYPTO */
+
+
+/* -------------------------------------------------------------------------- */
+/* AES/DES Algo */
+/* -------------------------------------------------------------------------- */
+
+#ifdef QAT_ENABLE_CRYPTO
+
+static void IntelQaSymCipherFree(IntelQaDev* dev)
+{
+ IntelQaSymCtx* ctx = &dev->op.cipher.ctx;
+ CpaCySymOpData* opData = &ctx->opData;
+ CpaBufferList* pDstBuffer = &dev->op.cipher.bufferList;
+
+ if (opData) {
+ if (opData->pAdditionalAuthData) {
+ XFREE(opData->pAdditionalAuthData, dev->heap,
+ DYNAMIC_TYPE_ASYNC_NUMA);
+ opData->pAdditionalAuthData = NULL;
+ }
+ if (opData->pIv) {
+ XFREE(opData->pIv, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+ opData->pIv = NULL;
+ }
+ XMEMSET(opData, 0, sizeof(CpaCySymOpData));
+ }
+ if (pDstBuffer) {
+ if (pDstBuffer->pBuffers) {
+ if (pDstBuffer->pBuffers->pData) {
+ XFREE(pDstBuffer->pBuffers->pData, dev->heap,
+ DYNAMIC_TYPE_ASYNC_NUMA);
+ pDstBuffer->pBuffers->pData = NULL;
+ }
+ XMEMSET(pDstBuffer->pBuffers, 0, sizeof(CpaFlatBuffer));
+ }
+ if (pDstBuffer->pPrivateMetaData) {
+ XFREE(pDstBuffer->pPrivateMetaData, dev->heap,
+ DYNAMIC_TYPE_ASYNC_NUMA);
+ pDstBuffer->pPrivateMetaData = NULL;
+ }
+ XMEMSET(pDstBuffer, 0, sizeof(CpaBufferList));
+ }
+
+ /* close and free sym context */
+ IntelQaSymClose(dev, 1);
+
+ /* clear temp pointers */
+ dev->out = NULL;
+ dev->outLen = 0;
+#ifndef NO_AES
+ if (dev->op.cipher.authTag != NULL) {
+ XMEMSET(dev->op.cipher.authTag, 0, dev->op.cipher.authTagSz);
+ XFREE(dev->op.cipher.authTag, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+ dev->op.cipher.authTag = NULL;
+ }
+ dev->op.cipher.authTagSz = 0;
+#endif
+}
+
+static int IntelQaSymCipher(IntelQaDev* dev, byte* out, const byte* in,
+ word32 inOutSz, const byte* key, word32 keySz, const byte* iv, word32 ivSz,
+ CpaCySymOp symOperation, CpaCySymCipherAlgorithm cipherAlgorithm,
+ CpaCySymCipherDirection cipherDirection,
+
+ /* for auth ciphers (CCM or GCM) */
+ CpaCySymHashAlgorithm hashAlgorithm,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ int ret;
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ CpaCySymOpData* opData = NULL;
+ CpaCySymSessionSetupData setup;
+ const Cpa32U numBuffers = 1;
+ CpaBufferList* bufferList = NULL;
+ CpaFlatBuffer* flatBuffer = NULL;
+ Cpa8U* ivBuf = NULL;
+ Cpa8U* dataBuf = NULL;
+ Cpa32U dataLen = inOutSz;
+ Cpa8U* metaBuf = NULL;
+ Cpa32U metaSize = 0;
+ Cpa8U* authInBuf = NULL;
+ Cpa32U authInSzAligned = authInSz;
+ Cpa8U* authTagBuf = NULL;
+ IntelQaSymCtx* ctx;
+ CpaBoolean verifyResult = CPA_FALSE;
+
+ QLOG("IntelQaSymCipher: dev %p, out %p, in %p, inOutSz %d, op %d, "
+ "algo %d, dir %d, hash %d\n",
+ dev, out, in, inOutSz, symOperation, cipherAlgorithm,
+ cipherDirection, hashAlgorithm);
+
+ /* check args */
+ if (out == NULL || in == NULL || inOutSz == 0 ||
+ key == NULL || keySz == 0 || iv == NULL || ivSz == 0) {
+ return BAD_FUNC_ARG;
+ }
+ if (hashAlgorithm != CPA_CY_SYM_HASH_NONE &&
+ (authTag == NULL || authTagSz == 0)) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* get meta size */
+ status = cpaCyBufferListGetMetaSize(dev->handle, numBuffers, &metaSize);
+ if (status != CPA_STATUS_SUCCESS && metaSize <= 0) {
+ ret = BUFFER_E; goto exit;
+ }
+
+ /* if authtag provided then it will be appended to end of input */
+ if (authTag && authTagSz > 0) {
+ dataLen += authTagSz;
+ }
+
+ /* allocate buffers */
+ ctx = &dev->op.cipher.ctx;
+ opData = &ctx->opData;
+ bufferList = &dev->op.cipher.bufferList;
+ flatBuffer = &dev->op.cipher.flatBuffer;
+    metaBuf = XMALLOC(metaSize, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+    dataBuf = XMALLOC(dataLen, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+    ivBuf = XMALLOC(AES_BLOCK_SIZE, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+    authTagBuf = XMALLOC(authTagSz, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA);
+
+    /* check allocations before copying into the buffers */
+    if (ivBuf == NULL || metaBuf == NULL || dataBuf == NULL ||
+            authTagBuf == NULL) {
+        ret = MEMORY_E; goto exit;
+    }
+    XMEMCPY(dataBuf, in, inOutSz);
+    XMEMCPY(ivBuf, iv, ivSz);
+
+ /* AAD */
+ if (authIn && authInSz > 0) {
+ /* make sure AAD is block aligned */
+ if (authInSzAligned % AES_BLOCK_SIZE) {
+ authInSzAligned += AES_BLOCK_SIZE -
+ (authInSzAligned % AES_BLOCK_SIZE);
+ }
+
+        authInBuf = XMALLOC(authInSzAligned, dev->heap,
+            DYNAMIC_TYPE_ASYNC_NUMA);
+        if (authInBuf == NULL) {
+            ret = MEMORY_E; goto exit;
+        }
+        XMEMCPY(authInBuf, authIn, authInSz);
+ /* clear remainder */
+ XMEMSET(authInBuf + authInSz, 0, authInSzAligned - authInSz);
+ }
+
+ /* init buffers */
+ XMEMSET(&setup, 0, sizeof(CpaCySymSessionSetupData));
+ XMEMSET(opData, 0, sizeof(CpaCySymOpData));
+ XMEMSET(bufferList, 0, sizeof(CpaBufferList));
+ XMEMSET(flatBuffer, 0, sizeof(CpaFlatBuffer));
+ XMEMSET(metaBuf, 0, metaSize);
+
+ bufferList->pBuffers = flatBuffer;
+ bufferList->numBuffers = numBuffers;
+ bufferList->pPrivateMetaData = metaBuf;
+ flatBuffer->dataLenInBytes = dataLen;
+ flatBuffer->pData = dataBuf;
+
+ /* setup */
+ setup.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ setup.symOperation = symOperation;
+ setup.cipherSetupData.cipherAlgorithm = cipherAlgorithm;
+ setup.cipherSetupData.cipherKeyLenInBytes = keySz;
+ setup.cipherSetupData.pCipherKey = (byte*)key;
+ setup.cipherSetupData.cipherDirection = cipherDirection;
+
+ /* setup auth ciphers */
+ if (hashAlgorithm != CPA_CY_SYM_HASH_NONE) {
+ setup.algChainOrder =
+ (cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT) ?
+ CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH :
+ CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+
+ setup.hashSetupData.hashAlgorithm = hashAlgorithm;
+ setup.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+ setup.hashSetupData.digestResultLenInBytes = authTagSz;
+ setup.hashSetupData.authModeSetupData.aadLenInBytes = authInSz;
+
+ if (cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT)
+ setup.digestIsAppended = CPA_TRUE;
+ else
+ setup.digestIsAppended = CPA_FALSE;
+ }
+
+ /* open session */
+ ret = IntelQaSymOpen(dev, &setup, NULL);
+ if (ret != 0) {
+ goto exit;
+ }
+
+ /* operation data */
+ opData->sessionCtx = ctx->symCtx;
+ opData->packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ opData->pIv = ivBuf;
+ opData->ivLenInBytes = ivSz;
+ opData->cryptoStartSrcOffsetInBytes = 0;
+ opData->messageLenToCipherInBytes = inOutSz;
+ if (authIn && authInSz > 0) {
+ opData->pAdditionalAuthData = authInBuf;
+ }
+ if (cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT) {
+ if (authTag && authTagSz > 0) {
+ /* append digest to end of data buffer */
+ XMEMCPY(flatBuffer->pData + inOutSz, authTag, authTagSz);
+ }
+ }
+ else {
+ if (authTag && authTagSz > 0) {
+ XMEMCPY(authTagBuf, authTag, authTagSz);
+ }
+ }
+
+ /* store info needed for output */
+ dev->out = out;
+ dev->outLen = inOutSz;
+ if (cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT) {
+ dev->op.cipher.authTag = authTagBuf;
+ dev->op.cipher.authTagSz = authTagSz;
+ opData->pDigestResult = authTagBuf;
+ }
+ else {
+ dev->op.cipher.authTag = NULL;
+ dev->op.cipher.authTagSz = 0;
+ }
+ IntelQaOpInit(dev, IntelQaSymCipherFree);
+
+ /* perform symmetric AES operation async */
+ /* use same buffer list for in-place operation */
+ status = cpaCySymPerformOp(dev->handle, dev, opData,
+ bufferList, bufferList, &verifyResult);
+
+ if (symOperation == CPA_CY_SYM_OP_ALGORITHM_CHAINING &&
+ cipherAlgorithm == CPA_CY_SYM_CIPHER_AES_GCM &&
+ cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT &&
+ hashAlgorithm == CPA_CY_SYM_HASH_AES_GCM) {
+ if (verifyResult == CPA_FALSE) {
+ ret = AES_GCM_AUTH_E;
+ }
+ }
+exit:
+
+ if (ret != 0) {
+ QLOG("cpaCySymPerformOp Cipher failed! dev %p, status %d, ret %d\n",
+ dev, status, ret);
+ }
+
+    /* capture the in-place result into the output buffer */
+    if (dataBuf != NULL) {
+        XMEMCPY(out, dataBuf, inOutSz);
+    }
+ if (cipherDirection == CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT) {
+ if (authTag != NULL && authTagSz > 0) {
+ XMEMCPY(authTag, authTagBuf, authTagSz);
+ }
+ }
+
+ /* handle cleanup */
+ IntelQaSymCipherFree(dev);
+
+ return ret;
+}
+
+#ifdef HAVE_AES_CBC
+int IntelQaSymAesCbcEncrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz)
+{
+ int ret = IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_CIPHER, CPA_CY_SYM_CIPHER_AES_CBC,
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT,
+ CPA_CY_SYM_HASH_NONE, NULL, 0, NULL, 0);
+
+ XMEMCPY((byte*)iv, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ return ret;
+}
+
+#ifdef HAVE_AES_DECRYPT
+int IntelQaSymAesCbcDecrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz)
+{
+ byte nextIv[AES_BLOCK_SIZE];
+ int ret;
+
+ XMEMCPY(nextIv, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ ret = IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_CIPHER, CPA_CY_SYM_CIPHER_AES_CBC,
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT,
+ CPA_CY_SYM_HASH_NONE, NULL, 0, NULL, 0);
+
+ XMEMCPY((byte*)iv, nextIv, AES_BLOCK_SIZE);
+ return ret;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AES_CBC */
+
+
+#ifdef HAVE_AESGCM
+int IntelQaSymAesGcmEncrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_ALGORITHM_CHAINING, CPA_CY_SYM_CIPHER_AES_GCM,
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT,
+ CPA_CY_SYM_HASH_AES_GCM, authTag, authTagSz, authIn, authInSz);
+}
+#ifdef HAVE_AES_DECRYPT
+int IntelQaSymAesGcmDecrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_ALGORITHM_CHAINING, CPA_CY_SYM_CIPHER_AES_GCM,
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT,
+ CPA_CY_SYM_HASH_AES_GCM, (byte*)authTag, authTagSz, authIn, authInSz);
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+#ifndef NO_DES3
+int IntelQaSymDes3CbcEncrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz)
+{
+ return IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_CIPHER, CPA_CY_SYM_CIPHER_3DES_CBC,
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT,
+ CPA_CY_SYM_HASH_NONE, NULL, 0, NULL, 0);
+}
+
+int IntelQaSymDes3CbcDecrypt(IntelQaDev* dev,
+ byte* out, const byte* in, word32 sz,
+ const byte* key, word32 keySz,
+ const byte* iv, word32 ivSz)
+{
+ return IntelQaSymCipher(dev, out, in, sz,
+ key, keySz, iv, ivSz,
+ CPA_CY_SYM_OP_CIPHER, CPA_CY_SYM_CIPHER_3DES_CBC,
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT,
+ CPA_CY_SYM_HASH_NONE, NULL, 0, NULL, 0);
+}
+#endif /* !NO_DES3 */
+
+#endif /* QAT_ENABLE_CRYPTO */
+
+
+#ifdef WOLF_CRYPTO_CB
+
+int IntelQaSymSync_CryptoDevCb(int devId, struct wc_CryptoInfo* info, void* ctx)
+{
+ int rc = NOT_COMPILED_IN; /* return this to bypass HW and use SW */
+ IntelQaDev* dev;
+
+ if (info == NULL || ctx == NULL)
+ return BAD_FUNC_ARG;
+
+ (void)devId;
+ dev = (IntelQaDev*)ctx;
+
+ #ifdef QAT_ENABLE_CRYPTO
+ if (info->algo_type == WC_ALGO_TYPE_CIPHER) {
+ QLOG("CryptoDevCb Cipher: Type %d\n", info->cipher.type);
+
+ #ifndef NO_AES
+ if (info->cipher.type == WC_CIPHER_AES_CBC) {
+ Aes* aes = info->cipher.aescbc.aes;
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ if (info->cipher.enc) {
+ rc = IntelQaSymAesCbcEncrypt(dev,
+ info->cipher.aescbc.out,
+ info->cipher.aescbc.in,
+ info->cipher.aescbc.sz,
+ (byte*)aes->devKey, aes->keylen,
+ (byte*)aes->reg, AES_BLOCK_SIZE);
+ }
+ else {
+ rc = IntelQaSymAesCbcDecrypt(dev,
+ info->cipher.aescbc.out,
+ info->cipher.aescbc.in,
+ info->cipher.aescbc.sz,
+ (byte*)aes->devKey, aes->keylen,
+ (byte*)aes->reg, AES_BLOCK_SIZE);
+ }
+ }
+ #endif /* !NO_AES */
+
+ #ifdef HAVE_AESGCM
+ if (info->cipher.type == WC_CIPHER_AES_GCM) {
+ if (info->cipher.enc) {
+ Aes* aes = info->cipher.aesgcm_enc.aes;
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ rc = IntelQaSymAesGcmEncrypt(dev,
+ info->cipher.aesgcm_enc.out,
+ info->cipher.aesgcm_enc.in,
+ info->cipher.aesgcm_enc.sz,
+ (const byte*)aes->devKey, aes->keylen,
+ info->cipher.aesgcm_enc.iv,
+ info->cipher.aesgcm_enc.ivSz,
+ info->cipher.aesgcm_enc.authTag,
+ info->cipher.aesgcm_enc.authTagSz,
+ info->cipher.aesgcm_enc.authIn,
+ info->cipher.aesgcm_enc.authInSz);
+ }
+ else {
+ Aes* aes = info->cipher.aesgcm_dec.aes;
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ rc = IntelQaSymAesGcmDecrypt(dev,
+ info->cipher.aesgcm_dec.out,
+ info->cipher.aesgcm_dec.in,
+ info->cipher.aesgcm_dec.sz,
+ (const byte*)aes->devKey, aes->keylen,
+ info->cipher.aesgcm_dec.iv,
+ info->cipher.aesgcm_dec.ivSz,
+ info->cipher.aesgcm_dec.authTag,
+ info->cipher.aesgcm_dec.authTagSz,
+ info->cipher.aesgcm_dec.authIn,
+ info->cipher.aesgcm_dec.authInSz);
+ }
+ }
+ #endif /* HAVE_AESGCM */
+
+ #ifndef NO_DES3
+ if (info->cipher.type == WC_CIPHER_DES3) {
+ Des3* des = info->cipher.des3.des;
+ if (des == NULL)
+ return BAD_FUNC_ARG;
+
+ if (info->cipher.enc) {
+ rc = IntelQaSymDes3CbcEncrypt(dev,
+ info->cipher.des3.out,
+ info->cipher.des3.in,
+ info->cipher.des3.sz,
+ (byte*)des->devKey, DES3_KEYLEN,
+ (byte*)des->reg, DES_BLOCK_SIZE);
+ }
+ else {
+ rc = IntelQaSymDes3CbcDecrypt(dev,
+ info->cipher.des3.out,
+ info->cipher.des3.in,
+ info->cipher.des3.sz,
+ (byte*)des->devKey, DES3_KEYLEN,
+ (byte*)des->reg, DES_BLOCK_SIZE);
+ }
+ }
+ #endif /* !NO_DES3 */
+ }
+ #endif /* QAT_ENABLE_CRYPTO */
+
+ return rc;
+}
+
+/* -------------------------------------------------------------------------- */
+/* Public API */
+/* -------------------------------------------------------------------------- */
+
+int wc_CryptoCb_InitIntelQa(void)
+{
+ int devId, rc;
+
+ devId = IntelQaInit(NULL);
+ if (devId < 0) {
+ QLOG("Couldn't init the Intel QA\n");
+ devId = INVALID_DEVID;
+ }
+ else {
+ rc = IntelQaOpen(&qaDev, devId);
+ if (rc != 0) {
+ QLOG("Couldn't open the device\n");
+ IntelQaDeInit(devId);
+ devId = INVALID_DEVID;
+ }
+ else {
+ rc = wc_CryptoCb_RegisterDevice(devId,
+ IntelQaSymSync_CryptoDevCb, &qaDev);
+ if (rc != 0) {
+ QLOG("Couldn't register the device\n");
+ IntelQaClose(&qaDev);
+ IntelQaDeInit(devId);
+ devId = INVALID_DEVID;
+ }
+ }
+ }
+
+ return devId;
+}
+
+
+void wc_CryptoCb_CleanupIntelQa(int* id)
+{
+ if (INVALID_DEVID != *id) {
+ wc_CryptoCb_UnRegisterDevice(*id);
+ IntelQaClose(&qaDev);
+ IntelQaDeInit(*id);
+ *id = INVALID_DEVID;
+ }
+}
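+
+/* Usage sketch (illustrative only; error handling and key/IV setup omitted):
+ *
+ *   int devId = wc_CryptoCb_InitIntelQa();
+ *   if (devId != INVALID_DEVID) {
+ *       Aes aes;
+ *       wc_AesInit(&aes, NULL, devId);  // route AES ops through the QA device
+ *       ...                             // wc_AesSetKey / wc_AesCbcEncrypt etc.
+ *       wc_AesFree(&aes);
+ *       wc_CryptoCb_CleanupIntelQa(&devId);
+ *   }
+ */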
+
+#endif /* WOLF_CRYPTO_CB */
+
+
+/* -------------------------------------------------------------------------- */
+/* Memory allocator and deallocator */
+/* -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* use thread-local storage for QAE variables (removes the mutex requirement) */
+#ifdef USE_QAE_THREAD_LS
+ #include <pthread.h> /* for threadId tracking */
+ #define QAE_THREAD_LS THREAD_LS_T
+#else
+ #define QAE_THREAD_LS
+#endif
+
+/* memory alignment boundaries, in bytes (HW is used for hardware/DMA buffers) */
+#define ALIGNMENT_BASE (16ul)
+#define ALIGNMENT_HW (64ul)
+#define WOLF_MAGIC_NUM 0xA576F6C6641736EBUL /* (0xA)WolfAsyn(0xB) */
+#define WOLF_HEADER_ALIGN ALIGNMENT_BASE
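+
+/* Example: a 13-byte request is rounded up to 16 bytes (WOLF_HEADER_ALIGN)
+ * in _qaeMemAlloc(), and DYNAMIC_TYPE_ASYNC_NUMA64 allocations use the
+ * 64-byte ALIGNMENT_HW boundary for hardware (DMA) use. */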
+
+#define QAE_NOT_NUMA_PAGE 0xFFFF
+typedef struct qaeMemHeader {
+#ifdef WOLFSSL_TRACK_MEMORY
+ struct qaeMemHeader* next;
+ struct qaeMemHeader* prev;
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ const char* func;
+ unsigned int line;
+ #endif
+#endif
+ uint64_t magic;
+ void* heap;
+#ifdef USE_QAE_THREAD_LS
+ pthread_t threadId;
+#endif
+ size_t size;
+ word16 count;
+ word16 isNuma:1;
+    word16 reservedBits:15; /* reserved for future use */
+ word16 type;
+ word16 numa_page_offset; /* use QAE_NOT_NUMA_PAGE if not NUMA */
+} ALIGN16 qaeMemHeader;
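+
+/* Allocation layout (a sketch): the header is placed immediately before the
+ * pointer handed back to the caller:
+ *
+ *   base -> [ qaeMemHeader ][ user data ... ]
+ *                           ^ returned pointer
+ *
+ * _qaeMemFree() steps back sizeof(qaeMemHeader) from the (aligned) user
+ * pointer to recover the header and validate its magic value. */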
+
+#ifdef WOLFSSL_TRACK_MEMORY
+ typedef struct qaeMemStats {
+ long totalAllocs; /* number of allocations */
+ long totalDeallocs; /* number of deallocations */
+ long totalBytes; /* total number of bytes allocated */
+ long peakBytes; /* concurrent max bytes */
+ long currentBytes; /* total current bytes in use */
+ } qaeMemStats;
+
+ /* track allocations and report at end */
+ typedef struct qaeMemList {
+ qaeMemHeader* head;
+ qaeMemHeader* tail;
+ uint32_t count;
+ } qaeMemList;
+#endif /* WOLFSSL_TRACK_MEMORY */
+
+
+/* local variables */
+#ifndef USE_QAE_THREAD_LS
+ static pthread_mutex_t g_memLock = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+
+#ifdef WOLFSSL_TRACK_MEMORY
+ static qaeMemStats g_memStats;
+ static qaeMemList g_memList;
+ static pthread_mutex_t g_memStatLock = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+static WC_INLINE int qaeMemTypeIsNuma(int type)
+{
+ int isNuma = 0;
+
+ switch (type) {
+ case DYNAMIC_TYPE_ASYNC_NUMA:
+ case DYNAMIC_TYPE_ASYNC_NUMA64:
+ case DYNAMIC_TYPE_WOLF_BIGINT:
+ case DYNAMIC_TYPE_PRIVATE_KEY:
+ case DYNAMIC_TYPE_PUBLIC_KEY:
+ case DYNAMIC_TYPE_AES_BUFFER:
+ case DYNAMIC_TYPE_RSA_BUFFER:
+ case DYNAMIC_TYPE_ECC_BUFFER:
+ case DYNAMIC_TYPE_SIGNATURE:
+ case DYNAMIC_TYPE_DIGEST:
+ case DYNAMIC_TYPE_SECRET:
+ case DYNAMIC_TYPE_SEED:
+ case DYNAMIC_TYPE_SALT:
+ {
+ isNuma = 1;
+ break;
+ }
+ case DYNAMIC_TYPE_OUT_BUFFER:
+ case DYNAMIC_TYPE_IN_BUFFER:
+ {
+ #if !defined(WC_ASYNC_NO_CRYPT) && !defined(WC_ASYNC_NO_HASH)
+ isNuma = 1;
+ #else
+ isNuma = 0;
+ #endif
+ break;
+ }
+ default:
+ isNuma = 0;
+ break;
+ }
+ return isNuma;
+}
+
+
+static void _qaeMemFree(void *ptr, void* heap, int type
+#ifdef WOLFSSL_DEBUG_MEMORY
+ , const char* func, unsigned int line
+#endif
+)
+{
+ qaeMemHeader* header = NULL;
+ size_t size;
+ void* origPtr = ptr;
+
+ if (ptr == NULL)
+ return;
+
+ /* adjust for header and align */
+ ptr = (byte*)(((size_t)ptr - ((size_t)ptr % WOLF_HEADER_ALIGN)) -
+ sizeof(qaeMemHeader));
+ header = (qaeMemHeader*)ptr;
+
+ /* check for header magic */
+ if (header->magic != WOLF_MAGIC_NUM) {
+ printf("Free: Header magic not found! %p\n", ptr);
+ return;
+ }
+
+ /* cache values for later */
+ size = header->size;
+
+#ifdef WOLFSSL_DEBUG_MEMORY
+#ifdef WOLFSSL_DEBUG_MEMORY_PRINT
+ printf("Free: %p (%u) at %s:%u, heap %p, type %d, count %d\n",
+ origPtr, (unsigned int)size, func, line, heap, type, header->count);
+#else
+ (void)func;
+ (void)line;
+#endif
+#endif
+ (void)type;
+
+ /* adjust free count */
+ header->count--;
+
+ /* check header count */
+ if (header->count > 0) {
+ /* go ahead and return if still in use */
+ return;
+ }
+
+#ifdef WOLFSSL_TRACK_MEMORY
+ if (pthread_mutex_lock(&g_memStatLock) == 0) {
+ g_memStats.currentBytes -= size;
+ g_memStats.totalDeallocs++;
+
+ if (header == g_memList.head && header == g_memList.tail) {
+ g_memList.head = NULL;
+ g_memList.tail = NULL;
+ }
+ else if (header == g_memList.head) {
+ g_memList.head = header->next;
+ g_memList.head->prev = NULL;
+ }
+ else if (header == g_memList.tail) {
+ g_memList.tail = header->prev;
+ g_memList.tail->next = NULL;
+ }
+ else {
+ qaeMemHeader* next = header->next;
+ qaeMemHeader* prev = header->prev;
+ if (next)
+ next->prev = prev;
+ if (prev)
+ prev->next = next;
+ }
+ g_memList.count--;
+
+ pthread_mutex_unlock(&g_memStatLock);
+ }
+#endif
+
+ (void)heap;
+ (void)size;
+ (void)origPtr;
+
+#ifdef WOLFSSL_DEBUG_MEMORY
+ /* make sure magic is gone */
+ header->magic = 0;
+#endif
+
+ /* free type */
+ if (header->isNuma && header->numa_page_offset != QAE_NOT_NUMA_PAGE) {
+ qaeMemFreeNUMA(&ptr);
+ }
+ else {
+ free(ptr);
+ }
+}
+
+
+static void* _qaeMemAlloc(size_t size, void* heap, int type
+#ifdef WOLFSSL_DEBUG_MEMORY
+ , const char* func, unsigned int line
+#endif
+)
+{
+ void* ptr = NULL;
+ qaeMemHeader* header = NULL;
+ int isNuma;
+ int alignment = ALIGNMENT_BASE;
+ word16 page_offset = QAE_NOT_NUMA_PAGE;
+
+ /* make sure all allocations are aligned */
+ if ((size % WOLF_HEADER_ALIGN) != 0) {
+ size += (WOLF_HEADER_ALIGN - (size % WOLF_HEADER_ALIGN));
+ }
+
+ isNuma = qaeMemTypeIsNuma(type);
+ if (type == DYNAMIC_TYPE_ASYNC_NUMA64)
+ alignment = ALIGNMENT_HW;
+
+ /* allocate type */
+ if (isNuma) {
+ /* Node is typically 0 */
+ page_offset = 0;
+ ptr = qaeMemAllocNUMA((Cpa32U)(size + sizeof(qaeMemHeader)), 0,
+ alignment);
+ }
+ else {
+ isNuma = 0;
+ ptr = malloc(size + sizeof(qaeMemHeader));
+ }
+
+ /* add header */
+ if (ptr) {
+ header = (qaeMemHeader*)ptr;
+ ptr = (byte*)ptr + sizeof(qaeMemHeader);
+ header->magic = WOLF_MAGIC_NUM;
+ header->heap = heap;
+ header->size = size;
+ header->type = type;
+ header->count = 1;
+ header->isNuma = isNuma;
+ header->numa_page_offset = page_offset;
+ #ifdef USE_QAE_THREAD_LS
+ header->threadId = pthread_self();
+ #endif
+
+ #ifdef WOLFSSL_TRACK_MEMORY
+ if (pthread_mutex_lock(&g_memStatLock) == 0) {
+ g_memStats.totalAllocs++;
+ g_memStats.totalBytes += size;
+ g_memStats.currentBytes += size;
+ if (g_memStats.currentBytes > g_memStats.peakBytes)
+ g_memStats.peakBytes = g_memStats.currentBytes;
+
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ header->func = func;
+ header->line = line;
+ #endif
+
+            /* append this header to the tracking list */
+ header->next = NULL;
+ if (g_memList.tail == NULL) {
+ g_memList.head = header;
+ }
+ else {
+ g_memList.tail->next = header;
+ header->prev = g_memList.tail;
+ }
+ g_memList.tail = header; /* add to the end either way */
+ g_memList.count++;
+
+ pthread_mutex_unlock(&g_memStatLock);
+ }
+ #endif
+ }
+
+#ifdef WOLFSSL_DEBUG_MEMORY
+#ifdef WOLFSSL_DEBUG_MEMORY_PRINT
+ printf("Alloc: %p (%u) at %s:%u, heap %p, type %d\n",
+ ptr, (unsigned int)size, func, line, heap, type);
+#else
+ (void)func;
+ (void)line;
+#endif
+#endif
+
+ (void)heap;
+
+ return ptr;
+}
+
+/* Public Functions */
+void* wc_CryptoCb_IntelQaMalloc(size_t size, void* heap, int type
+#ifdef WOLFSSL_DEBUG_MEMORY
+ , const char* func, unsigned int line
+#endif
+)
+{
+ void* ptr;
+
+#ifndef USE_QAE_THREAD_LS
+ int ret = pthread_mutex_lock(&g_memLock);
+ if (ret != 0) {
+ printf("Alloc: Error(%d) on mutex lock\n", ret);
+ return NULL;
+ }
+#endif
+
+ ptr = _qaeMemAlloc(size, heap, type
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ , func, line
+ #endif
+ );
+
+#ifndef USE_QAE_THREAD_LS
+ pthread_mutex_unlock(&g_memLock);
+#endif
+
+ return ptr;
+}
+
+void wc_CryptoCb_IntelQaFree(void *ptr, void* heap, int type
+#ifdef WOLFSSL_DEBUG_MEMORY
+ , const char* func, unsigned int line
+#endif
+)
+{
+#ifndef USE_QAE_THREAD_LS
+ int ret = pthread_mutex_lock(&g_memLock);
+ if (ret != 0) {
+ printf("Free: Error(%d) on mutex lock\n", ret);
+ return;
+ }
+#endif
+
+ _qaeMemFree(ptr, heap, type
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ , func, line
+ #endif
+ );
+
+#ifndef USE_QAE_THREAD_LS
+ pthread_mutex_unlock(&g_memLock);
+#endif
+}
+
+void* wc_CryptoCb_IntelQaRealloc(void *ptr, size_t size, void* heap, int type
+#ifdef WOLFSSL_DEBUG_MEMORY
+ , const char* func, unsigned int line
+#endif
+)
+{
+ void* newPtr = NULL;
+ void* origPtr = ptr;
+ qaeMemHeader* header = NULL;
+ byte allocNew = 1;
+ int newIsNuma = -1, ptrIsNuma = -1;
+ size_t copySize = 0;
+
+#ifndef USE_QAE_THREAD_LS
+ int ret = pthread_mutex_lock(&g_memLock);
+ if (ret != 0) {
+ printf("Realloc: Error(%d) on mutex lock\n", ret);
+ return NULL;
+ }
+#endif
+
+ (void)heap;
+
+ if (ptr) {
+ /* get header pointer and align */
+ header = (qaeMemHeader*)(((size_t)ptr -
+ ((size_t)ptr % WOLF_HEADER_ALIGN)) - sizeof(qaeMemHeader));
+ if (header->magic == WOLF_MAGIC_NUM) {
+ newIsNuma = qaeMemTypeIsNuma(type);
+ ptrIsNuma = (header->numa_page_offset != QAE_NOT_NUMA_PAGE) ? 1 : 0;
+
+ /* for non-NUMA, treat as normal REALLOC */
+ if (newIsNuma == 0 && ptrIsNuma == 0) {
+ allocNew = 1;
+ }
+ /* if matching NUMA type and size fits, use existing */
+ else if (newIsNuma == ptrIsNuma && header->size >= size) {
+
+ #ifdef USE_QAE_THREAD_LS
+ if (header->threadId != pthread_self()) {
+ allocNew = 1;
+ #if 0
+ printf("Realloc %p from different thread! orig %lx this %lx\n",
+ origPtr, header->threadId, pthread_self());
+ #endif
+ }
+ else
+ #endif
+ {
+ /* use existing pointer and increment counter */
+ header->count++;
+ newPtr = origPtr;
+ allocNew = 0;
+ }
+ }
+
+ copySize = header->size;
+ }
+ else {
+ copySize = size;
+ }
+ }
+
+ if (allocNew) {
+ newPtr = _qaeMemAlloc(size, heap, type
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ , func, line
+ #endif
+ );
+ if (newPtr && ptr) {
+ /* only copy min of new and old size to new pointer */
+ if (copySize > size)
+ copySize = size;
+ XMEMCPY(newPtr, ptr, copySize);
+
+ if (newIsNuma == 0 && ptrIsNuma == 0) {
+ /* for non-NUMA, treat as normal REALLOC and free old pointer */
+ _qaeMemFree(ptr, heap, type
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ , func, line
+ #endif
+ );
+ }
+ }
+ }
+
+#ifndef USE_QAE_THREAD_LS
+ pthread_mutex_unlock(&g_memLock);
+#endif
+
+#ifdef WOLFSSL_DEBUG_MEMORY
+#ifdef WOLFSSL_DEBUG_MEMORY_PRINT
+ if (allocNew) {
+ printf("Realloc: New %p -> %p (%u) at %s:%u, heap %p, type %d\n",
+ origPtr, newPtr, (unsigned int)size, func, line, heap, type);
+ }
+ else {
+ printf("Realloc: Reuse %p (%u) at %s:%u, heap %p, type %d, count %d\n",
+ origPtr, (unsigned int)size, func, line, header->heap, header->type, header->count);
+ }
+#else
+ (void)func;
+ (void)line;
+#endif
+#endif
+
+ return newPtr;
+}
+
+
+#ifdef WOLFSSL_TRACK_MEMORY
+int InitMemoryTracker(void)
+{
+ if (pthread_mutex_lock(&g_memStatLock) == 0) {
+ g_memStats.totalAllocs = 0;
+ g_memStats.totalDeallocs= 0;
+ g_memStats.totalBytes = 0;
+ g_memStats.peakBytes = 0;
+ g_memStats.currentBytes = 0;
+
+ XMEMSET(&g_memList, 0, sizeof(g_memList));
+
+ pthread_mutex_unlock(&g_memStatLock);
+ }
+
+ return 0;
+}
+
+void ShowMemoryTracker(void)
+{
+ if (pthread_mutex_lock(&g_memStatLock) == 0) {
+ printf("total Allocs = %9ld\n", g_memStats.totalAllocs);
+ printf("total Deallocs = %9ld\n", g_memStats.totalDeallocs);
+ printf("total Bytes = %9ld\n", g_memStats.totalBytes);
+ printf("peak Bytes = %9ld\n", g_memStats.peakBytes);
+ printf("current Bytes = %9ld\n", g_memStats.currentBytes);
+
+ if (g_memList.count > 0) {
+
+ /* print list of allocations */
+ qaeMemHeader* header;
+ for (header = g_memList.head; header != NULL; header = header->next) {
+ printf("Leak: Ptr %p, Size %u, Type %d, Heap %p"
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ ", Func %s, Line %d"
+ #endif
+ "\n",
+ (byte*)header + sizeof(qaeMemHeader), (unsigned int)header->size,
+ header->type, header->heap
+ #ifdef WOLFSSL_DEBUG_MEMORY
+ , header->func, header->line
+ #endif
+ );
+ }
+ }
+
+ pthread_mutex_unlock(&g_memStatLock);
+
+ /* cleanup lock */
+ pthread_mutex_destroy(&g_memStatLock);
+ }
+}
+#endif /* WOLFSSL_TRACK_MEMORY */
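+
+/* Usage sketch (with WOLFSSL_TRACK_MEMORY defined):
+ *
+ *   InitMemoryTracker();
+ *   ...exercise wc_CryptoCb_IntelQaMalloc/Free/Realloc...
+ *   ShowMemoryTracker();   // prints totals and any leaked allocations
+ *
+ * Note: ShowMemoryTracker() destroys the stats lock, so call it once at
+ * shutdown. */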
+
+#ifdef QAT_DEMO_MAIN
+
+/* AES GCM */
+static const byte aesgcm_k[] = {
+ 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
+ 0x99, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
+ 0x77, 0x88, 0x99, 0x00, 0x11, 0x22, 0x33, 0x44,
+ 0x55, 0x66, 0x77, 0x88, 0x99, 0x00, 0x11, 0x22
+};
+
+static const byte aesgcm_iv[] = {
+ 0xca, 0xfe, 0xca, 0xfe, 0xca, 0xfe, 0xca, 0xfe,
+ 0xca, 0xfe, 0xca, 0xfe
+};
+
+static const byte aesgcm_a[] = {
+ 0xde, 0xad, 0xde, 0xad, 0xde, 0xad, 0xde, 0xad,
+ 0xde, 0xad, 0xde, 0xad, 0xde, 0xad, 0xde, 0xad,
+ 0xde, 0xad, 0xde, 0xad
+};
+
+static const byte aesgcm_p[] = {
+ 0x79, 0x84, 0x86, 0x44, 0x68, 0x45, 0x15, 0x61,
+ 0x86, 0x54, 0x66, 0x56, 0x54, 0x54, 0x31, 0x54,
+ 0x64, 0x64, 0x68, 0x45, 0x15, 0x15, 0x61, 0x61,
+ 0x51, 0x51, 0x51, 0x51, 0x51, 0x56, 0x14, 0x11,
+ 0x72, 0x13, 0x51, 0x82, 0x84, 0x56, 0x74, 0x53,
+ 0x45, 0x34, 0x65, 0x15, 0x46, 0x14, 0x67, 0x55,
+ 0x16, 0x14, 0x67, 0x54, 0x65, 0x47, 0x14, 0x67,
+ 0x46, 0x74, 0x65, 0x46
+};
+
+static const byte aesgcm_c[] = {
+ 0x59, 0x85, 0x02, 0x97, 0xE0, 0x4D, 0xFC, 0x5C,
+ 0x03, 0xCC, 0x83, 0x64, 0xCE, 0x28, 0x0B, 0x95,
+ 0x78, 0xEC, 0x93, 0x40, 0xA1, 0x8D, 0x21, 0xC5,
+ 0x48, 0x6A, 0x39, 0xBA, 0x4F, 0x4B, 0x8C, 0x95,
+ 0x6F, 0x8C, 0xF6, 0x9C, 0xD0, 0xA5, 0x8D, 0x67,
+ 0xA1, 0x32, 0x11, 0xE7, 0x2E, 0xF6, 0x63, 0xAF,
+ 0xDE, 0xD4, 0x7D, 0xEC, 0x15, 0x01, 0x58, 0xCB,
+ 0xE3, 0x7B, 0xC6, 0x94,
+};
+
+static byte aesgcm_t[] = {
+ 0x5D, 0x10, 0x3F, 0xC7, 0x22, 0xC7, 0x21, 0x29
+};
+
+
+/* simple example of using AES-GCM encrypt with Intel QA */
+int main(int argc, char** argv)
+{
+#if !defined(NO_AES) && defined(HAVE_AESGCM)
+ int ret;
+ IntelQaDev dev;
+ byte out[256];
+ byte tmp[256];
+ word32 tmpLen;
+#endif
+
+#ifdef QAT_DEBUG
+ wolfSSL_Debugging_ON();
+#endif
+
+ IntelQaInit(NULL);
+
+#ifndef NO_AES
+#ifdef HAVE_AESGCM
+ /* AES Test */
+ IntelQaOpen(&dev, 0);
+ dev.event.ret = WC_PENDING_E;
+ tmpLen = sizeof(aesgcm_t);
+ XMEMSET(out, 0, sizeof(out));
+ XMEMSET(tmp, 0, sizeof(tmp));
+
+ ret = IntelQaSymAesGcmEncrypt(&dev, out, aesgcm_p, sizeof(aesgcm_p),
+ aesgcm_k, sizeof(aesgcm_k), aesgcm_iv, sizeof(aesgcm_iv),
+ tmp, tmpLen, aesgcm_a, sizeof(aesgcm_a));
+ printf("AES GCM Encrypt: Ret=%d, Tag Len=%d\n", ret, tmpLen);
+ IntelQaClose(&dev);
+#endif /* HAVE_AESGCM */
+#endif /* !NO_AES */
+
+ IntelQaDeInit(0);
+
+ return 0;
+}
+
+#endif
+
+#endif /* HAVE_INTEL_QA_SYNC */
diff --git a/wolfcrypt/src/port/mynewt/mynewt_port.c b/wolfcrypt/src/port/mynewt/mynewt_port.c
new file mode 100644
index 0000000..8a4e903
--- /dev/null
+++ b/wolfcrypt/src/port/mynewt/mynewt_port.c
@@ -0,0 +1,146 @@
+/* mynewt_port.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#if defined(WOLFSSL_APACHE_MYNEWT)
+#ifndef NO_FILESYSTEM
+#include "fs/fs.h"
+#define FILE struct fs_file
+
+FILE* mynewt_fopen(const char * restrict path, const char * restrict mode)
+{
+ FILE *file;
+ uint8_t access_flags = 0;
+ const char *p = mode;
+ while(*p != '\0') {
+ switch(*p) {
+ case 'r':
+ {
+ access_flags |= FS_ACCESS_READ;
+ if(*(p+1) == '+') {
+ access_flags |= FS_ACCESS_WRITE;
+ }
+ }
+ break;
+
+ case 'w':
+ {
+ access_flags |= (FS_ACCESS_WRITE | FS_ACCESS_TRUNCATE);
+ if(*(p+1) == '+') {
+ access_flags |= FS_ACCESS_READ;
+ }
+ }
+ break;
+
+ case 'a':
+ {
+ access_flags |= (FS_ACCESS_WRITE | FS_ACCESS_APPEND);
+ if(*(p+1) == '+') {
+ access_flags |= FS_ACCESS_READ;
+ }
+ }
+ break;
+ }
+ p++;
+ }
+
+    /* Open the file with the access flags derived from the mode string. */
+ int rc = fs_open(path, access_flags, &file);
+ if (rc != 0) {
+ return NULL;
+ }
+ return file;
+}
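+
+/* Example mode mappings (a sketch of the parser above):
+ *   "r"  -> FS_ACCESS_READ
+ *   "r+" -> FS_ACCESS_READ | FS_ACCESS_WRITE
+ *   "w"  -> FS_ACCESS_WRITE | FS_ACCESS_TRUNCATE
+ *   "a+" -> FS_ACCESS_WRITE | FS_ACCESS_APPEND | FS_ACCESS_READ
+ */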
+
+int mynewt_fseek(FILE *stream, long offset, int whence)
+{
+ uint32_t fs_offset;
+
+ switch(whence) {
+ case 0: /* SEEK_SET */
+ {
+            fs_offset = offset;
+ }
+ break;
+
+ case 1: /* SEEK_CUR */
+ {
+ fs_offset = fs_getpos(stream);
+ fs_offset += offset;
+ }
+ break;
+
+ case 2: /* SEEK_END */
+ {
+ fs_filelen(stream, &fs_offset);
+ fs_offset += offset;
+ }
+ break;
+ }
+
+    /* propagate the fs_seek() result (0 on success) */
+    return fs_seek(stream, fs_offset);
+}
+
+long mynewt_ftell(FILE *stream)
+{
+    /* report the current file offset without moving it */
+    return (long)fs_getpos(stream);
+}
+
+void mynewt_rewind(FILE *stream)
+{
+ fs_seek(stream, 0);
+}
+
+size_t mynewt_fread(void *restrict ptr, size_t size, size_t nitems, FILE *restrict stream)
+{
+ size_t to_read = size * nitems;
+ uint32_t read_size;
+ int rc = fs_read(stream, to_read, ptr, &read_size);
+ if(rc != 0) {
+ return 0;
+ }
+
+ return (size_t)read_size;
+}
+
+size_t mynewt_fwrite(const void *restrict ptr, size_t size, size_t nitems, FILE *restrict stream)
+{
+ size_t to_write = size * nitems;
+ int rc = fs_write(stream, ptr, to_write);
+ if(rc != 0) {
+ return 0;
+ }
+
+ return to_write;
+}
+
+int mynewt_fclose(FILE *stream)
+{
+ fs_close(stream);
+ return 0;
+}
+
+#endif /* !NO_FILESYSTEM */
+#endif /* if defined(WOLFSSL_APACHE_MYNEWT) */
diff --git a/wolfcrypt/src/port/nrf51.c b/wolfcrypt/src/port/nrf51.c
new file mode 100644
index 0000000..c7db4b0
--- /dev/null
+++ b/wolfcrypt/src/port/nrf51.c
@@ -0,0 +1,220 @@
+/* nrf51.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_NRF51
+
+#include "bsp.h"
+#include "nrf_delay.h"
+#include "app_uart.h"
+#include "app_error.h"
+#include "nrf_drv_rng.h"
+#include "nrf_drv_rtc.h"
+#include "nrf_drv_clock.h"
+#include "nrf_ecb.h"
+
+#ifdef SOFTDEVICE_PRESENT
+ #include "softdevice_handler.h"
+ #include "nrf_soc.h"
+#endif /* SOFTDEVICE_PRESENT */
+
+/* RTC */
+#ifndef NO_CRYPT_BENCHMARK
+static byte mRtcInitDone = 0;
+static int mRtcSec = 0;
+const nrf_drv_rtc_t rtc = NRF_DRV_RTC_INSTANCE(0); /**< Declaring an instance of nrf_drv_rtc for RTC0. */
+#endif /* !NO_CRYPT_BENCHMARK */
+
+/* AES */
+#if !defined(NO_AES) && !defined(SOFTDEVICE_PRESENT)
+ static byte mAesInitDone = 0;
+#endif
+
+/** @brief Function for getting a vector of random numbers.
+ *
+ * @param[out] output  Pointer to the buffer that receives the bytes.
+ * @param[in]  size    Number of bytes to take from the pool and place in output.
+ *
+ * @retval 0 on success, else error
+ */
+int nrf51_random_generate(byte* output, word32 size)
+{
+ int remaining = size, length, pos = 0;
+ uint8_t available;
+ uint32_t err_code;
+
+ /* Make sure RNG is running */
+ err_code = nrf_drv_rng_init(NULL);
+ if (err_code != NRF_SUCCESS && err_code != NRF_ERROR_INVALID_STATE) {
+ return -1;
+ }
+
+ while (remaining > 0) {
+ err_code = nrf_drv_rng_bytes_available(&available);
+ if (err_code == NRF_SUCCESS) {
+ length = (remaining < available) ? remaining : available;
+ if (length > 0) {
+ err_code = nrf_drv_rng_rand(&output[pos], length);
+ remaining -= length;
+ pos += length;
+ }
+ }
+
+ if (err_code != NRF_SUCCESS) {
+ break;
+ }
+ }
+
+ return (err_code == NRF_SUCCESS) ? 0 : -1;
+}
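+
+/* Usage sketch: fill a 32-byte seed buffer directly:
+ *
+ *   byte seed[32];
+ *   if (nrf51_random_generate(seed, sizeof(seed)) != 0) {
+ *       // handle RNG failure
+ *   }
+ */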
+
+#if !defined(NO_AES) && defined(WOLFSSL_NRF51_AES)
+
+#ifdef SOFTDEVICE_PRESENT
+static const byte* nRF51AesKey = NULL;
+#endif
+int nrf51_aes_set_key(const byte* key)
+{
+#ifdef SOFTDEVICE_PRESENT
+ nRF51AesKey = key;
+#else
+ if (!mAesInitDone) {
+ nrf_ecb_init();
+ mAesInitDone = 1;
+ }
+ nrf_ecb_set_key(key);
+#endif
+ return 0;
+}
+
+
+int nrf51_aes_encrypt(const byte* in, const byte* key, word32 rounds, byte* out)
+{
+ int ret;
+ uint32_t err_code = 0;
+#ifdef SOFTDEVICE_PRESENT
+ nrf_ecb_hal_data_t ecb_hal_data;
+#endif
+
+ /* Set key */
+ ret = nrf51_aes_set_key(key);
+ if (ret != 0) {
+ return ret;
+ }
+
+#ifdef SOFTDEVICE_PRESENT
+ /* Define ECB record */
+ XMEMCPY(ecb_hal_data.key, nRF51AesKey, SOC_ECB_KEY_LENGTH);
+ XMEMCPY(ecb_hal_data.cleartext, in, SOC_ECB_CLEARTEXT_LENGTH);
+ XMEMSET(ecb_hal_data.ciphertext, 0, SOC_ECB_CIPHERTEXT_LENGTH);
+
+ /* Perform block encrypt */
+ err_code = sd_ecb_block_encrypt(&ecb_hal_data);
+ if (err_code != NRF_SUCCESS) {
+ return -1;
+ }
+
+ /* Grab result */
+ XMEMCPY(out, ecb_hal_data.ciphertext, SOC_ECB_CIPHERTEXT_LENGTH);
+#else
+ err_code = nrf_ecb_crypt(out, in);
+ err_code = err_code ? 0 : -1;
+#endif
+
+ return err_code;
+}
+
+#endif /* !NO_AES && WOLFSSL_NRF51_AES */
+
+
+#ifndef NO_CRYPT_BENCHMARK
+static void rtc_handler(nrf_drv_rtc_int_type_t int_type)
+{
+ if (int_type == NRF_DRV_RTC_INT_COMPARE0)
+ {
+ mRtcSec++;
+ nrf_drv_rtc_counter_clear(&rtc);
+ nrf_drv_rtc_int_enable(&rtc, RTC_CHANNEL_INT_MASK(0));
+
+#ifdef BSP_LED_0
+ nrf_gpio_pin_toggle(BSP_LED_0);
+#endif
+ }
+}
+
+static void rtc_config(void)
+{
+ uint32_t err_code;
+
+ // Start the internal LFCLK XTAL oscillator
+ err_code = nrf_drv_clock_init(NULL);
+ APP_ERROR_CHECK(err_code);
+
+ nrf_drv_clock_lfclk_request();
+
+ // Initialize RTC instance
+ err_code = nrf_drv_rtc_init(&rtc, NULL, rtc_handler);
+ APP_ERROR_CHECK(err_code);
+
+ // Enable tick event
+ nrf_drv_rtc_tick_enable(&rtc, false);
+
+    // Set compare channel to trigger an interrupt after 1 second
+ err_code = nrf_drv_rtc_cc_set(&rtc, 0, RTC0_CONFIG_FREQUENCY, true);
+ APP_ERROR_CHECK(err_code);
+
+ // Power on RTC instance
+ nrf_drv_rtc_enable(&rtc);
+}
+
+static int rtc_get_ms(void)
+{
+ /* Prescaler is 12-bit for COUNTER: frequency = (32768/(PRESCALER+1)) */
+ int frequency = (32768 / (rtc_prescaler_get(rtc.p_reg) + 1));
+ int counter = nrf_drv_rtc_counter_get(&rtc);
+
+ /* Convert with rounding frequency to milliseconds */
+ return ((counter * 1000) + (frequency / 2) ) / frequency;
+}
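+
+/* Example: with PRESCALER = 0 the counter runs at 32768 Hz, so a counter
+ * value of 16384 converts to ((16384 * 1000) + 16384) / 32768 = 500 ms. */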
+
+double current_time(int reset)
+{
+ double time;
+
+ if (!mRtcInitDone) {
+ rtc_config();
+ mRtcInitDone = 1;
+ }
+
+ time = mRtcSec;
+ time += (double)rtc_get_ms() / 1000;
+
+ return time;
+}
+#endif /* !NO_CRYPT_BENCHMARK */
+
+#endif /* WOLFSSL_NRF51 */
diff --git a/wolfcrypt/src/port/nxp/ksdk_port.c b/wolfcrypt/src/port/nxp/ksdk_port.c
new file mode 100644
index 0000000..a5cc737
--- /dev/null
+++ b/wolfcrypt/src/port/nxp/ksdk_port.c
@@ -0,0 +1,1731 @@
+/* ksdk_port.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+/* If FREESCALE_LTC_TFM or FREESCALE_LTC_ECC */
+#if defined(FREESCALE_LTC_TFM) || defined(FREESCALE_LTC_ECC)
+
+#include <wolfssl/wolfcrypt/port/nxp/ksdk_port.h>
+#include <wolfssl/wolfcrypt/random.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <stdint.h>
+
+#define ERROR_OUT(res) { ret = (res); goto done; }
+
+
+int ksdk_port_init(void)
+{
+#if defined(FREESCALE_LTC_TFM)
+ LTC_Init(LTC0);
+#endif
+
+ return 0;
+}
+
+/* Reverse array in memory (in place) */
+static void ltc_reverse_array(uint8_t *src, size_t src_len)
+{
+ unsigned int i;
+
+ for (i = 0; i < src_len / 2; i++) {
+ uint8_t tmp;
+
+ tmp = src[i];
+ src[i] = src[src_len - 1 - i];
+ src[src_len - 1 - i] = tmp;
+ }
+}
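+
+/* Example: {0x01, 0x02, 0x03} becomes {0x03, 0x02, 0x01} in place; this is
+ * used to convert between big-endian mp_int exports and the LSB-first
+ * (lsbyte at lowest address) format the LTC PKHA expects. */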
+
+
+#ifndef WOLFSSL_SP_MATH
+/* same as mp_to_unsigned_bin() with mp_reverse() skipped */
+static int mp_to_unsigned_lsb_bin(mp_int *a, unsigned char *b)
+{
+ int res;
+ mp_int t;
+
+ res = mp_init_copy(&t, a);
+ if (res == MP_OKAY) {
+ res = mp_to_unsigned_bin_at_pos(0, &t, b);
+ if (res >= 0)
+ res = 0;
+ #ifndef USE_FAST_MATH
+ mp_clear(&t);
+ #endif
+ }
+
+ return res;
+}
+#endif
+
+static int ltc_get_lsb_bin_from_mp_int(uint8_t *dst, mp_int *A, uint16_t *psz)
+{
+ int res;
+ uint16_t sz;
+
+ sz = mp_unsigned_bin_size(A);
+#ifndef WOLFSSL_SP_MATH
+ res = mp_to_unsigned_lsb_bin(A, dst); /* result is lsbyte at lowest addr as required by LTC */
+#else
+ res = mp_to_unsigned_bin(A, dst);
+ if (res == MP_OKAY) {
+ ltc_reverse_array(dst, sz);
+ }
+#endif
+ *psz = sz;
+ return res;
+}
+
+/* LTC TFM */
+#if defined(FREESCALE_LTC_TFM)
+
+
+/* these function are used by wolfSSL upper layers (like RSA) */
+
+/* c = a * b */
+int mp_mul(mp_int *A, mp_int *B, mp_int *C)
+{
+ int res = MP_OKAY;
+ int szA, szB;
+ szA = mp_unsigned_bin_size(A);
+ szB = mp_unsigned_bin_size(B);
+
+    /* if the unsigned multiply fits into the LTC PKHA, use it; otherwise fall back to the software multiply */
+ if ((szA <= LTC_MAX_INT_BYTES / 2) && (szB <= LTC_MAX_INT_BYTES / 2)) {
+ int neg = 0;
+
+#ifndef WOLFSSL_SP_MATH
+ neg = (A->sign == B->sign) ? MP_ZPOS : MP_NEG;
+#endif
+
+ /* unsigned multiply */
+ uint8_t *ptrA = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrB = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrC = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+
+ if (ptrA && ptrB && ptrC) {
+ uint16_t sizeA, sizeB;
+
+ res = ltc_get_lsb_bin_from_mp_int(ptrA, A, &sizeA);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrB, B, &sizeB);
+ if (res == MP_OKAY) {
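+                /* Fill the "modulus" with all ones: since szA and szB are
+                 * each at most LTC_MAX_INT_BYTES/2, the product is always
+                 * smaller than this value, so ModMul reduces nothing and
+                 * acts as a plain multiply. */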
+ XMEMSET(ptrC, 0xFF, LTC_MAX_INT_BYTES);
+
+ LTC_PKHA_ModMul(LTC_BASE, ptrA, sizeA, ptrB, sizeB, ptrC, LTC_MAX_INT_BYTES, ptrB, &sizeB,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+
+ ltc_reverse_array(ptrB, sizeB);
+ res = mp_read_unsigned_bin(C, ptrB, sizeB);
+ }
+ }
+
+#ifndef WOLFSSL_SP_MATH
+ /* fix sign */
+ C->sign = neg;
+#endif
+ if (ptrA) {
+ XFREE(ptrA, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrB) {
+ XFREE(ptrB, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrC) {
+ XFREE(ptrC, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ }
+ else {
+#ifdef WOLFSSL_SP_MATH
+ res = sp_mul(A, B, C);
+#else
+ res = wolfcrypt_mp_mul(A, B, C);
+#endif
+ }
+ return res;
+}
+
+/* c = a mod b, 0 <= c < b */
+int mp_mod(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res = MP_OKAY;
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ int szA, szB;
+ szA = mp_unsigned_bin_size(a);
+ szB = mp_unsigned_bin_size(b);
+ if ((szA <= LTC_MAX_INT_BYTES) && (szB <= LTC_MAX_INT_BYTES))
+ {
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ int neg = 0;
+ uint8_t *ptrA = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrB = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrC = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+
+#ifndef WOLFSSL_SP_MATH
+ /* get sign for the result */
+ neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+#endif
+
+ /* get remainder of unsigned a divided by unsigned b */
+ if (ptrA && ptrB && ptrC) {
+ uint16_t sizeA, sizeB, sizeC;
+
+ res = ltc_get_lsb_bin_from_mp_int(ptrA, a, &sizeA);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrB, b, &sizeB);
+ if (res == MP_OKAY) {
+ if (kStatus_Success ==
+ LTC_PKHA_ModRed(LTC_BASE, ptrA, sizeA, ptrB, sizeB, ptrC, &sizeC, kLTC_PKHA_IntegerArith))
+ {
+ ltc_reverse_array(ptrC, sizeC);
+ res = mp_read_unsigned_bin(c, ptrC, sizeC);
+ }
+ else {
+ res = MP_VAL;
+ }
+ }
+ }
+ else {
+ res = MP_MEM;
+ }
+
+#ifndef WOLFSSL_SP_MATH
+ /* fix sign */
+ c->sign = neg;
+#endif
+
+ if (ptrA) {
+ XFREE(ptrA, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrB) {
+ XFREE(ptrB, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrC) {
+ XFREE(ptrC, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ }
+ else {
+ res = wolfcrypt_mp_mod(a, b, c);
+ }
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ return res;
+}
+
+/* c = 1/a (mod b) for odd b only */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res = MP_OKAY;
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ int szA, szB;
+ szA = mp_unsigned_bin_size(a);
+ szB = mp_unsigned_bin_size(b);
+ if ((szA <= LTC_MAX_INT_BYTES) && (szB <= LTC_MAX_INT_BYTES)) {
+#endif
+ uint8_t *ptrA = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrB = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrC = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+
+ if (ptrA && ptrB && ptrC) {
+ uint16_t sizeA, sizeB, sizeC;
+
+ res = ltc_get_lsb_bin_from_mp_int(ptrA, a, &sizeA);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrB, b, &sizeB);
+ if (res == MP_OKAY) {
+ if (kStatus_Success ==
+ LTC_PKHA_ModInv(LTC_BASE, ptrA, sizeA, ptrB, sizeB, ptrC, &sizeC, kLTC_PKHA_IntegerArith))
+ {
+ ltc_reverse_array(ptrC, sizeC);
+ res = mp_read_unsigned_bin(c, ptrC, sizeC);
+ }
+ else {
+ res = MP_VAL;
+ }
+ }
+ }
+ else {
+ res = MP_MEM;
+ }
+
+#ifndef WOLFSSL_SP_MATH
+ c->sign = a->sign;
+#endif
+ if (ptrA) {
+ XFREE(ptrA, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrB) {
+ XFREE(ptrB, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrC) {
+ XFREE(ptrC, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ }
+ else {
+ res = wolfcrypt_mp_invmod(a, b, c);
+ }
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ return res;
+}
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
+{
+ int res = MP_OKAY;
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ int szA, szB, szC;
+ szA = mp_unsigned_bin_size(a);
+ szB = mp_unsigned_bin_size(b);
+ szC = mp_unsigned_bin_size(c);
+ if ((szA <= LTC_MAX_INT_BYTES) && (szB <= LTC_MAX_INT_BYTES) && (szC <= LTC_MAX_INT_BYTES)) {
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ mp_int t;
+
+ uint8_t *ptrA = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, NULL, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrB = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, NULL, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrC = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, NULL, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrD = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, NULL, DYNAMIC_TYPE_BIGINT);
+
+    /* if a or b is negative, add the modulus c to obtain a positive
+     * representative of the same residue before handing it to the LTC */
+ res = mp_init(&t);
+#ifndef WOLFSSL_SP_MATH
+ if (a->sign) {
+ if (res == MP_OKAY)
+ res = mp_add(a, c, &t);
+ if (res == MP_OKAY)
+ res = mp_copy(&t, a);
+ }
+ if (b->sign) {
+ if (res == MP_OKAY)
+ res = mp_add(b, c, &t);
+ if (res == MP_OKAY)
+ res = mp_copy(&t, b);
+ }
+#endif
+
+ if (res == MP_OKAY && ptrA && ptrB && ptrC && ptrD) {
+ uint16_t sizeA, sizeB, sizeC, sizeD;
+
+ res = ltc_get_lsb_bin_from_mp_int(ptrA, a, &sizeA);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrB, b, &sizeB);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrC, c, &sizeC);
+
+ /* (A*B)mod C = ((A mod C) * (B mod C)) mod C */
+ if (res == MP_OKAY && LTC_PKHA_CompareBigNum(ptrA, sizeA, ptrC, sizeC) >= 0) {
+ if (kStatus_Success !=
+ LTC_PKHA_ModRed(LTC_BASE, ptrA, sizeA, ptrC, sizeC, ptrA, &sizeA, kLTC_PKHA_IntegerArith))
+ {
+ res = MP_VAL;
+ }
+ }
+ if (res == MP_OKAY && (LTC_PKHA_CompareBigNum(ptrB, sizeB, ptrC, sizeC) >= 0))
+ {
+ if (kStatus_Success !=
+ LTC_PKHA_ModRed(LTC_BASE, ptrB, sizeB, ptrC, sizeC, ptrB, &sizeB, kLTC_PKHA_IntegerArith))
+ {
+ res = MP_VAL;
+ }
+ }
+
+ if (res == MP_OKAY) {
+ if (kStatus_Success != LTC_PKHA_ModMul(LTC_BASE, ptrA, sizeA, ptrB, sizeB, ptrC, sizeC, ptrD, &sizeD,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized))
+ {
+ res = MP_VAL;
+ }
+ }
+
+ if (res == MP_OKAY) {
+ ltc_reverse_array(ptrD, sizeD);
+ res = mp_read_unsigned_bin(d, ptrD, sizeD);
+ }
+ }
+ else {
+ res = MP_MEM;
+ }
+
+ if (ptrA) {
+ XFREE(ptrA, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrB) {
+ XFREE(ptrB, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrC) {
+ XFREE(ptrC, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrD) {
+ XFREE(ptrD, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ #ifndef USE_FAST_MATH
+ mp_clear(&t);
+ #endif
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ }
+ else {
+ res = wolfcrypt_mp_mulmod(a, b, c, d);
+ }
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ return res;
+}
+
+/* Y = G^X mod P */
+int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
+{
+ int res = MP_OKAY;
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ int szA, szB, szC;
+ mp_int tmp;
+
+ /* if G cannot fit into LTC_PKHA, reduce it */
+ szA = mp_unsigned_bin_size(G);
+ if (szA > LTC_MAX_INT_BYTES) {
+ res = mp_init(&tmp);
+ if (res != MP_OKAY)
+ return res;
+ if ((res = mp_mod(G, P, &tmp)) != MP_OKAY) {
+ return res;
+ }
+ G = &tmp;
+ szA = mp_unsigned_bin_size(G);
+ }
+
+ szB = mp_unsigned_bin_size(X);
+ szC = mp_unsigned_bin_size(P);
+
+ if ((szA <= LTC_MAX_INT_BYTES) && (szB <= LTC_MAX_INT_BYTES) && (szC <= LTC_MAX_INT_BYTES)) {
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ mp_int t;
+
+ uint16_t sizeG, sizeX, sizeP;
+ uint8_t *ptrG = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrX = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+ uint8_t *ptrP = (uint8_t *)XMALLOC(LTC_MAX_INT_BYTES, 0, DYNAMIC_TYPE_BIGINT);
+
+ /* if G is negative, add modulus to convert to positive number for LTC */
+ res = mp_init(&t);
+#ifndef WOLFSSL_SP_MATH
+ if (G->sign) {
+ if (res == MP_OKAY)
+ res = mp_add(G, P, &t);
+ if (res == MP_OKAY)
+ res = mp_copy(&t, G);
+ }
+#endif
+
+ if (res == MP_OKAY && ptrG && ptrX && ptrP) {
+ res = ltc_get_lsb_bin_from_mp_int(ptrG, G, &sizeG);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrX, X, &sizeX);
+ if (res == MP_OKAY)
+ res = ltc_get_lsb_bin_from_mp_int(ptrP, P, &sizeP);
+
+            /* The LTC modular exponentiation requires the base to be smaller
+             * than the modulus (it places no such size requirement on the
+             * inputs to modular reduction). Using the identity
+             *     A^B mod C = ((A mod C)^B) mod C
+             * (e.g. 7^2 mod 5 = (7 mod 5)^2 mod 5 = 4), first reduce G mod P,
+             * then run the modular exponentiation. */
+ /* if G >= P then */
+ if (res == MP_OKAY && LTC_PKHA_CompareBigNum(ptrG, sizeG, ptrP, sizeP) >= 0) {
+ res = (int)LTC_PKHA_ModRed(LTC_BASE, ptrG, sizeG, ptrP, sizeP, ptrG, &sizeG, kLTC_PKHA_IntegerArith);
+
+ if (res != kStatus_Success) {
+ res = MP_VAL;
+ }
+ }
+
+ if (res == MP_OKAY) {
+ res = (int)LTC_PKHA_ModExp(LTC_BASE, ptrG, sizeG, ptrP, sizeP, ptrX, sizeX, ptrP, &sizeP,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+
+ if (res != kStatus_Success) {
+ res = MP_VAL;
+ }
+ else {
+ ltc_reverse_array(ptrP, sizeP);
+ res = mp_read_unsigned_bin(Y, ptrP, sizeP);
+ }
+ }
+ }
+ else {
+ res = MP_MEM;
+ }
+
+ if (ptrG) {
+ XFREE(ptrG, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrX) {
+ XFREE(ptrX, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ if (ptrP) {
+ XFREE(ptrP, NULL, DYNAMIC_TYPE_BIGINT);
+ }
+ #ifndef USE_FAST_MATH
+ mp_clear(&t);
+ #endif
+#if defined(FREESCALE_LTC_TFM_RSA_4096_ENABLE)
+ }
+ else {
+ res = wolfcrypt_mp_exptmod(G, X, P, Y);
+ }
+
+#ifndef USE_FAST_MATH
+ if (szA > LTC_MAX_INT_BYTES)
+ mp_clear(&tmp);
+#endif
+#endif /* FREESCALE_LTC_TFM_RSA_4096_ENABLE */
+ return res;
+}
+
+#endif /* FREESCALE_LTC_TFM */
+
+
+/* ECC */
+#if defined(HAVE_ECC) && defined(FREESCALE_LTC_ECC)
+
+/* Convert an mp_int to an LTC integer, as an array of bytes of size sz.
+ * If the mp_int has fewer bytes than sz, zero bytes are added at the most
+ * significant byte positions. For example, when the modulus is 32 bytes
+ * (P-256 curve) and the mp_int has only 31 bytes, a leading zero is added
+ * so that the result array has 32 bytes, the same size as the modulus (sz).
+ */
+static int ltc_get_from_mp_int(uint8_t *dst, mp_int *a, int sz)
+{
+ int res;
+ int szbin;
+ int offset;
+
+ /* check how many bytes are in the mp_int */
+ szbin = mp_unsigned_bin_size(a);
+
+ /* compute offset from dst */
+ offset = sz - szbin;
+ if (offset < 0)
+ offset = 0;
+ if (offset > sz)
+ offset = sz;
+
+ /* add leading zeroes */
+ if (offset)
+ XMEMSET(dst, 0, offset);
+
+ /* convert mp_int to array of bytes */
+ res = mp_to_unsigned_bin(a, dst + offset);
+
+ if (res == MP_OKAY) {
+ /* reverse array for LTC direct use */
+ ltc_reverse_array(dst, sz);
+ }
+
+ return res;
+}
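+
+/* Example: for P-256 (sz = 32), an mp_int that exports to 31 bytes gets one
+ * leading zero byte, and the full 32-byte array is then reversed so the
+ * least significant byte sits at the lowest address, as the LTC expects. */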
+
+/* ECC specs in lsbyte at lowest address format for direct use by LTC PKHA driver functions */
+#if defined(HAVE_ECC192) || defined(HAVE_ALL_CURVES)
+#define ECC192
+#endif
+#if defined(HAVE_ECC224) || defined(HAVE_ALL_CURVES)
+#define ECC224
+#endif
+#if !defined(NO_ECC256) || defined(HAVE_ALL_CURVES)
+#define ECC256
+#endif
+#if defined(HAVE_ECC384) || defined(HAVE_ALL_CURVES)
+#define ECC384
+#endif
+
+/* P-256 */
+#ifdef ECC256
+static const uint8_t ltc_ecc256_modulus[32] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc256_r2modn[32] = {
+ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFB, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFD, 0xFF, 0xFF, 0xFF, 0x04, 0x00, 0x00, 0x00};
+static const uint8_t ltc_ecc256_aCurveParam[32] = {
+ 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc256_bCurveParam[32] = {
+ 0x4B, 0x60, 0xD2, 0x27, 0x3E, 0x3C, 0xCE, 0x3B, 0xF6, 0xB0, 0x53,
+ 0xCC, 0xB0, 0x06, 0x1D, 0x65, 0xBC, 0x86, 0x98, 0x76, 0x55, 0xBD,
+ 0xEB, 0xB3, 0xE7, 0x93, 0x3A, 0xAA, 0xD8, 0x35, 0xC6, 0x5A};
+#endif
+
+#ifdef ECC192
+static const uint8_t ltc_ecc192_modulus[24] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc192_r2modn[24] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static const uint8_t ltc_ecc192_aCurveParam[24] = {
+ 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc192_bCurveParam[24] = {
+ 0xB1, 0xB9, 0x46, 0xC1, 0xEC, 0xDE, 0xB8, 0xFE, 0x49, 0x30, 0x24, 0x72,
+ 0xAB, 0xE9, 0xA7, 0x0F, 0xE7, 0x80, 0x9C, 0xE5, 0x19, 0x05, 0x21, 0x64};
+#endif
+
+#ifdef ECC224
+static const uint8_t ltc_ecc224_modulus[28] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc224_r2modn[28] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00};
+static const uint8_t ltc_ecc224_aCurveParam[28] = {
+ 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+static const uint8_t ltc_ecc224_bCurveParam[28] = {
+ 0xB4, 0xFF, 0x55, 0x23, 0x43, 0x39, 0x0B, 0x27, 0xBA, 0xD8,
+ 0xBF, 0xD7, 0xB7, 0xB0, 0x44, 0x50, 0x56, 0x32, 0x41, 0xF5,
+ 0xAB, 0xB3, 0x04, 0x0C, 0x85, 0x0A, 0x05, 0xB4};
+#endif
+
+#ifdef ECC384
+static const uint8_t ltc_ecc384_modulus[48] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+static const uint8_t ltc_ecc384_r2modn[48] = {
+ 0x01, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static const uint8_t ltc_ecc384_aCurveParam[48] = {
+ 0xfc, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+static const uint8_t ltc_ecc384_bCurveParam[48] = {
+ 0xef, 0x2a, 0xec, 0xd3, 0xed, 0xc8, 0x85, 0x2a, 0x9d, 0xd1, 0x2e, 0x8a,
+ 0x8d, 0x39, 0x56, 0xc6, 0x5a, 0x87, 0x13, 0x50, 0x8f, 0x08, 0x14, 0x03,
+ 0x12, 0x41, 0x81, 0xfe, 0x6e, 0x9c, 0x1d, 0x18, 0x19, 0x2d, 0xf8, 0xe3,
+ 0x6b, 0x05, 0x8e, 0x98, 0xe4, 0xe7, 0x3e, 0xe2, 0xa7, 0x2f, 0x31, 0xb3};
+#endif
+
+static int ltc_get_ecc_specs(const uint8_t **modulus, const uint8_t **r2modn,
+ const uint8_t **aCurveParam, const uint8_t **bCurveParam, int size)
+{
+ switch(size) {
+ case 32:
+ *modulus = ltc_ecc256_modulus;
+ *r2modn = ltc_ecc256_r2modn;
+ *aCurveParam = ltc_ecc256_aCurveParam;
+ *bCurveParam = ltc_ecc256_bCurveParam;
+ break;
+#ifdef ECC224
+ case 28:
+ *modulus = ltc_ecc224_modulus;
+ *r2modn = ltc_ecc224_r2modn;
+ *aCurveParam = ltc_ecc224_aCurveParam;
+ *bCurveParam = ltc_ecc224_bCurveParam;
+ break;
+#endif
+#ifdef ECC192
+ case 24:
+ *modulus = ltc_ecc192_modulus;
+ *r2modn = ltc_ecc192_r2modn;
+ *aCurveParam = ltc_ecc192_aCurveParam;
+ *bCurveParam = ltc_ecc192_bCurveParam;
+ break;
+#endif
+#ifdef HAVE_ECC384
+ case 48:
+ *modulus = ltc_ecc384_modulus;
+ *r2modn = ltc_ecc384_r2modn;
+ *aCurveParam = ltc_ecc384_aCurveParam;
+ *bCurveParam = ltc_ecc384_bCurveParam;
+ break;
+#endif
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ Perform a point multiplication (timing resistant)
+ k        The scalar to multiply by
+ G        The base point
+ R        [out] Destination for kG
+ a        Curve parameter a (unused; the LTC parameter tables supply it)
+ modulus  The modulus of the field the ECC curve is in
+ map      Boolean whether to map back to affine or not
+          (1 == map, 0 == leave in projective)
+ heap     Heap hint (unused)
+ return   MP_OKAY on success
+*/
+int wc_ecc_mulmod_ex(mp_int *k, ecc_point *G, ecc_point *R, mp_int* a,
+ mp_int *modulus, int map, void* heap)
+{
+ ltc_pkha_ecc_point_t B;
+ uint8_t size;
+ int szModulus;
+ int szkbin;
+ bool point_of_infinity;
+ status_t status;
+ int res;
+
+ (void)a;
+ (void)heap;
+
+ uint8_t Gxbin[LTC_MAX_ECC_BITS / 8];
+ uint8_t Gybin[LTC_MAX_ECC_BITS / 8];
+ uint8_t kbin[LTC_MAX_INT_BYTES];
+
+ const uint8_t *modbin;
+ const uint8_t *aCurveParam;
+ const uint8_t *bCurveParam;
+ const uint8_t *r2modn;
+
+ if (k == NULL || G == NULL || R == NULL || modulus == NULL) {
+ return ECC_BAD_ARG_E;
+ }
+
+ szModulus = mp_unsigned_bin_size(modulus);
+ szkbin = mp_unsigned_bin_size(k);
+
+ res = ltc_get_from_mp_int(kbin, k, szkbin);
+ if (res == MP_OKAY)
+ res = ltc_get_from_mp_int(Gxbin, G->x, szModulus);
+ if (res == MP_OKAY)
+ res = ltc_get_from_mp_int(Gybin, G->y, szModulus);
+
+ if (res != MP_OKAY)
+ return res;
+
+ size = szModulus;
+ /* find LTC friendly parameters for the selected curve */
+ if (0 != ltc_get_ecc_specs(&modbin, &r2modn, &aCurveParam, &bCurveParam, size)) {
+ return ECC_BAD_ARG_E;
+ }
+
+ B.X = &Gxbin[0];
+ B.Y = &Gybin[0];
+
+ status = LTC_PKHA_ECC_PointMul(LTC_BASE, &B, kbin, szkbin, modbin, r2modn, aCurveParam, bCurveParam, size,
+ kLTC_PKHA_TimingEqualized, kLTC_PKHA_IntegerArith, &B, &point_of_infinity);
+ if (status != kStatus_Success) {
+ return MP_VAL;
+ }
+
+ ltc_reverse_array(Gxbin, size);
+ ltc_reverse_array(Gybin, size);
+ res = mp_read_unsigned_bin(R->x, Gxbin, size);
+ if (res == MP_OKAY) {
+ res = mp_read_unsigned_bin(R->y, Gybin, size);
+        /* if k is negative, the multiplication is computed with abs(k),
+         * giving (x, y); the result is then adjusted to (x, -y)
+         */
+#ifndef WOLFSSL_SP_MATH
+ R->y->sign = k->sign;
+#endif
+ }
+ if (res == MP_OKAY)
+ res = mp_set(R->z, 1);
+
+ return res;
+}
+
+int wc_ecc_point_add(ecc_point *mG, ecc_point *mQ, ecc_point *mR, mp_int *m)
+{
+ int res;
+ ltc_pkha_ecc_point_t A, B;
+ int size;
+ status_t status;
+
+ uint8_t Gxbin[LTC_MAX_ECC_BITS / 8];
+ uint8_t Gybin[LTC_MAX_ECC_BITS / 8];
+ uint8_t Qxbin[LTC_MAX_ECC_BITS / 8];
+ uint8_t Qybin[LTC_MAX_ECC_BITS / 8];
+ const uint8_t *modbin;
+ const uint8_t *aCurveParam;
+ const uint8_t *bCurveParam;
+ const uint8_t *r2modn;
+
+ size = mp_unsigned_bin_size(m);
+
+ /* find LTC friendly parameters for the selected curve */
+ if (ltc_get_ecc_specs(&modbin, &r2modn, &aCurveParam, &bCurveParam, size) != 0) {
+ res = ECC_BAD_ARG_E;
+ }
+ else {
+ res = ltc_get_from_mp_int(Gxbin, mG->x, size);
+ if (res == MP_OKAY)
+ res = ltc_get_from_mp_int(Gybin, mG->y, size);
+ if (res == MP_OKAY)
+ res = ltc_get_from_mp_int(Qxbin, mQ->x, size);
+ if (res == MP_OKAY)
+ res = ltc_get_from_mp_int(Qybin, mQ->y, size);
+
+ if (res != MP_OKAY)
+ return res;
+
+ A.X = Gxbin;
+ A.Y = Gybin;
+
+ B.X = Qxbin;
+ B.Y = Qybin;
+
+ status = LTC_PKHA_ECC_PointAdd(LTC_BASE, &A, &B, modbin, r2modn, aCurveParam, bCurveParam, size,
+ kLTC_PKHA_IntegerArith, &A);
+ if (status != kStatus_Success) {
+ res = MP_VAL;
+ }
+ else {
+ ltc_reverse_array(Gxbin, size);
+ ltc_reverse_array(Gybin, size);
+ res = mp_read_unsigned_bin(mR->x, Gxbin, size);
+ if (res == MP_OKAY)
+ res = mp_read_unsigned_bin(mR->y, Gybin, size);
+ if (res == MP_OKAY)
+ res = mp_set(mR->z, 1);
+ }
+ }
+ return res;
+}
+
+#if defined(HAVE_ED25519) || defined(HAVE_CURVE25519)
+/* the curve25519 prime 2^255 - 19, in LSB-first format */
+static const uint8_t curve25519_modbin[32] = {
+ 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f};
+/* precomputed R2modN for the curve25519 */
+static const uint8_t r2mod[32] = {
+ 0xa4, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+/* invThree = ModInv(3,curve25519_modbin) in LSB first */
+static const uint8_t invThree[32] = {
+ 0x49, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55};
+
+/*
+ * Finds a square root in a finite field when the modulus is congruent to
+ * 5 modulo 8. This is fixed to the curve25519 modulus 2^255 - 19, which is
+ * congruent to 5 modulo 8.
+ *
+ * This function solves the equation: res^2 = a mod (2^255 - 19)
+ *
+ * Reference pseudocode, for prime p with p % 8 == 5:
+ *
+ *   v  = ModularArithmetic.powmod(2*a, (p-5)/8, p)
+ *   i  = (2*a*v**2) % p
+ *   r1 = ( 1*a*v*(i - 1)) % p
+ *   r2 = (-1*a*v*(i - 1)) % p
+ */
+status_t LTC_PKHA_Prime25519SquareRootMod(const uint8_t *A, size_t sizeA,
+ uint8_t *res, size_t *szRes, int sign)
+{
+ status_t status;
+ const uint8_t curve25519_param[] = {
+ 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x0f};
+ uint8_t twoA[sizeof(curve25519_modbin)] = {0};
+ uint8_t V[sizeof(curve25519_modbin)] = {0};
+ uint8_t I[sizeof(curve25519_modbin)] = {0};
+ uint8_t VV[sizeof(curve25519_modbin)] = {0};
+ uint16_t szTwoA = 0;
+ uint16_t szV = 0;
+ uint16_t szVV = 0;
+ uint16_t szI = 0;
+ uint16_t szRes16 = 0;
+ uint8_t one = 1;
+
+ /* twoA = 2*A % p */
+ status = LTC_PKHA_ModAdd(LTC_BASE, A, sizeA, A, sizeA, curve25519_modbin,
+ sizeof(curve25519_modbin), twoA, &szTwoA, kLTC_PKHA_IntegerArith);
+
+ /* V = ModularArithmetic.powmod(twoA, (p-5)/8, p) */
+ if (status == kStatus_Success) {
+ status =
+ LTC_PKHA_ModExp(LTC_BASE, twoA, szTwoA, curve25519_modbin,
+ sizeof(curve25519_modbin), curve25519_param,
+ sizeof(curve25519_param), V, &szV, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* VV = V*V % p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, V, szV, V, szV, curve25519_modbin,
+ sizeof(curve25519_modbin), VV, &szVV, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+
+ /* I = twoA * VV = 2*A*V*V % p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, twoA, szTwoA, VV, szVV,
+ curve25519_modbin, sizeof(curve25519_modbin), I, &szI,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* I = I - 1 */
+    XMEMSET(VV, 0xff, sizeof(VV)); /* reuse VV as an all-ones value so ModSub1 acts as a plain (non-modular) subtract */
+ if (0 <= LTC_PKHA_CompareBigNum(I, szI, &one, sizeof(one))) {
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, I, szI, &one, sizeof(one),
+ VV, sizeof(VV), I, &szI);
+ }
+ }
+ else {
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, curve25519_modbin,
+ sizeof(curve25519_modbin), &one, sizeof(one), VV, sizeof(VV), I,
+ &szI);
+ }
+ }
+
+    /* res = a*v mod p */
+    if (status == kStatus_Success) {
+        status = LTC_PKHA_ModMul(LTC_BASE, A, sizeA, V, szV, curve25519_modbin,
+            sizeof(curve25519_modbin), res, &szRes16, kLTC_PKHA_IntegerArith,
+            kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+            kLTC_PKHA_TimingEqualized);
+    }
+
+ /* res = res * (i-1) mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, res, szRes16, I, szI,
+ curve25519_modbin, sizeof(curve25519_modbin), res, &szRes16,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+    /* if (X mod 2) != sign then we need -X = p - X instead;
+     * X mod 2 is read from bit 0 of the least significant byte
+     */
+ if ((status == kStatus_Success) &&
+ ((bool)sign != (bool)(res[0] & 0x01u)))
+ {
+ status = LTC_PKHA_ModSub1(LTC_BASE, curve25519_modbin,
+ sizeof(curve25519_modbin), res, szRes16, VV, sizeof(VV), res,
+ &szRes16); /* -a = p - a */
+ }
+
+ if (status == kStatus_Success) {
+ *szRes = szRes16;
+ }
+
+ return status;
+}
+#endif /* HAVE_ED25519 || HAVE_CURVE25519 */
+
+
+#ifdef HAVE_CURVE25519
+
+/* For LTC we need the Weierstrass form of the curve25519 parameters.
+ * These two are the base point X and Y,
+ * in LSB-first format (native for LTC).
+ */
+static const ECPoint ecBasePoint = {
+ {0x5a, 0x24, 0xad, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0x2a},
+ {0xd9, 0xd3, 0xce, 0x7e, 0xa2, 0xc5, 0xe9, 0x29, 0xb2, 0x61, 0x7c,
+ 0x6d, 0x7e, 0x4d, 0x3d, 0x92, 0x4c, 0xd1, 0x48, 0x77, 0x2c, 0xdd,
+ 0x1e, 0xe0, 0xb4, 0x86, 0xa0, 0xb8, 0xa1, 0x19, 0xae, 0x20},
+};
+
+const ECPoint *wc_curve25519_GetBasePoint(void)
+{
+ return &ecBasePoint;
+}
+
+static const uint8_t curve25519_aCurveParam[CURVE25519_KEYSIZE] = {
+ 0x44, 0xa1, 0x14, 0x49, 0x98, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0x2a};
+
+static const uint8_t curve_bCurveParam[CURVE25519_KEYSIZE] = {
+ 0x64, 0xc8, 0x10, 0x77, 0x9c, 0x5e, 0x0b, 0x26, 0xb4, 0x97, 0xd0,
+ 0x5e, 0x42, 0x7b, 0x09, 0xed,
+ 0x25, 0xb4, 0x97, 0xd0, 0x5e, 0x42, 0x7b, 0x09, 0xed, 0x25, 0xb4,
+ 0x97, 0xd0, 0x5e, 0x42, 0x7b};
+
+/* transform a point on Montgomery curve to a point on Weierstrass curve */
+status_t LTC_PKHA_Curve25519ToWeierstrass(
+ const ltc_pkha_ecc_point_t *ltcPointIn,ltc_pkha_ecc_point_t *ltcPointOut)
+{
+ /* offset X point (in Montgomery) so that it becomes Weierstrass */
+ const uint8_t offset[] = {
+ 0x51, 0x24, 0xad, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0x2a};
+ uint16_t sizeRes = 0;
+ status_t status;
+ status = LTC_PKHA_ModAdd(LTC_BASE, ltcPointIn->X, CURVE25519_KEYSIZE,
+ offset, sizeof(offset), curve25519_modbin, CURVE25519_KEYSIZE,
+ ltcPointOut->X, &sizeRes, kLTC_PKHA_IntegerArith);
+
+ if (status == kStatus_Success) {
+ if (ltcPointOut->Y != ltcPointIn->Y) {
+ XMEMCPY(ltcPointOut->Y, ltcPointIn->Y, CURVE25519_KEYSIZE);
+ }
+ }
+
+ return status;
+}
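+
+/* For reference (the standard map, stated here as background): a Montgomery
+ * curve B*v^2 = u^3 + A*u^2 + u maps to Weierstrass form via
+ * t = u/B + A/(3*B), v' = v/B. Curve25519 has B = 1, so only the X
+ * coordinate needs the (A/3 mod p) offset applied above; Y is unchanged. */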
+
+/* transform a point on Weierstrass curve to a point on Montgomery curve */
+status_t LTC_PKHA_WeierstrassToCurve25519(
+ const ltc_pkha_ecc_point_t *ltcPointIn, ltc_pkha_ecc_point_t *ltcPointOut)
+{
+ status_t status;
+ uint16_t resultSize = 0;
+ const uint8_t three = 0x03;
+
+ status = LTC_PKHA_ModMul(LTC_BASE, &three, sizeof(three), ltcPointIn->X,
+ CURVE25519_KEYSIZE, curve25519_modbin, CURVE25519_KEYSIZE,
+ ltcPointOut->X, &resultSize, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+
+ if (status == kStatus_Success) {
+ const uint8_t A[] = {0x06, 0x6d, 0x07};
+ if (LTC_PKHA_CompareBigNum(ltcPointOut->X, resultSize, A, sizeof(A))) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, ltcPointOut->X, resultSize, A,
+ sizeof(A), curve25519_modbin, CURVE25519_KEYSIZE,
+ ltcPointOut->X, &resultSize);
+ }
+ else {
+ status = LTC_PKHA_ModSub2(LTC_BASE, ltcPointOut->X, resultSize, A,
+ sizeof(A), curve25519_modbin, CURVE25519_KEYSIZE,
+ ltcPointOut->X, &resultSize);
+ }
+ }
+
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, invThree, CURVE25519_KEYSIZE,
+ ltcPointOut->X, resultSize, curve25519_modbin, CURVE25519_KEYSIZE,
+ ltcPointOut->X, &resultSize, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+
+ if (status == kStatus_Success) {
+ if (ltcPointOut->Y != ltcPointIn->Y) {
+ XMEMCPY(ltcPointOut->Y, ltcPointIn->Y, CURVE25519_KEYSIZE);
+ }
+ }
+
+ return status;
+}
+
+/* Y = square root (X^3 + 486662*X^2 + X) */
+status_t LTC_PKHA_Curve25519ComputeY(ltc_pkha_ecc_point_t *ltcPoint)
+{
+ uint8_t three = 3;
+ const uint8_t A[] = {0x06, 0x6d, 0x07};
+ uint8_t U[CURVE25519_KEYSIZE] = {0};
+ uint8_t X2[CURVE25519_KEYSIZE] = {0};
+ uint16_t sizeU = 0;
+ uint16_t sizeX2 = 0;
+ size_t szRes = 0;
+ status_t status;
+
+ /* X^3 */
+ status = LTC_PKHA_ModExp(LTC_BASE, ltcPoint->X, CURVE25519_KEYSIZE,
+ curve25519_modbin, CURVE25519_KEYSIZE, &three, 1, U, &sizeU,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+
+ /* X^2 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, ltcPoint->X, CURVE25519_KEYSIZE,
+ ltcPoint->X, CURVE25519_KEYSIZE, curve25519_modbin,
+ CURVE25519_KEYSIZE, X2, &sizeX2, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+
+ /* 486662*X^2 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, A, sizeof(A), X2, sizeX2,
+ curve25519_modbin, CURVE25519_KEYSIZE, X2, &sizeX2,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* X^3 + 486662*X^2 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, U, sizeU, X2, sizeX2,
+ curve25519_modbin, CURVE25519_KEYSIZE, U, &sizeU,
+ kLTC_PKHA_IntegerArith);
+ }
+
+ /* U = X^3 + 486662*X^2 + X */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, U, sizeU, ltcPoint->X,
+ CURVE25519_KEYSIZE, curve25519_modbin, CURVE25519_KEYSIZE, U,
+ &sizeU, kLTC_PKHA_IntegerArith);
+ }
+
+ /* Y = modular square root of U (U is Y^2) */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_Prime25519SquareRootMod(U, sizeU, ltcPoint->Y,
+ &szRes, 1);
+ }
+
+ return status;
+}
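+
+/* Example (a sketch): a caller holding only the X coordinate (the Curve25519
+ * wire format) can recover Y with the helper above before mapping the point
+ * to Weierstrass form; xBuf and yBuf are hypothetical CURVE25519_KEYSIZE
+ * byte buffers:
+ *
+ * ltc_pkha_ecc_point_t pt = { xBuf, yBuf };
+ * if (LTC_PKHA_Curve25519ComputeY(&pt) == kStatus_Success) {
+ * ... pt now holds both affine coordinates ...
+ * }
+ */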
+
+/* Q = n*P */
+/* if type is kLTC_Curve25519, the input point p is in Montgomery curve
+ coordinates and is first mapped to the equivalent Weierstrass curve */
+/* the output point q is always in Montgomery curve coordinates */
+int wc_curve25519(ECPoint *q, byte *n, const ECPoint *p, fsl_ltc_ecc_coordinate_system_t type)
+{
+ status_t status;
+ ltc_pkha_ecc_point_t ltcPoint;
+ ltc_pkha_ecc_point_t ltcPointOut;
+ ECPoint pIn = {{0}};
+
+ XMEMCPY(&pIn, p, sizeof(*p));
+ ltcPoint.X = &pIn.point[0];
+ ltcPoint.Y = &pIn.pointY[0];
+
+ /* if input point P is on Curve25519 Montgomery curve, transform
+ it to Weierstrass equivalent */
+ if (type == kLTC_Curve25519) {
+ LTC_PKHA_Curve25519ToWeierstrass(&ltcPoint, &ltcPoint);
+ }
+
+ ltcPointOut.X = &q->point[0];
+ ltcPointOut.Y = &q->pointY[0];
+ /* curve25519_modbin, r2mod, curve25519_aCurveParam, curve25519_bCurveParam
+ * are Weierstrass equivalent with Curve25519 */
+ status = LTC_PKHA_ECC_PointMul(LTC_BASE, &ltcPoint, n, CURVE25519_KEYSIZE,
+ curve25519_modbin, r2mod, curve25519_aCurveParam,
+ curve25519_bCurveParam, CURVE25519_KEYSIZE, kLTC_PKHA_TimingEqualized,
+ kLTC_PKHA_IntegerArith, &ltcPointOut, NULL);
+
+ /* now need to map from Weierstrass form to Montgomery form */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_WeierstrassToCurve25519(&ltcPointOut, &ltcPointOut);
+ }
+
+ return (status == kStatus_Success) ? 0 : IS_POINT_E;
+}
+
+#endif /* HAVE_CURVE25519 */
+
+
+#ifdef HAVE_ED25519
+/* a and d are the Edwards curve parameters: a = -1 and d = -121665/121666;
+ * the prime is 2^255 - 19.
+ *
+ * https://en.wikipedia.org/wiki/Montgomery_curve#Equivalence_with_Edward_curves
+ */
+
+/* d parameter of ed25519 */
+static const uint8_t d_coefEd25519[] = {
+ 0xa3, 0x78, 0x59, 0x13, 0xca, 0x4d, 0xeb, 0x75, 0xab, 0xd8, 0x41,
+ 0x41, 0x4d, 0x0a, 0x70, 0x00, 0x98, 0xe8, 0x79, 0x77, 0x79, 0x40,
+ 0xc7, 0x8c, 0x73, 0xfe, 0x6f, 0x2b, 0xee, 0x6c, 0x03, 0x52};
+
+/* Montgomery curve parameter A for a Montgomery curve equivalent with ed25519 */
+static const uint8_t A_coefEd25519[] = {
+ 0x06, 0x6d, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+/* Montgomery curve parameter B for a Montgomery curve equivalent with ed25519 */
+static const uint8_t B_coefEd25519[] = {
+ 0xe5, 0x92, 0xf8, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f};
+
+/* pre-computed constants used by the mapping functions below */
+
+/* = 3*B */
+static const uint8_t threeB_coefEd25519[] = {
+ 0xd5, 0xb8, 0xe9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f};
+
+/* = -A */
+static const uint8_t minus_A_coefEd25519[] = {
+ 0xe7, 0x92, 0xf8, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f};
+
+/* = 1/B */
+static const uint8_t invB_coefEd25519[] = {
+ 0xc4, 0xa1, 0x29, 0x7b, 0x8d, 0x2c, 0x85, 0x22, 0xd5, 0x89, 0xaf,
+ 0xaf, 0x6c, 0xfd, 0xe3, 0xff, 0xd9, 0x85, 0x21, 0xa2, 0xe1, 0x2f,
+ 0xce, 0x1c, 0x63, 0x00, 0x24, 0x75, 0xc4, 0x24, 0x7f, 0x6b};
+
+/* = A/(3*B) */
+static const uint8_t A_mul_invThreeB_coefEd25519[] = {
+ 0xb9, 0x3e, 0xe4, 0xad, 0xa1, 0x37, 0xa7, 0x93, 0x1c, 0xa4, 0x35,
+ 0xe0, 0x0c, 0x57, 0xbd, 0xaa, 0x6e, 0x51, 0x94, 0x3e, 0x14, 0xe0,
+ 0xcb, 0xec, 0xbd, 0xff, 0xe7, 0xb1, 0x27, 0x92, 0x00, 0x63};
+
+/* Weierstrass curve parameter a for a Weierstrass curve equivalent with ed25519 */
+static const uint8_t a_coefEd25519[] = {
+ 0x2d, 0x17, 0xbc, 0xf8, 0x8e, 0xe1, 0x71, 0xac, 0xf7, 0x2a, 0xa5,
+ 0x0c, 0x5d, 0xb6, 0xb8, 0x6b, 0xd6, 0x3d, 0x7b, 0x61, 0x0d, 0xe1,
+ 0x97, 0x31, 0xe6, 0xbe, 0xb9, 0xa5, 0xd3, 0xac, 0x4e, 0x5d};
+
+/* Weierstrass curve parameter b for a Weierstrass curve equivalent with ed25519 */
+static const uint8_t b_coefEd25519[] = {
+ 0xa4, 0xb2, 0x64, 0xf3, 0xc1, 0xeb, 0x04, 0x90, 0x32, 0xbc, 0x9f,
+ 0x6b, 0x97, 0x31, 0x48, 0xf5, 0xd5, 0x80, 0x57, 0x10, 0x06, 0xdb,
+ 0x0d, 0x55, 0xe0, 0xb3, 0xd0, 0xcf, 0x9b, 0xb2, 0x11, 0x1d};
+
+/* Ed25519 basepoint B mapped to Weierstrass equivalent */
+static uint8_t Wx_Ed25519[ED25519_KEY_SIZE] = {
+ 0x35, 0xef, 0x5a, 0x02, 0x9b, 0xc8, 0x55, 0xca, 0x9a, 0x7c, 0x61,
+ 0x0d, 0xdf, 0x3f, 0xc1, 0xa9, 0x18, 0x06, 0xc2, 0xf1, 0x02, 0x8f,
+ 0x0b, 0xf0, 0x39, 0x03, 0x2c, 0xd0, 0x0f, 0xdd, 0x78, 0x2a};
+static uint8_t Wy_Ed25519[ED25519_KEY_SIZE] = {
+ 0x14, 0x1d, 0x2c, 0xf6, 0xf3, 0x30, 0x78, 0x9b, 0x65, 0x31, 0x71,
+ 0x80, 0x61, 0xd0, 0x6f, 0xcf, 0x23, 0x83, 0x79, 0x63, 0xa5, 0x3b,
+ 0x48, 0xbe, 0x2e, 0xa2, 0x1d, 0xc7, 0xa5, 0x44, 0xc6, 0x29};
+
+static const ltc_pkha_ecc_point_t basepointEd25519 = {
+ Wx_Ed25519, Wy_Ed25519,
+};
+
+const ltc_pkha_ecc_point_t *LTC_PKHA_Ed25519_BasePoint(void)
+{
+ return &basepointEd25519;
+}
+
+/* input point is on the Weierstrass curve; typeOut determines the coordinate
+ system of the output point (either Weierstrass or Ed25519) */
+status_t LTC_PKHA_Ed25519_PointMul(const ltc_pkha_ecc_point_t *ltcPointIn,
+ const uint8_t *N,
+ size_t sizeN,
+ ltc_pkha_ecc_point_t *ltcPointOut,
+ fsl_ltc_ecc_coordinate_system_t typeOut)
+{
+ uint16_t szN = (uint16_t)sizeN;
+ status_t status;
+ /* input on W, output in W, W parameters of ECC curve are Ed25519 curve
+ parameters mapped to Weierstrass curve */
+ status = LTC_PKHA_ECC_PointMul(LTC_BASE, ltcPointIn, N, szN,
+ curve25519_modbin, r2mod, a_coefEd25519, b_coefEd25519,
+ ED25519_KEY_SIZE, kLTC_PKHA_TimingEqualized, kLTC_PKHA_IntegerArith,
+ ltcPointOut, NULL);
+
+ /* Weierstrass coordinates to Ed25519 coordinates */
+ if ((status == kStatus_Success) && (typeOut == kLTC_Ed25519)) {
+ status = LTC_PKHA_WeierstrassToEd25519(ltcPointOut, ltcPointOut);
+ }
+ return status;
+}
+
+status_t LTC_PKHA_Ed25519ToWeierstrass(const ltc_pkha_ecc_point_t *ltcPointIn,
+ ltc_pkha_ecc_point_t *ltcPointOut)
+{
+ status_t status;
+ uint8_t Mx[ED25519_KEY_SIZE] = {0};
+ uint8_t My[ED25519_KEY_SIZE] = {0};
+ uint8_t temp[ED25519_KEY_SIZE] = {0};
+ uint8_t temp2[ED25519_KEY_SIZE] = {0};
+ const uint8_t max[32] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ const uint8_t *Ex;
+ const uint8_t *Ey;
+ uint8_t *Gx;
+ uint8_t *Gy;
+ uint16_t szMx = 0;
+ uint16_t szGx = 0;
+ uint16_t szMy = 0;
+ uint16_t szGy = 0;
+ uint16_t szTemp = 0;
+ uint16_t szTemp2 = 0;
+ uint8_t one = 1;
+
+ Ex = ltcPointIn->X;
+ Ey = ltcPointIn->Y;
+ Gx = ltcPointOut->X;
+ Gy = ltcPointOut->Y;
+ /* # (Ex, Ey) on Ed (a_ed, d) to (x, y) on M (A,B)
+ Mx = (1 + Ey) * ModularArithmetic.invert(1 - Ey, prime) % prime
+ My = (1 + Ey) * ModularArithmetic.invert((1 - Ey)*Ex, prime) % prime */
+
+ /* Gx = ((Mx * ModularArithmetic.invert(B, prime)) +
+ (A * ModularArithmetic.invert(3*B, prime))) % prime
+ Gy = (My * ModularArithmetic.invert(B, prime)) % prime */
+
+ /* temp = 1 + Ey */
+ status = LTC_PKHA_ModAdd(LTC_BASE, Ey, ED25519_KEY_SIZE, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), temp, &szTemp,
+ kLTC_PKHA_IntegerArith);
+
+ /* temp2 = 1 - Ey = 1 + (p - Ey) */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, curve25519_modbin,
+ sizeof(curve25519_modbin), Ey, ED25519_KEY_SIZE, max, sizeof(max),
+ temp2, &szTemp2);
+ }
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, temp2, szTemp2, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), temp2, &szTemp2,
+ kLTC_PKHA_IntegerArith);
+ }
+
+ /* Mx = ModInv(temp2,prime) */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModInv(LTC_BASE, temp2, szTemp2, curve25519_modbin,
+ sizeof(curve25519_modbin), Mx, &szMx, kLTC_PKHA_IntegerArith);
+ }
+
+ /* Mx = Mx * temp */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, Mx, szMx, temp, szTemp,
+ curve25519_modbin, ED25519_KEY_SIZE, Mx, &szMx,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* My = temp2 * Ex */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, Ex, ED25519_KEY_SIZE, temp2,
+ szTemp2, curve25519_modbin, ED25519_KEY_SIZE, My, &szMy,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* My = ModInv(My, prime) */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModInv(LTC_BASE, My, szMy, curve25519_modbin,
+ sizeof(curve25519_modbin), My, &szMy, kLTC_PKHA_IntegerArith);
+ }
+ /* My = My * temp */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, My, szMy, temp, szTemp,
+ curve25519_modbin, ED25519_KEY_SIZE, My, &szMy,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* Gx = Mx * invB_coefEd25519 + A_mul_invThreeB_coefEd25519 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, Mx, szMx, invB_coefEd25519,
+ sizeof(invB_coefEd25519), curve25519_modbin, ED25519_KEY_SIZE, Gx,
+ &szGx, kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, Gx, szGx,
+ A_mul_invThreeB_coefEd25519, sizeof(A_mul_invThreeB_coefEd25519),
+ curve25519_modbin, sizeof(curve25519_modbin), Gx, &szGx,
+ kLTC_PKHA_IntegerArith);
+ }
+
+ /* Gy = My * invB_coefEd25519 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, My, szMy, invB_coefEd25519,
+ sizeof(invB_coefEd25519), curve25519_modbin, ED25519_KEY_SIZE, Gy,
+ &szGy, kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ return status;
+}
+
+/*
+# (Gx, Gy) on W to (Ex, Ey) on E
+My = (B*Gy) % prime
+Mx = ((3*B*Gx-A)*ModularArithmetic.invert(3, prime)) % prime
+Ex = Mx*ModularArithmetic.invert(My, prime) % prime
+Ey = (Mx - 1)*ModularArithmetic.invert(Mx + 1, prime) % prime
+*/
+status_t LTC_PKHA_WeierstrassToEd25519(const ltc_pkha_ecc_point_t *ltcPointIn,
+ ltc_pkha_ecc_point_t *ltcPointOut)
+{
+ status_t status;
+ uint8_t Mx[ED25519_KEY_SIZE] = {0};
+ uint8_t My[ED25519_KEY_SIZE] = {0};
+ uint8_t temp[ED25519_KEY_SIZE] = {0};
+ const uint8_t *Gx;
+ const uint8_t *Gy;
+ uint8_t *Ex;
+ uint8_t *Ey;
+ uint16_t szMx = 0;
+ uint16_t szEx = 0;
+ uint16_t szMy = 0;
+ uint16_t szEy = 0;
+ uint16_t szTemp = 0;
+ uint8_t one = 1;
+
+ Gx = ltcPointIn->X;
+ Gy = ltcPointIn->Y;
+ Ex = ltcPointOut->X;
+ Ey = ltcPointOut->Y;
+
+ /* My = (B*Gy) % prime */
+ status = LTC_PKHA_ModMul(LTC_BASE, B_coefEd25519, sizeof(B_coefEd25519),
+ Gy, ED25519_KEY_SIZE, curve25519_modbin, ED25519_KEY_SIZE, My, &szMy,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+
+ /* temp = 3*B*Gx mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, threeB_coefEd25519,
+ sizeof(threeB_coefEd25519), Gx, ED25519_KEY_SIZE, curve25519_modbin,
+ ED25519_KEY_SIZE, temp, &szTemp, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+ /* temp = (temp - A) mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, temp, szTemp, minus_A_coefEd25519,
+ sizeof(minus_A_coefEd25519), curve25519_modbin,
+ sizeof(curve25519_modbin), temp, &szTemp, kLTC_PKHA_IntegerArith);
+ }
+ /* Mx = (temp/3) mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, temp, szTemp, invThree,
+ sizeof(invThree), curve25519_modbin, sizeof(curve25519_modbin), Mx,
+ &szMx, kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+ /* temp = 1/My mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModInv(LTC_BASE, My, szMy, curve25519_modbin,
+ sizeof(curve25519_modbin), temp, &szTemp, kLTC_PKHA_IntegerArith);
+ }
+ /* Ex = Mx * temp mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, temp, szTemp, Mx, szMx,
+ curve25519_modbin, sizeof(curve25519_modbin), Ex, &szEx,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ /* temp = Mx + 1 mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, Mx, szMx, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), temp, &szTemp,
+ kLTC_PKHA_IntegerArith);
+ }
+ /* temp = 1/temp mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModInv(LTC_BASE, temp, szTemp, curve25519_modbin,
+ sizeof(curve25519_modbin), temp, &szTemp, kLTC_PKHA_IntegerArith);
+ }
+ /* Mx = (Mx - 1) mod p */
+ if (status == kStatus_Success) {
+ if (LTC_PKHA_CompareBigNum(Mx, szMx, &one, sizeof(one)) >= 0) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, Mx, szMx, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), Mx, &szMx);
+ }
+ else {
+ /* Mx is zero, so it is modulus, thus we do modulus - 1 */
+ XMEMCPY(Mx, curve25519_modbin, sizeof(curve25519_modbin));
+ Mx[0]--;
+ }
+ }
+ /* Ey = Mx * temp mod p */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, temp, szTemp, Mx, szMx,
+ curve25519_modbin, sizeof(curve25519_modbin), Ey, &szEy,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ return status;
+}
+
+status_t LTC_PKHA_Ed25519_PointDecompress(const uint8_t *pubkey,
+ size_t pubKeySize, ltc_pkha_ecc_point_t *ltcPointOut)
+{
+ status_t status;
+ const uint8_t one = 1;
+
+ /* pubkey contains the Y coordinate and the sign bit of X */
+
+ /* x^2 = ((y^2 - 1) / (d*y^2 +1)) mod p */
+
+ /* decode Y from pubkey */
+ XMEMCPY(ltcPointOut->Y, pubkey, pubKeySize);
+ ltcPointOut->Y[pubKeySize - 1] &= ~0x80u;
+ int sign = (int)(bool)(pubkey[pubKeySize - 1] & 0x80u);
+
+ uint8_t U[ED25519_KEY_SIZE] = {0};
+ uint8_t V[ED25519_KEY_SIZE] = {0};
+ uint8_t *X = ltcPointOut->X;
+ uint8_t *Y = ltcPointOut->Y;
+ uint16_t szU = 0;
+ uint16_t szV = 0;
+ size_t szRes = 0;
+
+ /* decode X from pubkey */
+
+ /* U = y * y mod p */
+ status = LTC_PKHA_ModMul(LTC_BASE, Y, ED25519_KEY_SIZE, Y,
+ ED25519_KEY_SIZE, curve25519_modbin, ED25519_KEY_SIZE, U, &szU,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ XMEMCPY(V, U, szU);
+ szV = szU;
+
+ /* U = U - 1 = y^2 - 1 */
+ if (status == kStatus_Success) {
+ if (LTC_PKHA_CompareBigNum(U, szU, &one, sizeof(one)) >= 0) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, U, szU, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), U, &szU);
+ }
+ else {
+ /* U is zero, so it is modulus, thus we do modulus - 1 */
+ XMEMCPY(U, curve25519_modbin, sizeof(curve25519_modbin));
+ U[0]--;
+ }
+ }
+
+ /* V = d*y*y + 1 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, V, szV, d_coefEd25519,
+ ED25519_KEY_SIZE, curve25519_modbin, ED25519_KEY_SIZE, V, &szV,
+ kLTC_PKHA_IntegerArith, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_TimingEqualized);
+ }
+
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, V, szV, &one, sizeof(one),
+ curve25519_modbin, sizeof(curve25519_modbin), V, &szV,
+ kLTC_PKHA_IntegerArith);
+ }
+
+ /* U = U / V (mod p) */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModInv(LTC_BASE, V, szV, curve25519_modbin,
+ sizeof(curve25519_modbin), V, &szV, kLTC_PKHA_IntegerArith);
+ }
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, V, szV, U, szU, curve25519_modbin,
+ ED25519_KEY_SIZE, U, &szU, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+
+ /* get square root */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_Prime25519SquareRootMod(U, szU, X, &szRes, sign);
+ }
+
+ return status;
+}
+
+/* Ed25519 group order l = 2^252 + 27742317777372353535851937790883648493, LSByte first */
+static const uint8_t l_coefEdDSA[] = {
+ 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7,
+ 0xa2, 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10};
+
+/*
+Input:
+ s[0]+256*s[1]+...+256^63*s[63] = s
+
+Output:
+ s[0]+256*s[1]+...+256^31*s[31] = s mod l
+ where l = 2^252 + 27742317777372353535851937790883648493.
+ Overwrites s in place.
+*/
+status_t LTC_PKHA_sc_reduce(uint8_t *a)
+{
+ uint16_t szA = 0;
+ return LTC_PKHA_ModRed(LTC_BASE, a, 64, l_coefEdDSA, sizeof(l_coefEdDSA),
+ a, &szA, kLTC_PKHA_IntegerArith);
+}
+
+/*
+Input:
+ a[0]+256*a[1]+...+256^31*a[31] = a
+ b[0]+256*b[1]+...+256^31*b[31] = b
+ c[0]+256*c[1]+...+256^31*c[31] = c
+
+Output:
+ s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
+ where l = 2^252 + 27742317777372353535851937790883648493.
+*/
+status_t LTC_PKHA_sc_muladd(uint8_t *s, const uint8_t *a,
+ const uint8_t *b, const uint8_t *c)
+{
+ uint16_t szS = 0;
+ uint16_t szB = 0;
+ uint8_t tempB[32] = {0};
+ status_t status;
+
+ /* Assume only b can be larger than the modulus. This is called during
+ * wc_ed25519_sign_msg(), where hram (=a) and nonce (=c)
+ * have already been reduced by LTC_PKHA_sc_reduce(),
+ * so only b is reduced here.
+ */
+ status = LTC_PKHA_ModRed(LTC_BASE, b, 32, l_coefEdDSA, sizeof(l_coefEdDSA),
+ tempB, &szB, kLTC_PKHA_IntegerArith);
+
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModMul(LTC_BASE, a, 32, tempB, szB, l_coefEdDSA,
+ sizeof(l_coefEdDSA), s, &szS, kLTC_PKHA_IntegerArith,
+ kLTC_PKHA_NormalValue, kLTC_PKHA_NormalValue,
+ kLTC_PKHA_TimingEqualized);
+ }
+
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModAdd(LTC_BASE, s, szS, c, 32, l_coefEdDSA, 32, s,
+ &szS, kLTC_PKHA_IntegerArith);
+ }
+
+ return status;
+}
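+
+/* Example (a sketch): the EdDSA signature scalar S = (H(R,A,M)*s + r) mod l
+ * can be formed with the two helpers above; hram, privScalar and nonce are
+ * hypothetical 32-byte buffers, with hram and nonce already reduced by
+ * LTC_PKHA_sc_reduce():
+ *
+ * LTC_PKHA_sc_muladd(S, hram, privScalar, nonce);
+ */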
+
+/*
+r = a * A + b * B
+where A is public key point, B is basepoint
+where a = a[0]+256*a[1]+...+256^31 a[31].
+and b = b[0]+256*b[1]+...+256^31 b[31].
+B is the Ed25519 base point (x,4/5) with x positive.
+*/
+status_t LTC_PKHA_SignatureForVerify(uint8_t *rcheck, const unsigned char *a,
+ const unsigned char *b, ed25519_key *key)
+{
+ /* To verify a signature on a message M, first split the signature
+ into two 32-octet halves. Decode the first half as a point R,
+ and the second half as an integer s, in the range 0 <= s < q. If
+ the decoding fails, the signature is invalid. */
+
+ /* Check the group equation 8s B = 8 R + 8k A. */
+
+ /*
+ Uses fast single-signature verification: SB = R + H(R,A,M)A is rearranged
+ to SB - H(R,A,M)A = R, which saves decompressing R
+ */
+ uint8_t X0[ED25519_PUB_KEY_SIZE] = {0};
+ uint8_t X1[ED25519_PUB_KEY_SIZE] = {0};
+ uint8_t Y0[ED25519_PUB_KEY_SIZE] = {0};
+ uint8_t Y1[ED25519_PUB_KEY_SIZE] = {0};
+ const uint8_t max[32] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ltc_pkha_ecc_point_t ltc0;
+ ltc_pkha_ecc_point_t ltc1;
+ ltc_pkha_ecc_point_t pubKey;
+ status_t status;
+
+ /* The equality for the negative of a point P, in affine coordinates,
+ is -P = -(x,y) = (x, -y) */
+ uint16_t szY = 32;
+
+ ltc0.X = X0;
+ ltc1.X = X1;
+ ltc0.Y = Y0;
+ ltc1.Y = Y1;
+ pubKey.X = key->pointX;
+ pubKey.Y = key->pointY;
+
+ /* ltc0 = b*B */
+ status = LTC_PKHA_Ed25519_PointMul(LTC_PKHA_Ed25519_BasePoint(), b,
+ ED25519_KEY_SIZE, &ltc0, kLTC_Weierstrass /* result in W */);
+
+ /* ltc1 = a*A */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_Ed25519ToWeierstrass(&pubKey, &ltc1);
+ }
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_Ed25519_PointMul(&ltc1, a, ED25519_KEY_SIZE, &ltc1,
+ kLTC_Weierstrass /* result in W */);
+ }
+
+ /* R = b*B - a*A */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ModSub1(LTC_BASE, curve25519_modbin,
+ sizeof(curve25519_modbin), ltc1.Y, szY, max, sizeof(max), ltc1.Y,
+ &szY);
+ }
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_ECC_PointAdd(LTC_BASE, &ltc0, &ltc1,
+ curve25519_modbin, r2mod, a_coefEd25519, b_coefEd25519,
+ ED25519_KEY_SIZE, kLTC_PKHA_IntegerArith, &ltc0);
+ }
+ /* map to Ed25519 */
+ if (status == kStatus_Success) {
+ status = LTC_PKHA_WeierstrassToEd25519(&ltc0, &ltc0);
+ }
+ if (((uint32_t)ltc0.X[0]) & 0x01u) {
+ ltc0.Y[ED25519_KEY_SIZE - 1] |= 0x80u;
+ }
+
+ XMEMCPY(rcheck, ltc0.Y, ED25519_KEY_SIZE);
+ return status;
+}
+
+status_t LTC_PKHA_Ed25519_Compress(const ltc_pkha_ecc_point_t *ltcPointIn,
+ uint8_t *p)
+{
+ /* compress */
+ /* get sign of X per https://tools.ietf.org/html/draft-josefsson-eddsa-ed25519-02
+ * To form the encoding of the point, copy the least
+ significant bit of the x-coordinate to the most significant bit of
+ the final octet
+ */
+ XMEMCPY(p, ltcPointIn->Y, ED25519_KEY_SIZE);
+ if (((uint32_t)ltcPointIn->X[0]) & 0x01u) {
+ p[ED25519_KEY_SIZE - 1] |= 0x80u;
+ }
+ return kStatus_Success;
+}
+
+#endif /* HAVE_ED25519 */
+#endif /* FREESCALE_LTC_ECC */
+
+
+#undef ERROR_OUT
+
+#endif /* FREESCALE_LTC_TFM || FREESCALE_LTC_ECC */
diff --git a/wolfcrypt/src/port/pic32/pic32mz-crypt.c b/wolfcrypt/src/port/pic32/pic32mz-crypt.c
new file mode 100644
index 0000000..1e618c1
--- /dev/null
+++ b/wolfcrypt/src/port/pic32/pic32mz-crypt.c
@@ -0,0 +1,804 @@
+/* pic32mz-crypt.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_MICROCHIP_PIC32MZ
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h>
+
+#ifdef WOLFSSL_PIC32MZ_CRYPT
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/des3.h>
+#endif
+
+#ifdef WOLFSSL_PIC32MZ_HASH
+#include <wolfssl/wolfcrypt/md5.h>
+#include <wolfssl/wolfcrypt/sha.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+#endif
+
+
+#if defined(WOLFSSL_PIC32MZ_CRYPT) || defined(WOLFSSL_PIC32MZ_HASH)
+
+static int Pic32GetBlockSize(int algo)
+{
+ switch (algo) {
+ case PIC32_ALGO_HMAC1:
+ return PIC32_BLOCKSIZE_HMAC;
+ case PIC32_ALGO_SHA256:
+ return PIC32_BLOCKSIZE_SHA256;
+ case PIC32_ALGO_SHA1:
+ return PIC32_BLOCKSIZE_SHA1;
+ case PIC32_ALGO_MD5:
+ return PIC32_BLOCKSIZE_MD5;
+ case PIC32_ALGO_AES:
+ return PIC32_BLOCKSIZE_AES;
+ case PIC32_ALGO_TDES:
+ return PIC32_BLOCKSIZE_TDES;
+ case PIC32_ALGO_DES:
+ return PIC32_BLOCKSIZE_DES;
+ }
+ return 0;
+}
+
+static int Pic32Crypto(const byte* pIn, int inLen, word32* pOut, int outLen,
+ int dir, int algo, int cryptoalgo,
+
+ /* For DES/AES only */
+ word32* key, int keyLen, word32* iv, int ivLen)
+{
+ int ret = 0;
+ int blockSize = Pic32GetBlockSize(algo);
+ volatile bufferDescriptor bd __attribute__((aligned (8)));
+ securityAssociation sa __attribute__((aligned (8)));
+ securityAssociation *sa_p;
+ bufferDescriptor *bd_p;
+ byte *in_p;
+ byte *out_p;
+ word32* dst;
+ word32 padRemain;
+ int timeout = 0xFFFFFF;
+ word32* in = (word32*)pIn;
+ word32* out = pOut;
+ int isDynamic = 0;
+
+ /* check args */
+ if (in == NULL || inLen <= 0 || out == NULL || blockSize == 0) {
+ return BAD_FUNC_ARG;
+ }
+
+ /* check pointer alignment - must be word aligned */
+ if (((size_t)in % sizeof(word32)) || ((size_t)out % sizeof(word32))) {
+ /* dynamically allocate aligned pointers */
+ isDynamic = 1;
+ in = (word32*)XMALLOC(inLen, NULL, DYNAMIC_TYPE_AES_BUFFER);
+ if (in == NULL)
+ return MEMORY_E;
+ if ((word32*)pIn == pOut) /* in-place */
+ out = (word32*)in;
+ else {
+ out = (word32*)XMALLOC(outLen, NULL, DYNAMIC_TYPE_AES_BUFFER);
+ if (out == NULL) {
+ XFREE(in, NULL, DYNAMIC_TYPE_AES_BUFFER);
+ return MEMORY_E;
+ }
+ }
+ XMEMCPY(in, pIn, inLen);
+ }
+
+ /* get uncached address */
+ sa_p = KVA0_TO_KVA1(&sa);
+ bd_p = KVA0_TO_KVA1(&bd);
+ out_p = KVA0_TO_KVA1(out);
+ in_p = KVA0_TO_KVA1(in);
+
+ /* Sync cache if in physical memory (not flash) */
+ if (PIC32MZ_IF_RAM(in_p)) {
+ XMEMCPY(in_p, in, inLen);
+ }
+
+ /* Set up the Security Association */
+ XMEMSET(sa_p, 0, sizeof(sa));
+ sa_p->SA_CTRL.ALGO = algo;
+ sa_p->SA_CTRL.ENCTYPE = dir;
+ sa_p->SA_CTRL.FB = 1; /* first block */
+ sa_p->SA_CTRL.LNC = 1; /* Load new set of keys */
+ if (key) {
+ /* cipher */
+ sa_p->SA_CTRL.CRYPTOALGO = cryptoalgo;
+
+ switch (keyLen) {
+ case 32:
+ sa_p->SA_CTRL.KEYSIZE = PIC32_KEYSIZE_256;
+ break;
+ case 24:
+ case 8: /* DES */
+ sa_p->SA_CTRL.KEYSIZE = PIC32_KEYSIZE_192;
+ break;
+ case 16:
+ sa_p->SA_CTRL.KEYSIZE = PIC32_KEYSIZE_128;
+ break;
+ }
+
+ dst = (word32*)KVA0_TO_KVA1(sa.SA_ENCKEY +
+ (sizeof(sa.SA_ENCKEY)/sizeof(word32)) - (keyLen/sizeof(word32)));
+ ByteReverseWords(dst, key, keyLen);
+
+ if (iv && ivLen > 0) {
+ sa_p->SA_CTRL.LOADIV = 1;
+ dst = (word32*)KVA0_TO_KVA1(sa.SA_ENCIV +
+ (sizeof(sa.SA_ENCIV)/sizeof(word32)) - (ivLen/sizeof(word32)));
+ ByteReverseWords(dst, iv, ivLen);
+ }
+ }
+ else {
+ /* hashing */
+ sa_p->SA_CTRL.LOADIV = 1;
+ sa_p->SA_CTRL.IRFLAG = 0; /* immediate result for hashing */
+
+ dst = (word32*)KVA0_TO_KVA1(sa.SA_AUTHIV +
+ (sizeof(sa.SA_AUTHIV)/sizeof(word32)) - (outLen/sizeof(word32)));
+ ByteReverseWords(dst, out, outLen);
+ }
+
+ /* Set up the Buffer Descriptor */
+ XMEMSET(bd_p, 0, sizeof(bd));
+ bd_p->BD_CTRL.BUFLEN = inLen;
+ padRemain = (inLen % 4); /* make sure buffer is 4-byte multiple */
+ if (padRemain != 0) {
+ bd_p->BD_CTRL.BUFLEN += (4 - padRemain);
+ }
+ bd_p->BD_CTRL.SA_FETCH_EN = 1; /* Fetch the security association */
+ bd_p->BD_CTRL.PKT_INT_EN = 1; /* enable interrupt */
+ bd_p->BD_CTRL.LAST_BD = 1; /* last buffer desc in chain */
+ bd_p->BD_CTRL.LIFM = 1; /* last in frame */
+ bd_p->SA_ADDR = (unsigned int)KVA_TO_PA(&sa);
+ bd_p->SRCADDR = (unsigned int)KVA_TO_PA(in);
+ if (key) {
+ /* cipher */
+ if (in != out)
+ XMEMSET(out_p, 0, outLen); /* clear output buffer */
+ bd_p->DSTADDR = (unsigned int)KVA_TO_PA(out);
+ }
+ else {
+ /* hashing */
+ /* digest result returned in UPDPTR */
+ bd_p->UPDPTR = (unsigned int)KVA_TO_PA(out);
+ }
+ bd_p->NXTPTR = (unsigned int)KVA_TO_PA(&bd);
+ bd_p->MSGLEN = inLen; /* actual message size */
+ bd_p->BD_CTRL.DESC_EN = 1; /* enable this descriptor */
+
+ /* begin access to hardware */
+ ret = wolfSSL_CryptHwMutexLock();
+ if (ret == 0) {
+ /* Software Reset the Crypto Engine */
+ CECON = 1 << 6;
+ while (CECON);
+
+ /* Clear the interrupt flags */
+ CEINTSRC = 0xF;
+
+ /* Run the engine */
+ CEBDPADDR = (unsigned int)KVA_TO_PA(&bd);
+ CEINTEN = 0x07; /* enable DMA Packet Completion Interrupt */
+
+ /* input swap, enable BD fetch and start DMA */
+ #if PIC32_NO_OUT_SWAP
+ CECON = 0x25;
+ #else
+ CECON = 0xa5; /* bit 7 = enable out swap */
+ #endif
+
+ /* wait for operation to complete */
+ while (CEINTSRCbits.PKTIF == 0 && --timeout > 0) {};
+
+ /* Clear the interrupt flags */
+ CEINTSRC = 0xF;
+
+ /* check for errors */
+ if (CESTATbits.ERROP || timeout <= 0) {
+ #if 0
+ printf("PIC32 Crypto: ERROP %x, ERRPHASE %x, TIMEOUT %s\n",
+ CESTATbits.ERROP, CESTATbits.ERRPHASE, timeout <= 0 ? "yes" : "no");
+ #endif
+ ret = ASYNC_OP_E;
+ }
+
+ wolfSSL_CryptHwMutexUnLock();
+
+ /* copy result to output */
+ #if PIC32_NO_OUT_SWAP
+ /* swap bytes */
+ ByteReverseWords(out, (word32*)out_p, outLen);
+ #elif defined(_SYS_DEVCON_LOCAL_H)
+ /* sync cache */
+ SYS_DEVCON_DataCacheInvalidate((word32)out, outLen);
+ #else
+ XMEMCPY(out, out_p, outLen);
+ #endif
+ }
+
+ /* handle unaligned */
+ if (isDynamic) {
+ /* return result */
+ XMEMCPY(pOut, out, outLen);
+
+ /* free dynamic buffers */
+ XFREE(in, NULL, DYNAMIC_TYPE_AES_BUFFER);
+ if ((word32*)pIn != pOut)
+ XFREE(out, NULL, DYNAMIC_TYPE_AES_BUFFER);
+ }
+
+ return ret;
+}
+#endif /* WOLFSSL_PIC32MZ_CRYPT || WOLFSSL_PIC32MZ_HASH */
+
+
+#ifdef WOLFSSL_PIC32MZ_HASH
+
+#ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+
+/* tunable large hash block size */
+#ifndef PIC32_BLOCK_SIZE
+ #define PIC32_BLOCK_SIZE 256
+#endif
+
+#define PIC32MZ_MIN_BLOCK 64
+#define PIC32MZ_MAX_BLOCK (32*1024)
+
+#ifndef PIC32MZ_MAX_BD
+ #define PIC32MZ_MAX_BD 2
+#endif
+
+#if PIC32_BLOCK_SIZE < PIC32MZ_MIN_BLOCK
+ #error Encryption block size must be at least 64 bytes.
+#endif
+
+/* Crypt Engine descriptor */
+typedef struct {
+ int currBd;
+ int err;
+ unsigned int msgSize;
+ uint32_t processed;
+ uint32_t dbPtr;
+ int engine_ready;
+ volatile bufferDescriptor bd[PIC32MZ_MAX_BD] __attribute__((aligned (8)));
+ securityAssociation sa __attribute__((aligned (8)));
+} pic32mz_desc;
+
+static pic32mz_desc gLHDesc;
+static uint8_t gLHDataBuf[PIC32MZ_MAX_BD][PIC32_BLOCK_SIZE] __attribute__((aligned (4), coherent));
+
+static void reset_engine(pic32mz_desc *desc, int algo)
+{
+ int i;
+ pic32mz_desc* uc_desc = KVA0_TO_KVA1(desc);
+
+ wolfSSL_CryptHwMutexLock();
+
+ /* Software reset */
+ CECON = 1 << 6;
+ while (CECON);
+
+ /* Clear the interrupt flags */
+ CEINTSRC = 0xF;
+
+ /* Make sure everything is clear first before we setup */
+ XMEMSET(desc, 0, sizeof(pic32mz_desc));
+ XMEMSET((void *)&uc_desc->sa, 0, sizeof(uc_desc->sa));
+
+ /* Set up the Security Association */
+ uc_desc->sa.SA_CTRL.ALGO = algo;
+ uc_desc->sa.SA_CTRL.LNC = 1;
+ uc_desc->sa.SA_CTRL.FB = 1;
+ uc_desc->sa.SA_CTRL.ENCTYPE = 1;
+ uc_desc->sa.SA_CTRL.LOADIV = 1;
+
+ /* Set up the Buffer Descriptor */
+ uc_desc->err = 0;
+ for (i = 0; i < PIC32MZ_MAX_BD; i++) {
+ XMEMSET((void *)&uc_desc->bd[i], 0, sizeof(uc_desc->bd[i]));
+ uc_desc->bd[i].BD_CTRL.LAST_BD = 1;
+ uc_desc->bd[i].BD_CTRL.LIFM = 1;
+ uc_desc->bd[i].BD_CTRL.PKT_INT_EN = 1;
+ uc_desc->bd[i].SA_ADDR = KVA_TO_PA(&uc_desc->sa);
+ uc_desc->bd[i].SRCADDR = KVA_TO_PA(&gLHDataBuf[i]);
+ if (PIC32MZ_MAX_BD > i+1)
+ uc_desc->bd[i].NXTPTR = KVA_TO_PA(&uc_desc->bd[i+1]);
+ else
+ uc_desc->bd[i].NXTPTR = KVA_TO_PA(&uc_desc->bd[0]);
+ XMEMSET((void *)&gLHDataBuf[i], 0, PIC32_BLOCK_SIZE);
+ }
+ uc_desc->bd[0].BD_CTRL.SA_FETCH_EN = 1; /* Fetch the security association on the first BD */
+ desc->dbPtr = 0;
+ desc->currBd = 0;
+ desc->msgSize = 0;
+ desc->processed = 0;
+ CEBDPADDR = KVA_TO_PA(&(desc->bd[0]));
+
+ CEPOLLCON = 10;
+
+#if PIC32_NO_OUT_SWAP
+ CECON = 0x27;
+#else
+ CECON = 0xa7;
+#endif
+}
+
+static void update_engine(pic32mz_desc *desc, const byte *input, word32 len,
+ word32 *hash)
+{
+ int total;
+ pic32mz_desc *uc_desc = KVA0_TO_KVA1(desc);
+
+ uc_desc->bd[desc->currBd].UPDPTR = KVA_TO_PA(hash);
+
+ /* Add the data to the current buffer. If the buffer fills, start processing it
+ and fill the next one. */
+ while (len) {
+ /* If we've been given the message size, we can process along the
+ way.
+ Enable the current buffer descriptor if it is full. */
+ if (desc->dbPtr >= PIC32_BLOCK_SIZE) {
+ /* Wrap up the buffer descriptor and enable it so the engine can process */
+ uc_desc->bd[desc->currBd].MSGLEN = desc->msgSize;
+ uc_desc->bd[desc->currBd].BD_CTRL.BUFLEN = desc->dbPtr;
+ uc_desc->bd[desc->currBd].BD_CTRL.LAST_BD = 0;
+ uc_desc->bd[desc->currBd].BD_CTRL.LIFM = 0;
+ uc_desc->bd[desc->currBd].BD_CTRL.DESC_EN = 1;
+ /* Move to the next buffer descriptor, or wrap around. */
+ desc->currBd++;
+ if (desc->currBd >= PIC32MZ_MAX_BD)
+ desc->currBd = 0;
+ /* Wait until the engine has processed the new BD. */
+ while (uc_desc->bd[desc->currBd].BD_CTRL.DESC_EN);
+ uc_desc->bd[desc->currBd].UPDPTR = KVA_TO_PA(hash);
+ desc->dbPtr = 0;
+ }
+ if (!PIC32MZ_IF_RAM(input)) {
+ /* If we're inputting from flash, point the BD at the
+ source address and use the maximum block size */
+ uc_desc->bd[desc->currBd].SRCADDR = KVA_TO_PA(input);
+ total = (len > PIC32MZ_MAX_BLOCK ? PIC32MZ_MAX_BLOCK : len);
+ desc->dbPtr = total;
+ len -= total;
+ input += total;
+ }
+ else {
+ if (len > PIC32_BLOCK_SIZE - desc->dbPtr) {
+ /* We have more data than can be put in the buffer. Fill what we can.*/
+ total = PIC32_BLOCK_SIZE - desc->dbPtr;
+ XMEMCPY(&gLHDataBuf[desc->currBd][desc->dbPtr], input, total);
+ len -= total;
+ desc->dbPtr = PIC32_BLOCK_SIZE;
+ input += total;
+ }
+ else {
+ /* Fill up what we have, but don't turn on the engine.*/
+ XMEMCPY(&gLHDataBuf[desc->currBd][desc->dbPtr], input, len);
+ desc->dbPtr += len;
+ len = 0;
+ }
+ }
+ }
+}
+
+static void start_engine(pic32mz_desc *desc)
+{
+ /* Wrap up the last buffer descriptor and enable it */
+ int bufferLen;
+ pic32mz_desc *uc_desc = KVA0_TO_KVA1(desc);
+
+ bufferLen = desc->dbPtr;
+ if (bufferLen % 4)
+ bufferLen = (bufferLen + 4) - (bufferLen % 4);
+ /* initialize MSGLEN on engine startup to avoid an infinite loop when the
+ * total length is no larger than PIC32_BLOCK_SIZE (256 by default) */
+ uc_desc->bd[desc->currBd].MSGLEN = desc->msgSize;
+ uc_desc->bd[desc->currBd].BD_CTRL.BUFLEN = bufferLen;
+ uc_desc->bd[desc->currBd].BD_CTRL.LAST_BD = 1;
+ uc_desc->bd[desc->currBd].BD_CTRL.LIFM = 1;
+ uc_desc->bd[desc->currBd].BD_CTRL.DESC_EN = 1;
+}
+
+void wait_engine(pic32mz_desc *desc, char *hash, int hash_sz)
+{
+ int i;
+ pic32mz_desc *uc_desc = KVA0_TO_KVA1(desc);
+ unsigned int engineRunning;
+
+ do {
+ engineRunning = 0;
+ for (i = 0; i < PIC32MZ_MAX_BD; i++) {
+ engineRunning = engineRunning || uc_desc->bd[i].BD_CTRL.DESC_EN;
+ }
+ } while (engineRunning);
+
+#if PIC32_NO_OUT_SWAP
+ /* swap bytes */
+ ByteReverseWords((word32*)hash, (word32*)KVA0_TO_KVA1(hash), hash_sz);
+#else
+ /* copy output - hardware already swapped */
+ XMEMCPY(hash, KVA0_TO_KVA1(hash), hash_sz);
+#endif
+
+ wolfSSL_CryptHwMutexUnLock();
+}
+
+#endif /* WOLFSSL_PIC32MZ_LARGE_HASH */
+
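+/* One-shot hash through the Crypto Engine. Example usage (a sketch;
+ * SHA256_DIGEST_SIZE is the standard wolfSSL macro and the output buffer
+ * must be word32 aligned, which the array below guarantees):
+ *
+ * word32 digest[SHA256_DIGEST_SIZE / sizeof(word32)];
+ * int ret = wc_Pic32Hash(data, dataLen, digest, SHA256_DIGEST_SIZE,
+ * PIC32_ALGO_SHA256);
+ */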
+int wc_Pic32Hash(const byte* in, int inLen, word32* out, int outLen, int algo)
+{
+ return Pic32Crypto(in, inLen, out, outLen, PIC32_ENCRYPTION, algo, 0,
+ NULL, 0, NULL, 0);
+}
+
+int wc_Pic32HashCopy(hashUpdCache* src, hashUpdCache* dst)
+{
+ /* mark destination as a copy, so cache->buf is not freed */
+ if (dst) {
+ dst->isCopy = 1;
+ }
+ return 0;
+}
+
+static int wc_Pic32HashUpdate(hashUpdCache* cache, byte* stdBuf, int stdBufLen,
+ word32* digest, int digestSz, const byte* data, int len, int algo, void* heap)
+{
+ int ret = 0;
+ word32 newLenUpd, newLenPad, padRemain;
+ byte* newBuf;
+ int isNewBuf = 0;
+
+#ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+ /* if final length is set then pass straight to hardware */
+ if (cache->finalLen) {
+ if (cache->bufLen == 0) {
+ reset_engine(&gLHDesc, algo);
+ gLHDesc.msgSize = cache->finalLen;
+ }
+ update_engine(&gLHDesc, data, len, digest);
+ cache->bufLen += len; /* track progress for blockType */
+ return 0;
+ }
+#endif
+
+ /* cache updates */
+ /* calculate new len */
+ newLenUpd = cache->updLen + len;
+
+ /* calculate padded len - pad buffer to a 64-byte multiple for hardware */
+ newLenPad = newLenUpd;
+ padRemain = (newLenUpd % PIC32_BLOCKSIZE_HASH);
+ if (padRemain != 0) {
+ newLenPad += (PIC32_BLOCKSIZE_HASH - padRemain);
+ }
+
+ /* determine buffer source */
+ if (newLenPad <= stdBufLen) {
+ /* use standard buffer */
+ newBuf = stdBuf;
+ }
+ else if (newLenPad > cache->bufLen) {
+ /* alloc buffer */
+ newBuf = (byte*)XMALLOC(newLenPad, heap, DYNAMIC_TYPE_HASH_TMP);
+ if (newBuf == NULL) {
+ if (cache->buf != stdBuf && !cache->isCopy) {
+ XFREE(cache->buf, heap, DYNAMIC_TYPE_HASH_TMP);
+ cache->buf = NULL;
+ cache->updLen = cache->bufLen = 0;
+ }
+ return MEMORY_E;
+ }
+ isNewBuf = 1;
+ cache->isCopy = 0; /* no longer using copy buffer */
+ }
+ else {
+ /* use existing buffer */
+ newBuf = cache->buf;
+ }
+ if (cache->buf && cache->updLen > 0) {
+ XMEMCPY(newBuf, cache->buf, cache->updLen);
+ if (isNewBuf && cache->buf != stdBuf) {
+ XFREE(cache->buf, heap, DYNAMIC_TYPE_HASH_TMP);
+ cache->buf = NULL;
+ }
+ }
+ XMEMCPY(newBuf + cache->updLen, data, len);
+
+ cache->buf = newBuf;
+ cache->updLen = newLenUpd;
+ cache->bufLen = newLenPad;
+
+ return ret;
+}
+
+static int wc_Pic32HashFinal(hashUpdCache* cache, byte* stdBuf,
+ word32* digest, byte* hash, int digestSz, int algo, void* heap)
+{
+ int ret = 0;
+
+ /* if room add the pad */
+ if (cache->buf && cache->updLen < cache->bufLen) {
+ cache->buf[cache->updLen] = 0x80;
+ }
+
+#ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+ if (cache->finalLen) {
+ start_engine(&gLHDesc);
+ wait_engine(&gLHDesc, (char*)digest, digestSz);
+ XMEMCPY(hash, digest, digestSz);
+ cache->finalLen = 0;
+ }
+ else
+#endif
+ {
+ if (cache->updLen == 0) {
+ /* handle empty input */
+ switch (algo) {
+ case PIC32_ALGO_SHA256: {
+ const char* sha256EmptyHash =
+ "\xe3\xb0\xc4\x42\x98\xfc\x1c\x14\x9a\xfb\xf4\xc8\x99\x6f\xb9"
+ "\x24\x27\xae\x41\xe4\x64\x9b\x93\x4c\xa4\x95\x99\x1b\x78\x52"
+ "\xb8\x55";
+ XMEMCPY(hash, sha256EmptyHash, digestSz);
+ break;
+ }
+ case PIC32_ALGO_SHA1: {
+ const char* shaEmptyHash =
+ "\xda\x39\xa3\xee\x5e\x6b\x4b\x0d\x32\x55\xbf\xef\x95\x60\x18"
+ "\x90\xaf\xd8\x07\x09";
+ XMEMCPY(hash, shaEmptyHash, digestSz);
+ break;
+ }
+ case PIC32_ALGO_MD5: {
+ const char* md5EmptyHash =
+ "\xd4\x1d\x8c\xd9\x8f\x00\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42"
+ "\x7e";
+ XMEMCPY(hash, md5EmptyHash, digestSz);
+ break;
+ }
+ } /* switch */
+ }
+ else {
+ ret = wc_Pic32Hash(cache->buf, cache->updLen, digest, digestSz, algo);
+ if (ret == 0) {
+ XMEMCPY(hash, digest, digestSz);
+ }
+ }
+
+ if (cache->buf && cache->buf != stdBuf && !cache->isCopy) {
+ XFREE(cache->buf, heap, DYNAMIC_TYPE_HASH_TMP);
+ cache->buf = NULL;
+ }
+ }
+
+ cache->buf = NULL;
+ cache->bufLen = cache->updLen = 0;
+
+ return ret;
+}
+
+static void wc_Pic32HashFree(hashUpdCache* cache, void* heap)
+{
+ if (cache && cache->buf && !cache->isCopy) {
+ XFREE(cache->buf, heap, DYNAMIC_TYPE_HASH_TMP);
+ cache->buf = NULL;
+ }
+}
+
+/* APIs for compatibility with Harmony wrappers - not used */
+#ifndef NO_MD5
+ int wc_InitMd5_ex(wc_Md5* md5, void* heap, int devId)
+ {
+ if (md5 == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMSET(md5, 0, sizeof(wc_Md5));
+ md5->heap = heap;
+ (void)devId;
+ return 0;
+ }
+ int wc_Md5Update(wc_Md5* md5, const byte* data, word32 len)
+ {
+ if (md5 == NULL || (data == NULL && len > 0))
+ return BAD_FUNC_ARG;
+ return wc_Pic32HashUpdate(&md5->cache, (byte*)md5->buffer,
+ sizeof(md5->buffer), md5->digest, MD5_DIGEST_SIZE,
+ data, len, PIC32_ALGO_MD5, md5->heap);
+ }
+ int wc_Md5Final(wc_Md5* md5, byte* hash)
+ {
+ int ret;
+
+ if (md5 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
+ ret = wc_Pic32HashFinal(&md5->cache, (byte*)md5->buffer,
+ md5->digest, hash, MD5_DIGEST_SIZE,
+ PIC32_ALGO_MD5, md5->heap);
+
+ wc_InitMd5_ex(md5, md5->heap, INVALID_DEVID); /* reset state */
+
+ return ret;
+ }
+ void wc_Md5SizeSet(wc_Md5* md5, word32 len)
+ {
+ if (md5) {
+ #ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+ md5->cache.finalLen = len;
+ #else
+ (void)len;
+ #endif
+ }
+ }
+ void wc_Md5Pic32Free(wc_Md5* md5)
+ {
+ if (md5) {
+ wc_Pic32HashFree(&md5->cache, md5->heap);
+ }
+ }
+#endif /* !NO_MD5 */
+#ifndef NO_SHA
+ int wc_InitSha_ex(wc_Sha* sha, void* heap, int devId)
+ {
+ if (sha == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMSET(sha, 0, sizeof(wc_Sha));
+ sha->heap = heap;
+ (void)devId;
+ return 0;
+ }
+ int wc_ShaUpdate(wc_Sha* sha, const byte* data, word32 len)
+ {
+ if (sha == NULL || (data == NULL && len > 0))
+ return BAD_FUNC_ARG;
+ return wc_Pic32HashUpdate(&sha->cache, (byte*)sha->buffer,
+ sizeof(sha->buffer), sha->digest, SHA_DIGEST_SIZE,
+ data, len, PIC32_ALGO_SHA1, sha->heap);
+ }
+ int wc_ShaFinal(wc_Sha* sha, byte* hash)
+ {
+ int ret;
+
+ if (sha == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
+ ret = wc_Pic32HashFinal(&sha->cache, (byte*)sha->buffer,
+ sha->digest, hash, SHA_DIGEST_SIZE,
+ PIC32_ALGO_SHA1, sha->heap);
+
+ wc_InitSha_ex(sha, sha->heap, INVALID_DEVID); /* reset state */
+
+ return ret;
+ }
+ void wc_ShaSizeSet(wc_Sha* sha, word32 len)
+ {
+ if (sha) {
+ #ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+ sha->cache.finalLen = len;
+ #else
+ (void)len;
+ #endif
+ }
+ }
+ void wc_ShaPic32Free(wc_Sha* sha)
+ {
+ if (sha) {
+ wc_Pic32HashFree(&sha->cache, sha->heap);
+ }
+ }
+#endif /* !NO_SHA */
+#ifndef NO_SHA256
+ int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
+ {
+ if (sha256 == NULL)
+ return BAD_FUNC_ARG;
+
+ XMEMSET(sha256, 0, sizeof(wc_Sha256));
+ sha256->heap = heap;
+ (void)devId;
+ return 0;
+ }
+ int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+ {
+ if (sha256 == NULL || (data == NULL && len > 0))
+ return BAD_FUNC_ARG;
+ return wc_Pic32HashUpdate(&sha256->cache, (byte*)sha256->buffer,
+ sizeof(sha256->buffer), sha256->digest, SHA256_DIGEST_SIZE,
+ data, len, PIC32_ALGO_SHA256, sha256->heap);
+ }
+ int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
+ {
+ int ret;
+
+ if (sha256 == NULL || hash == NULL)
+ return BAD_FUNC_ARG;
+
+ ret = wc_Pic32HashFinal(&sha256->cache, (byte*)sha256->buffer,
+ sha256->digest, hash, SHA256_DIGEST_SIZE,
+ PIC32_ALGO_SHA256, sha256->heap);
+
+ wc_InitSha256_ex(sha256, sha256->heap, INVALID_DEVID); /* reset state */
+
+ return ret;
+ }
+ void wc_Sha256SizeSet(wc_Sha256* sha256, word32 len)
+ {
+ if (sha256) {
+ #ifdef WOLFSSL_PIC32MZ_LARGE_HASH
+ sha256->cache.finalLen = len;
+ #else
+ (void)len;
+ #endif
+ }
+ }
+ void wc_Sha256Pic32Free(wc_Sha256* sha256)
+ {
+ if (sha256) {
+ wc_Pic32HashFree(&sha256->cache, sha256->heap);
+ }
+ }
+#endif /* !NO_SHA256 */
+#endif /* WOLFSSL_PIC32MZ_HASH */
+
+
+#ifdef WOLFSSL_PIC32MZ_CRYPT
+#if !defined(NO_AES)
+ int wc_Pic32AesCrypt(word32 *key, int keyLen, word32 *iv, int ivLen,
+ byte* out, const byte* in, word32 sz,
+ int dir, int algo, int cryptoalgo)
+ {
+ return Pic32Crypto(in, sz, (word32*)out, sz, dir, algo, cryptoalgo,
+ key, keyLen, iv, ivLen);
+ }
+#endif /* !NO_AES */
+
+#ifndef NO_DES3
+ int wc_Pic32DesCrypt(word32 *key, int keyLen, word32 *iv, int ivLen,
+ byte* out, const byte* in, word32 sz,
+ int dir, int algo, int cryptoalgo)
+ {
+ return Pic32Crypto(in, sz, (word32*)out, sz, dir, algo, cryptoalgo,
+ key, keyLen, iv, ivLen);
+ }
+#endif /* !NO_DES3 */
+#endif /* WOLFSSL_PIC32MZ_CRYPT */
+
+#endif /* WOLFSSL_MICROCHIP_PIC32MZ */
diff --git a/wolfcrypt/src/port/st/README.md b/wolfcrypt/src/port/st/README.md
new file mode 100644
index 0000000..011dd90
--- /dev/null
+++ b/wolfcrypt/src/port/st/README.md
@@ -0,0 +1,132 @@
+# ST Ports
+
+Support for the STM32 L4, F1, F2, F4 and F7 on-board crypto hardware acceleration for symmetric AES (ECB/CBC/CTR/GCM) and MD5/SHA1/SHA224/SHA256 hashing.
+
+Support for the STSAFE-A100 crypto co-processor, attached via I2C, for ECC with the NIST and Brainpool 256-bit and 384-bit curves. This requires the STSAFE SDK, including the wolfSSL stsafe_interface.c/.h files; please contact ST for these.
+
+
+For details see our [wolfSSL ST](https://www.wolfssl.com/docs/stm32/) page.
+
+
+## STM32 Symmetric Acceleration
+
+Both the STM32 CubeMX HAL and the Standard Peripheral Library are supported.
+
+### Building
+
+To enable support define one of the following:
+
+```
+#define WOLFSSL_STM32L4
+#define WOLFSSL_STM32F1
+#define WOLFSSL_STM32F2
+#define WOLFSSL_STM32F4
+#define WOLFSSL_STM32F7
+```
+
+To use CubeMX, define `WOLFSSL_STM32_CUBEMX`; otherwise the Standard Peripheral Library is used.
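+
+For example, a CubeMX-based STM32F4 project might add the following to its `user_settings.h` (a sketch; pick the defines that match your part and SDK):
+
+```
+#define WOLFSSL_STM32F4
+#define WOLFSSL_STM32_CUBEMX
+```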
+
+To disable portions of the hardware acceleration you can optionally define:
+
+```
+#define NO_STM32_RNG
+#define NO_STM32_CRYPTO
+#define NO_STM32_HASH
+```
+
+### Coding
+
+In your application you must include `<wolfssl/wolfcrypt/settings.h>` before any other wolfSSL headers. If building the sources directly, we recommend defining `WOLFSSL_USER_SETTINGS` and adding your own `user_settings.h` file. A good reference is `IDE/GCC-ARM/Header/user_settings.h`.
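+
+A minimal include order for an application source file (a sketch; the feature headers depend on your configuration):
+
+```
+#include <wolfssl/wolfcrypt/settings.h> /* must come first */
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+```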
+
+
+### Benchmarks
+
+See our [benchmarks](https://www.wolfssl.com/docs/benchmarks/) on the wolfSSL website.
+
+
+
+## STSAFE-A100 ECC Acceleration
+
+Using the wolfSSL PK callbacks and the reference STSAFE APIs, we support ECC-only cipher suites such as ECDHE-ECDSA-AES128-SHA256 for TLS clients and servers.
+
+At the wolfCrypt level we also support the native ECC `wc_ecc_*` API using the STSAFE-A100.
+
+### Building
+
+`./configure --enable-pkcallbacks CFLAGS="-DWOLFSSL_STSAFEA100"`
+
+or
+
+```
+#define HAVE_PK_CALLBACKS
+#define WOLFSSL_STSAFEA100
+```
+
+
+### Coding
+
+Set up the PK callbacks for TLS using:
+
+```
+/* Setup PK Callbacks for STSAFE-A100 */
+WOLFSSL_CTX* ctx;
+wolfSSL_CTX_SetEccKeyGenCb(ctx, SSL_STSAFE_CreateKeyCb);
+wolfSSL_CTX_SetEccSignCb(ctx, SSL_STSAFE_SignCertificateCb);
+wolfSSL_CTX_SetEccVerifyCb(ctx, SSL_STSAFE_VerifyPeerCertCb);
+wolfSSL_CTX_SetEccSharedSecretCb(ctx, SSL_STSAFE_SharedSecretCb);
+wolfSSL_CTX_SetDevId(ctx, 0); /* enables wolfCrypt `wc_ecc_*` ST-Safe use */
+```
+
+The reference STSAFE-A100 PK callback functions are located in the `wolfcrypt/src/port/st/stsafe.c` file.
+
+Adding a custom context to the callbacks:
+
+```
+/* Setup PK Callbacks context */
+WOLFSSL* ssl;
+void* myOwnCtx;
+wolfSSL_SetEccKeyGenCtx(ssl, myOwnCtx);
+wolfSSL_SetEccVerifyCtx(ssl, myOwnCtx);
+wolfSSL_SetEccSignCtx(ssl, myOwnCtx);
+wolfSSL_SetEccSharedSecretCtx(ssl, myOwnCtx);
+```
+
+### Benchmarks and Memory Use
+
+Software-only implementation (STM32L4 at 120 MHz, Cortex-M4, Fast Math):
+
+```
+ECDHE 256 key gen SW 4 ops took 1.278 sec, avg 319.500 ms, 3.130 ops/sec
+ECDHE 256 agree SW 4 ops took 1.306 sec, avg 326.500 ms, 3.063 ops/sec
+ECDSA 256 sign SW 4 ops took 1.298 sec, avg 324.500 ms, 3.082 ops/sec
+ECDSA 256 verify SW 2 ops took 1.283 sec, avg 641.500 ms, 1.559 ops/sec
+```
+
+Memory Use:
+
+```
+Peak Stack: 18456
+Peak Heap: 2640
+Total: 21096
+```
+
+
+STSAFE-A100 acceleration:
+
+```
+ECDHE 256 key gen HW 8 ops took 1.008 sec, avg 126.000 ms, 7.937 ops/sec
+ECDHE 256 agree HW 6 ops took 1.051 sec, avg 175.167 ms, 5.709 ops/sec
+ECDSA 256 sign HW 14 ops took 1.161 sec, avg 82.929 ms, 12.059 ops/sec
+ECDSA 256 verify HW 8 ops took 1.184 sec, avg 148.000 ms, 6.757 ops/sec
+```
+
+Memory Use:
+
+```
+Peak Stack: 9592
+Peak Heap: 170
+Total: 9762
+```
+
+
+## Support
+
+Email us at [[email protected]](mailto:[email protected]).
diff --git a/wolfcrypt/src/port/st/stm32.c b/wolfcrypt/src/port/st/stm32.c
new file mode 100644
index 0000000..b37dbd8
--- /dev/null
+++ b/wolfcrypt/src/port/st/stm32.c
@@ -0,0 +1,879 @@
+/* stm32.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generic STM32 Hashing Function */
+/* Supports CubeMX HAL or Standard Peripheral Library */
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#include <wolfssl/wolfcrypt/port/st/stm32.h>
+#include <wolfssl/wolfcrypt/types.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifndef NO_AES
+ #include <wolfssl/wolfcrypt/aes.h>
+#endif
+
+
+#ifdef STM32_HASH
+
+#ifdef WOLFSSL_STM32L4
+ #define HASH_STR_NBW HASH_STR_NBLW
+#endif
+
+/* User can override STM32_HASH_CLOCK_ENABLE and STM32_HASH_CLOCK_DISABLE */
+#ifndef STM32_HASH_CLOCK_ENABLE
+ static WC_INLINE void wc_Stm32_Hash_Clock_Enable(STM32_HASH_Context* stmCtx)
+ {
+ #ifdef WOLFSSL_STM32_CUBEMX
+ __HAL_RCC_HASH_CLK_ENABLE();
+ #else
+ RCC_AHB2PeriphClockCmd(RCC_AHB2Periph_HASH, ENABLE);
+ #endif
+ (void)stmCtx;
+ }
+ #define STM32_HASH_CLOCK_ENABLE(ctx) wc_Stm32_Hash_Clock_Enable(ctx)
+#endif
+
+#ifndef STM32_HASH_CLOCK_DISABLE
+ static WC_INLINE void wc_Stm32_Hash_Clock_Disable(STM32_HASH_Context* stmCtx)
+ {
+ #ifdef WOLFSSL_STM32_CUBEMX
+ __HAL_RCC_HASH_CLK_DISABLE();
+ #else
+ RCC_AHB2PeriphClockCmd(RCC_AHB2Periph_HASH, DISABLE);
+ #endif
+ (void)stmCtx;
+ }
+ #define STM32_HASH_CLOCK_DISABLE(ctx) wc_Stm32_Hash_Clock_Disable(ctx)
+#endif
+
+/* STM32 Port Internal Functions */
+static WC_INLINE void wc_Stm32_Hash_SaveContext(STM32_HASH_Context* ctx)
+{
+ int i;
+
+ /* save context registers */
+ ctx->HASH_IMR = HASH->IMR;
+ ctx->HASH_STR = HASH->STR;
+ ctx->HASH_CR = HASH->CR;
+ for (i=0; i<HASH_CR_SIZE; i++) {
+ ctx->HASH_CSR[i] = HASH->CSR[i];
+ }
+}
+
+static WC_INLINE int wc_Stm32_Hash_RestoreContext(STM32_HASH_Context* ctx)
+{
+ int i;
+
+ if (ctx->HASH_CR != 0) {
+ /* restore context registers */
+ HASH->IMR = ctx->HASH_IMR;
+ HASH->STR = ctx->HASH_STR;
+ HASH->CR = ctx->HASH_CR;
+
+ /* Initialize the hash processor */
+ HASH->CR |= HASH_CR_INIT;
+
+ /* continue restoring context registers */
+ for (i=0; i<HASH_CR_SIZE; i++) {
+ HASH->CSR[i] = ctx->HASH_CSR[i];
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static WC_INLINE void wc_Stm32_Hash_GetDigest(byte* hash, int digestSize)
+{
+ word32 digest[HASH_MAX_DIGEST/sizeof(word32)];
+
+ /* get digest result */
+ digest[0] = HASH->HR[0];
+ digest[1] = HASH->HR[1];
+ digest[2] = HASH->HR[2];
+ digest[3] = HASH->HR[3];
+ if (digestSize >= 20) {
+ digest[4] = HASH->HR[4];
+ #ifdef HASH_DIGEST
+ if (digestSize >= 28) {
+ digest[5] = HASH_DIGEST->HR[5];
+ digest[6] = HASH_DIGEST->HR[6];
+ if (digestSize == 32)
+ digest[7] = HASH_DIGEST->HR[7];
+ }
+ #endif
+ }
+
+ ByteReverseWords(digest, digest, digestSize);
+
+ XMEMCPY(hash, digest, digestSize);
+}
+
+
+/* STM32 Port Exposed Functions */
+static WC_INLINE int wc_Stm32_Hash_WaitDone(void)
+{
+ /* wait until hash hardware is not busy */
+ int timeout = 0;
+ while ((HASH->SR & HASH_SR_BUSY) && ++timeout < STM32_HASH_TIMEOUT) {
+
+ }
+ /* verify timeout did not occur */
+ if (timeout >= STM32_HASH_TIMEOUT) {
+ return WC_TIMEOUT_E;
+ }
+ return 0;
+}
+
+
+void wc_Stm32_Hash_Init(STM32_HASH_Context* stmCtx)
+{
+ /* clear context */
+ XMEMSET(stmCtx, 0, sizeof(STM32_HASH_Context));
+}
+
+int wc_Stm32_Hash_Update(STM32_HASH_Context* stmCtx, word32 algo,
+ const byte* data, int len)
+{
+ int ret = 0;
+ byte* local = (byte*)stmCtx->buffer;
+ int wroteToFifo = 0;
+
+ /* check that internal buffLen is valid */
+ if (stmCtx->buffLen >= STM32_HASH_REG_SIZE) {
+ return BUFFER_E;
+ }
+
+ /* turn on hash clock */
+ STM32_HASH_CLOCK_ENABLE(stmCtx);
+
+ /* restore hash context or init as new hash */
+ if (wc_Stm32_Hash_RestoreContext(stmCtx) == 0) {
+ /* reset the control register */
+ HASH->CR &= ~(HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE);
+
+ /* configure algorithm, mode and data type */
+ HASH->CR |= (algo | HASH_ALGOMODE_HASH | HASH_DATATYPE_8B);
+
+ /* reset HASH processor */
+ HASH->CR |= HASH_CR_INIT;
+ }
+
+ /* write 4 bytes at a time into the FIFO */
+ while (len) {
+ word32 add = min(len, STM32_HASH_REG_SIZE - stmCtx->buffLen);
+ XMEMCPY(&local[stmCtx->buffLen], data, add);
+
+ stmCtx->buffLen += add;
+ data += add;
+ len -= add;
+
+ if (stmCtx->buffLen == STM32_HASH_REG_SIZE) {
+ wroteToFifo = 1;
+ HASH->DIN = *(word32*)stmCtx->buffer;
+
+ stmCtx->loLen += STM32_HASH_REG_SIZE;
+ stmCtx->buffLen = 0;
+ }
+ }
+
+ if (wroteToFifo) {
+ /* save hash state for next operation */
+ wc_Stm32_Hash_SaveContext(stmCtx);
+ }
+
+ /* turn off hash clock */
+ STM32_HASH_CLOCK_DISABLE(stmCtx);
+
+ return ret;
+}
+
+int wc_Stm32_Hash_Final(STM32_HASH_Context* stmCtx, word32 algo,
+ byte* hash, int digestSize)
+{
+ int ret = 0;
+ word32 nbvalidbitsdata = 0;
+
+ /* turn on hash clock */
+ STM32_HASH_CLOCK_ENABLE(stmCtx);
+
+ /* restore hash state */
+ wc_Stm32_Hash_RestoreContext(stmCtx);
+
+ /* finish reading any trailing bytes into FIFO */
+ if (stmCtx->buffLen > 0) {
+ HASH->DIN = *(word32*)stmCtx->buffer;
+ stmCtx->loLen += stmCtx->buffLen;
+ }
+
+ /* calculate number of valid bits in last word */
+ nbvalidbitsdata = 8 * (stmCtx->loLen % STM32_HASH_REG_SIZE);
+ HASH->STR &= ~HASH_STR_NBW;
+ HASH->STR |= nbvalidbitsdata;
+
+ /* start hash processor */
+ HASH->STR |= HASH_STR_DCAL;
+
+ /* wait for hash done */
+ ret = wc_Stm32_Hash_WaitDone();
+ if (ret == 0) {
+ /* read message digest */
+ wc_Stm32_Hash_GetDigest(hash, digestSize);
+ }
+
+ /* turn off hash clock */
+ STM32_HASH_CLOCK_DISABLE(stmCtx);
+
+ return ret;
+}
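+
+/* Example flow (a sketch): how a SHA-256 wrapper could drive these helpers.
+ * The algorithm selector constant comes from the HAL/SPL in use, e.g.
+ * HASH_ALGOSELECTION_SHA256 (CubeMX HAL) or HASH_AlgoSelection_SHA256 (SPL):
+ *
+ * STM32_HASH_Context ctx;
+ * byte digest[WC_SHA256_DIGEST_SIZE];
+ * wc_Stm32_Hash_Init(&ctx);
+ * wc_Stm32_Hash_Update(&ctx, HASH_ALGOSELECTION_SHA256, data, len);
+ * wc_Stm32_Hash_Final(&ctx, HASH_ALGOSELECTION_SHA256, digest,
+ * WC_SHA256_DIGEST_SIZE);
+ */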
+
+#endif /* STM32_HASH */
+
+
+#ifdef STM32_CRYPTO
+
+#ifndef NO_AES
+#ifdef WOLFSSL_STM32_CUBEMX
+int wc_Stm32_Aes_Init(Aes* aes, CRYP_HandleTypeDef* hcryp)
+{
+ int ret;
+ word32 keySize;
+
+ ret = wc_AesGetKeySize(aes, &keySize);
+ if (ret != 0)
+ return ret;
+
+ XMEMSET(hcryp, 0, sizeof(CRYP_HandleTypeDef));
+ switch (keySize) {
+ case 16: /* 128-bit key */
+ hcryp->Init.KeySize = CRYP_KEYSIZE_128B;
+ break;
+ #ifdef CRYP_KEYSIZE_192B
+ case 24: /* 192-bit key */
+ hcryp->Init.KeySize = CRYP_KEYSIZE_192B;
+ break;
+ #endif
+ case 32: /* 256-bit key */
+ hcryp->Init.KeySize = CRYP_KEYSIZE_256B;
+ break;
+ default:
+ break;
+ }
+ hcryp->Instance = CRYP;
+ hcryp->Init.DataType = CRYP_DATATYPE_8B;
+ hcryp->Init.pKey = (STM_CRYPT_TYPE*)aes->key;
+#ifdef STM32_HAL_V2
+ hcryp->Init.DataWidthUnit = CRYP_DATAWIDTHUNIT_BYTE;
+#endif
+
+ return 0;
+}
+
+#else /* STD_PERI_LIB */
+
+int wc_Stm32_Aes_Init(Aes* aes, CRYP_InitTypeDef* cryptInit,
+ CRYP_KeyInitTypeDef* keyInit)
+{
+ int ret;
+ word32 keySize;
+ word32* aes_key;
+
+ ret = wc_AesGetKeySize(aes, &keySize);
+ if (ret != 0)
+ return ret;
+
+ aes_key = aes->key;
+
+ /* crypto structure initialization */
+ CRYP_KeyStructInit(keyInit);
+ CRYP_StructInit(cryptInit);
+
+ /* load key into correct registers */
+ switch (keySize) {
+ case 16: /* 128-bit key */
+ cryptInit->CRYP_KeySize = CRYP_KeySize_128b;
+ keyInit->CRYP_Key2Left = aes_key[0];
+ keyInit->CRYP_Key2Right = aes_key[1];
+ keyInit->CRYP_Key3Left = aes_key[2];
+ keyInit->CRYP_Key3Right = aes_key[3];
+ break;
+
+ case 24: /* 192-bit key */
+ cryptInit->CRYP_KeySize = CRYP_KeySize_192b;
+ keyInit->CRYP_Key1Left = aes_key[0];
+ keyInit->CRYP_Key1Right = aes_key[1];
+ keyInit->CRYP_Key2Left = aes_key[2];
+ keyInit->CRYP_Key2Right = aes_key[3];
+ keyInit->CRYP_Key3Left = aes_key[4];
+ keyInit->CRYP_Key3Right = aes_key[5];
+ break;
+
+ case 32: /* 256-bit key */
+ cryptInit->CRYP_KeySize = CRYP_KeySize_256b;
+ keyInit->CRYP_Key0Left = aes_key[0];
+ keyInit->CRYP_Key0Right = aes_key[1];
+ keyInit->CRYP_Key1Left = aes_key[2];
+ keyInit->CRYP_Key1Right = aes_key[3];
+ keyInit->CRYP_Key2Left = aes_key[4];
+ keyInit->CRYP_Key2Right = aes_key[5];
+ keyInit->CRYP_Key3Left = aes_key[6];
+ keyInit->CRYP_Key3Right = aes_key[7];
+ break;
+
+ default:
+ break;
+ }
+ cryptInit->CRYP_DataType = CRYP_DataType_8b;
+
+ return 0;
+}
+#endif /* WOLFSSL_STM32_CUBEMX */
+#endif /* !NO_AES */
+#endif /* STM32_CRYPTO */
+
+#ifdef WOLFSSL_STM32_PKA
+#include <stdint.h>
+#include <stm32wbxx_hal_conf.h>
+#include <stm32wbxx_hal_pka.h>
+
+extern PKA_HandleTypeDef hpka;
+
+/* Reverse array in memory (in place) */
+#ifdef HAVE_ECC
+#include <wolfssl/wolfcrypt/ecc.h>
+
+/* Convert an mp_int to an STM32 PKA HAL integer: an array of bytes of size sz.
+ * If the mp_int has fewer bytes than sz, zero bytes are added at the most
+ * significant positions. For example, when the modulus is 32 bytes (P-256)
+ * and the mp_int has only 31 bytes, one leading zero is added so that the
+ * result array has 32 bytes, the same size as the modulus (sz).
+ */
+static int stm32_get_from_mp_int(uint8_t *dst, mp_int *a, int sz)
+{
+ int res;
+ int szbin;
+ int offset;
+
+ if (!a || !dst || (sz < 0))
+ return -1;
+
+ /* check how many bytes are in the mp_int */
+ szbin = mp_unsigned_bin_size(a);
+ if ((szbin < 0) || (szbin > sz))
+ return -1;
+
+ /* compute offset from dst */
+ offset = sz - szbin;
+ if (offset < 0)
+ offset = 0;
+ if (offset > sz)
+ offset = sz;
+
+ /* add leading zeroes */
+ if (offset)
+ XMEMSET(dst, 0, offset);
+
+ /* convert mp_int to array of bytes */
+ res = mp_to_unsigned_bin(a, dst + offset);
+ return res;
+}
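+
+/* Usage sketch (illustrative only, not built): zero-padding a small
+ * mp_int into a fixed 32-byte buffer so it lines up with the P-256
+ * modulus. The scalar value here is hypothetical. */
+#if 0
+static void example_mp_to_fixed_size(void)
+{
+    uint8_t kbuf[32];
+    mp_int  k;
+
+    mp_init(&k);
+    mp_set(&k, 5); /* exports as a single byte */
+    /* result: kbuf[0..30] == 0x00 and kbuf[31] == 0x05 */
+    if (stm32_get_from_mp_int(kbuf, &k, (int)sizeof(kbuf)) != MP_OKAY) {
+        /* handle conversion error */
+    }
+    mp_clear(&k);
+}
+#endif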
+
+/* ECC curve specs, msbyte at lowest address (big-endian), for direct use by the STM32 PKA HAL functions */
+#if defined(HAVE_ECC192) || defined(HAVE_ALL_CURVES)
+#define ECC192
+#endif
+#if defined(HAVE_ECC224) || defined(HAVE_ALL_CURVES)
+#define ECC224
+#endif
+#if !defined(NO_ECC256) || defined(HAVE_ALL_CURVES)
+#define ECC256
+#endif
+#if defined(HAVE_ECC384) || defined(HAVE_ALL_CURVES)
+#define ECC384
+#endif
+
+/* STM32 PKA supports operands up to 640 bits (80 bytes) */
+#define STM32_MAX_ECC_SIZE (80)
+
+
+/* P-192 */
+#ifdef ECC192
+#define ECC192_KEYSIZE (24)
+static const uint8_t stm32_ecc192_prime[ECC192_KEYSIZE] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+static const uint32_t stm32_ecc192_coef_sign = 1U;
+
+static const uint8_t stm32_ecc192_coef[ECC192_KEYSIZE] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
+
+static const uint8_t stm32_ecc192_pointX[ECC192_KEYSIZE] = {
+ 0x18, 0x8D, 0xA8, 0x0E, 0xB0, 0x30, 0x90, 0xF6,
+ 0x7C, 0xBF, 0x20, 0xEB, 0x43, 0xA1, 0x88, 0x00,
+ 0xF4, 0xFF, 0x0A, 0xFD, 0x82, 0xFF, 0x10, 0x12
+};
+
+const uint8_t stm32_ecc192_pointY[ECC192_KEYSIZE] = {
+ 0x07, 0x19, 0x2B, 0x95, 0xFF, 0xC8, 0xDA, 0x78,
+ 0x63, 0x10, 0x11, 0xED, 0x6B, 0x24, 0xCD, 0xD5,
+ 0x73, 0xF9, 0x77, 0xA1, 0x1E, 0x79, 0x48, 0x11
+};
+
+const uint8_t stm32_ecc192_order[ECC192_KEYSIZE] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x99, 0xDE, 0xF8, 0x36,
+ 0x14, 0x6B, 0xC9, 0xB1, 0xB4, 0xD2, 0x28, 0x31
+};
+const uint32_t stm32_ecc192_cofactor = 1U;
+
+#endif /* ECC192 */
+
+/* P-224 */
+#ifdef ECC224
+#define ECC224_KEYSIZE (28)
+static const uint8_t stm32_ecc224_prime[ECC224_KEYSIZE] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01
+};
+static const uint32_t stm32_ecc224_coef_sign = 1U;
+
+static const uint8_t stm32_ecc224_coef[ECC224_KEYSIZE] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03
+};
+
+static const uint8_t stm32_ecc224_pointX[ECC224_KEYSIZE] = {
+ 0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F,
+ 0x32, 0x13, 0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3,
+ 0x56, 0xC2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xD6,
+ 0x11, 0x5C, 0x1D, 0x21
+};
+
+const uint8_t stm32_ecc224_pointY[ECC224_KEYSIZE] = {
+ 0xBD, 0x37, 0x63, 0x88, 0xB5, 0xF7, 0x23, 0xFB,
+ 0x4C, 0x22, 0xDF, 0xE6, 0xCD, 0x43, 0x75, 0xA0,
+ 0x5A, 0x07, 0x47, 0x64, 0x44, 0xD5, 0x81, 0x99,
+ 0x85, 0x00, 0x7E, 0x34
+};
+
+/* order n of the NIST P-224 group */
+const uint8_t stm32_ecc224_order[ECC224_KEYSIZE] = {
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x16, 0xA2,
+    0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45,
+    0x5C, 0x5C, 0x2A, 0x3D
+};
+const uint32_t stm32_ecc224_cofactor = 1U;
+
+#endif /* ECC224 */
+
+/* P-256 */
+#ifdef ECC256
+#define ECC256_KEYSIZE (32)
+
+static const uint8_t stm32_ecc256_prime[ECC256_KEYSIZE] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+static const uint32_t stm32_ecc256_coef_sign = 1U;
+
+static const uint8_t stm32_ecc256_coef[ECC256_KEYSIZE] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
+
+static const uint8_t stm32_ecc256_pointX[ECC256_KEYSIZE] = {
+ 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47,
+ 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
+ 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
+ 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96
+};
+
+const uint8_t stm32_ecc256_pointY[ECC256_KEYSIZE] = {
+ 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b,
+ 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
+ 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
+ 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5
+};
+
+const uint8_t stm32_ecc256_order[ECC256_KEYSIZE] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xbc, 0xe6, 0xfa, 0xad, 0xa7, 0x17, 0x9e, 0x84,
+ 0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51
+};
+const uint32_t stm32_ecc256_cofactor = 1U;
+
+#endif /* ECC256 */
+
+/* P-384 */
+#ifdef ECC384
+#define ECC384_KEYSIZE (48)
+
+static const uint8_t stm32_ecc384_prime[ECC384_KEYSIZE] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF
+};
+static const uint32_t stm32_ecc384_coef_sign = 1U;
+
+static const uint8_t stm32_ecc384_coef[ECC384_KEYSIZE] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
+
+static const uint8_t stm32_ecc384_pointX[ECC384_KEYSIZE] = {
+ 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37,
+ 0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD, 0x74,
+ 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
+ 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38,
+ 0x55, 0x02, 0xF2, 0x5D, 0xBF, 0x55, 0x29, 0x6C,
+ 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7,
+};
+
+const uint8_t stm32_ecc384_pointY[ECC384_KEYSIZE] = {
+ 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F,
+ 0x5D, 0x9E, 0x98, 0xBF, 0x92, 0x92, 0xDC, 0x29,
+ 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
+ 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0,
+ 0x0A, 0x60, 0xB1, 0xCE, 0x1D, 0x7E, 0x81, 0x9D,
+ 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F,
+};
+
+/* order n of the NIST P-384 group */
+const uint8_t stm32_ecc384_order[ECC384_KEYSIZE] = {
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xC7, 0x63, 0x4D, 0x81, 0xF4, 0x37, 0x2D, 0xDF,
+    0x58, 0x1A, 0x0D, 0xB2, 0x48, 0xB0, 0xA7, 0x7A,
+    0xEC, 0xEC, 0x19, 0x6A, 0xCC, 0xC5, 0x29, 0x73
+};
+const uint32_t stm32_ecc384_cofactor = 1U;
+
+#endif /* ECC384 */
+static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef,
+ const uint32_t **coef_sign, const uint8_t **GenPointX, const uint8_t **GenPointY,
+ const uint8_t **order, int size)
+{
+    switch(size) {
+#ifdef ECC256
+    case 32:
+        *prime = stm32_ecc256_prime;
+        *coef = stm32_ecc256_coef;
+        *GenPointX = stm32_ecc256_pointX;
+        *GenPointY = stm32_ecc256_pointY;
+        *coef_sign = &stm32_ecc256_coef_sign;
+        *order = stm32_ecc256_order;
+        break;
+#endif
+#ifdef ECC224
+ case 28:
+ *prime = stm32_ecc224_prime;
+ *coef = stm32_ecc224_coef;
+ *GenPointX = stm32_ecc224_pointX;
+ *GenPointY = stm32_ecc224_pointY;
+        *coef_sign = &stm32_ecc224_coef_sign;
+        *order = stm32_ecc224_order;
+ break;
+#endif
+#ifdef ECC192
+ case 24:
+ *prime = stm32_ecc192_prime;
+ *coef = stm32_ecc192_coef;
+ *GenPointX = stm32_ecc192_pointX;
+ *GenPointY = stm32_ecc192_pointY;
+        *coef_sign = &stm32_ecc192_coef_sign;
+        *order = stm32_ecc192_order;
+ break;
+#endif
+#ifdef ECC384
+ case 48:
+ *prime = stm32_ecc384_prime;
+ *coef = stm32_ecc384_coef;
+ *GenPointX = stm32_ecc384_pointX;
+ *GenPointY = stm32_ecc384_pointY;
+        *coef_sign = &stm32_ecc384_coef_sign;
+        *order = stm32_ecc384_order;
+ break;
+#endif
+ default:
+ return -1;
+ }
+ return 0;
+}
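+
+/* Usage sketch (illustrative only, not built): a 32-byte modulus selects
+ * the NIST P-256 parameter set defined above. */
+#if 0
+static void example_select_p256_specs(void)
+{
+    const uint8_t  *prime, *coef, *gx, *gy, *order;
+    const uint32_t *coef_sign;
+
+    if (stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gx, &gy,
+                            &order, 32) == 0) {
+        /* prime == stm32_ecc256_prime, order == stm32_ecc256_order, ... */
+    }
+}
+#endif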
+
+
+/**
+  Perform a point multiplication (timing resistant)
+  k        The scalar to multiply by
+  G        The base point
+  R        [out] Destination for kG
+  a        Curve parameter a (unused by this PKA implementation)
+  modulus  The modulus of the field the ECC curve is in
+  map      Boolean whether to map back to affine or not
+           (1 == map, 0 == leave in projective)
+  heap     Heap hint (unused)
+  return   MP_OKAY on success
+*/
+int wc_ecc_mulmod_ex(mp_int *k, ecc_point *G, ecc_point *R, mp_int* a,
+ mp_int *modulus, int map, void* heap)
+{
+ PKA_ECCMulInTypeDef pka_mul;
+ PKA_ECCMulOutTypeDef pka_mul_res;
+ uint8_t size;
+ int szModulus;
+ int szkbin;
+ int status;
+ int res;
+ uint8_t Gxbin[STM32_MAX_ECC_SIZE];
+ uint8_t Gybin[STM32_MAX_ECC_SIZE];
+ uint8_t kbin[STM32_MAX_ECC_SIZE];
+ const uint8_t *prime, *coef, *gen_x, *gen_y, *order;
+ const uint32_t *coef_sign;
+ (void)a;
+ (void)heap;
+ XMEMSET(&pka_mul, 0x00, sizeof(PKA_ECCMulInTypeDef));
+ XMEMSET(&pka_mul_res, 0x00, sizeof(PKA_ECCMulOutTypeDef));
+
+ if (k == NULL || G == NULL || R == NULL || modulus == NULL) {
+ return ECC_BAD_ARG_E;
+ }
+
+ szModulus = mp_unsigned_bin_size(modulus);
+ szkbin = mp_unsigned_bin_size(k);
+
+ res = stm32_get_from_mp_int(kbin, k, szkbin);
+ if (res == MP_OKAY)
+ res = stm32_get_from_mp_int(Gxbin, G->x, szModulus);
+ if (res == MP_OKAY)
+ res = stm32_get_from_mp_int(Gybin, G->y, szModulus);
+
+ if (res != MP_OKAY)
+ return res;
+
+ size = (uint8_t)szModulus;
+ /* find STM32_PKA friendly parameters for the selected curve */
+ if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) {
+ return ECC_BAD_ARG_E;
+ }
+ (void)order;
+
+ pka_mul.modulusSize = szModulus;
+ pka_mul.coefSign = *coef_sign;
+ pka_mul.coefA = coef;
+ pka_mul.modulus = prime;
+ pka_mul.pointX = Gxbin;
+ pka_mul.pointY = Gybin;
+ pka_mul.scalarMulSize = size;
+ pka_mul.scalarMul = kbin;
+
+ status = HAL_PKA_ECCMul(&hpka, &pka_mul, HAL_MAX_DELAY);
+ if (status != HAL_OK) {
+ return WC_HW_E;
+ }
+ pka_mul_res.ptX = Gxbin;
+ pka_mul_res.ptY = Gybin;
+ HAL_PKA_ECCMul_GetResult(&hpka, &pka_mul_res);
+ res = mp_read_unsigned_bin(R->x, Gxbin, size);
+ if (res == MP_OKAY) {
+ res = mp_read_unsigned_bin(R->y, Gybin, size);
+#ifndef WOLFSSL_SP_MATH
+        /* if k is negative, the multiplication was done with abs(k),
+         * giving (x, y); flip the sign of y so the result is (x, -y)
+         */
+ R->y->sign = k->sign;
+#endif
+ }
+ if (res == MP_OKAY)
+ res = mp_set(R->z, 1);
+ HAL_PKA_RAMReset(&hpka);
+ return res;
+}
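+
+/* Usage sketch (illustrative only, not built): offloading k*G to the PKA.
+ * Loading of k, G and the curve prime is elided; in practice this entry
+ * point is reached through the generic wc_ecc_* key operations. */
+#if 0
+static void example_pka_point_mul(void)
+{
+    ecc_point *G = wc_ecc_new_point();
+    ecc_point *R = wc_ecc_new_point();
+    mp_int     k, modulus;
+
+    mp_init_multi(&k, &modulus, NULL, NULL, NULL, NULL);
+    /* ... load scalar k, base point G and curve prime into modulus ... */
+    if (wc_ecc_mulmod_ex(&k, G, R, NULL, &modulus, 1, NULL) == MP_OKAY) {
+        /* R now holds k*G in affine coordinates */
+    }
+    mp_clear(&k);
+    mp_clear(&modulus);
+    wc_ecc_del_point(G);
+    wc_ecc_del_point(R);
+}
+#endif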
+
+int stm32_ecc_verify_hash_ex(mp_int *r, mp_int *s, const byte* hash,
+ word32 hashlen, int* res, ecc_key* key)
+{
+ PKA_ECDSAVerifInTypeDef pka_ecc;
+ uint8_t size;
+ int szModulus;
+ int szrbin;
+ int status;
+ uint8_t Rbin[STM32_MAX_ECC_SIZE];
+ uint8_t Sbin[STM32_MAX_ECC_SIZE];
+ uint8_t Qxbin[STM32_MAX_ECC_SIZE];
+ uint8_t Qybin[STM32_MAX_ECC_SIZE];
+ uint8_t Hashbin[STM32_MAX_ECC_SIZE];
+ const uint8_t *prime, *coef, *gen_x, *gen_y, *order;
+ const uint32_t *coef_sign;
+ XMEMSET(&pka_ecc, 0x00, sizeof(PKA_ECDSAVerifInTypeDef));
+
+ if (r == NULL || s == NULL || hash == NULL || res == NULL || key == NULL) {
+ return ECC_BAD_ARG_E;
+ }
+ *res = 0;
+
+ szModulus = mp_unsigned_bin_size(key->pubkey.x);
+ szrbin = mp_unsigned_bin_size(r);
+
+ status = stm32_get_from_mp_int(Rbin, r, szrbin);
+ if (status == MP_OKAY)
+ status = stm32_get_from_mp_int(Sbin, s, szrbin);
+ if (status == MP_OKAY)
+ status = stm32_get_from_mp_int(Qxbin, key->pubkey.x, szModulus);
+ if (status == MP_OKAY)
+ status = stm32_get_from_mp_int(Qybin, key->pubkey.y, szModulus);
+ if (status != MP_OKAY)
+ return status;
+
+ size = (uint8_t)szModulus;
+ /* find parameters for the selected curve */
+ if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) {
+ return ECC_BAD_ARG_E;
+ }
+
+
+ pka_ecc.primeOrderSize = size;
+ pka_ecc.modulusSize = size;
+ pka_ecc.coefSign = *coef_sign;
+ pka_ecc.coef = coef;
+ pka_ecc.modulus = prime;
+ pka_ecc.basePointX = gen_x;
+ pka_ecc.basePointY = gen_y;
+ pka_ecc.primeOrder = order;
+
+ pka_ecc.pPubKeyCurvePtX = Qxbin;
+ pka_ecc.pPubKeyCurvePtY = Qybin;
+ pka_ecc.RSign = Rbin;
+ pka_ecc.SSign = Sbin;
+ XMEMSET(Hashbin, 0, STM32_MAX_ECC_SIZE);
+ XMEMCPY(Hashbin + (size - hashlen), hash, hashlen);
+ pka_ecc.hash = Hashbin;
+
+ status = HAL_PKA_ECDSAVerif(&hpka, &pka_ecc, HAL_MAX_DELAY);
+ if (status != HAL_OK) {
+ HAL_PKA_RAMReset(&hpka);
+ return WC_HW_E;
+ }
+ *res = HAL_PKA_ECDSAVerif_IsValidSignature(&hpka);
+ HAL_PKA_RAMReset(&hpka);
+ return status;
+}
+
+int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng,
+ ecc_key* key, mp_int *r, mp_int *s)
+{
+ PKA_ECDSASignInTypeDef pka_ecc;
+ PKA_ECDSASignOutTypeDef pka_ecc_out;
+ int size;
+ int status;
+ mp_int gen_k;
+ mp_int order_mp;
+ uint8_t Keybin[STM32_MAX_ECC_SIZE];
+ uint8_t Intbin[STM32_MAX_ECC_SIZE];
+ uint8_t Rbin[STM32_MAX_ECC_SIZE];
+ uint8_t Sbin[STM32_MAX_ECC_SIZE];
+ uint8_t Hashbin[STM32_MAX_ECC_SIZE];
+ const uint8_t *prime, *coef, *gen_x, *gen_y, *order;
+ const uint32_t *coef_sign;
+    XMEMSET(&pka_ecc, 0x00, sizeof(PKA_ECDSASignInTypeDef));
+    XMEMSET(&pka_ecc_out, 0x00, sizeof(PKA_ECDSASignOutTypeDef));
+
+ if (r == NULL || s == NULL || hash == NULL || key == NULL) {
+ return ECC_BAD_ARG_E;
+ }
+
+ mp_init(&gen_k);
+ mp_init(&order_mp);
+
+ size = mp_unsigned_bin_size(key->pubkey.x);
+
+    status = stm32_get_from_mp_int(Keybin, &key->k, size);
+    if (status != MP_OKAY) {
+        mp_clear(&gen_k);
+        mp_clear(&order_mp);
+        return status;
+    }
+
+    /* find parameters for the selected curve */
+    if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) {
+        mp_clear(&gen_k);
+        mp_clear(&order_mp);
+        return ECC_BAD_ARG_E;
+    }
+
+ status = mp_read_unsigned_bin(&order_mp, order, size);
+ if (status == MP_OKAY)
+ status = wc_ecc_gen_k(rng, size, &gen_k, &order_mp);
+ if (status == MP_OKAY)
+ status = stm32_get_from_mp_int(Intbin, &gen_k, size);
+    if (status != MP_OKAY) {
+        mp_clear(&gen_k);
+        mp_clear(&order_mp);
+        return status;
+    }
+
+ pka_ecc.primeOrderSize = size;
+ pka_ecc.modulusSize = size;
+ pka_ecc.coefSign = *coef_sign;
+ pka_ecc.coef = coef;
+ pka_ecc.modulus = prime;
+ pka_ecc.basePointX = gen_x;
+ pka_ecc.basePointY = gen_y;
+ pka_ecc.primeOrder = order;
+
+ XMEMSET(Hashbin, 0, STM32_MAX_ECC_SIZE);
+ XMEMCPY(Hashbin + (size - hashlen), hash, hashlen);
+ pka_ecc.hash = Hashbin;
+ pka_ecc.integer = Intbin;
+ pka_ecc.privateKey = Keybin;
+
+ /* Assign R, S static buffers */
+ pka_ecc_out.RSign = Rbin;
+ pka_ecc_out.SSign = Sbin;
+
+ status = HAL_PKA_ECDSASign(&hpka, &pka_ecc, HAL_MAX_DELAY);
+ if (status != HAL_OK) {
+ HAL_PKA_RAMReset(&hpka);
+ return WC_HW_E;
+ }
+ HAL_PKA_ECDSASign_GetResult(&hpka, &pka_ecc_out, NULL);
+ status = mp_read_unsigned_bin(r, pka_ecc_out.RSign, size);
+ if (status == MP_OKAY)
+ status = mp_read_unsigned_bin(s, pka_ecc_out.SSign, size);
+    HAL_PKA_RAMReset(&hpka);
+    mp_clear(&gen_k);
+    mp_clear(&order_mp);
+ return status;
+}
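+
+/* Usage sketch (illustrative only, not built): a sign/verify round trip
+ * for P-256. Assumes an initialized WC_RNG and an ecc_key holding a
+ * P-256 key pair; the digest contents are hypothetical. */
+#if 0
+static void example_pka_sign_verify(WC_RNG* rng, ecc_key* key)
+{
+    byte   hash[32] = { 0 }; /* message digest to sign */
+    mp_int r, s;
+    int    verified = 0;
+
+    mp_init_multi(&r, &s, NULL, NULL, NULL, NULL);
+    if (stm32_ecc_sign_hash_ex(hash, sizeof(hash), rng, key, &r, &s) == 0 &&
+        stm32_ecc_verify_hash_ex(&r, &s, hash, sizeof(hash),
+                                 &verified, key) == 0) {
+        /* verified == 1 for a good signature */
+    }
+    mp_clear(&r);
+    mp_clear(&s);
+}
+#endif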
+
+#endif /* HAVE_ECC */
+#endif /* WOLFSSL_STM32_PKA */
diff --git a/wolfcrypt/src/port/st/stsafe.c b/wolfcrypt/src/port/st/stsafe.c
new file mode 100644
index 0000000..239b159
--- /dev/null
+++ b/wolfcrypt/src/port/st/stsafe.c
@@ -0,0 +1,566 @@
+/* stsafe.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#include <wolfssl/wolfcrypt/port/st/stsafe.h>
+#include <wolfssl/wolfcrypt/logging.h>
+
+#ifndef STSAFE_INTERFACE_PRINTF
+#define STSAFE_INTERFACE_PRINTF(...)
+#endif
+
+#ifdef WOLFSSL_STSAFEA100
+
+int SSL_STSAFE_LoadDeviceCertificate(byte** pRawCertificate,
+ word32* pRawCertificateLen)
+{
+ int err;
+
+ if (pRawCertificate == NULL || pRawCertificateLen == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+#ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("SSL_STSAFE_LoadDeviceCertificate");
+#endif
+
+ /* Try reading device certificate from ST-SAFE Zone 0 */
+ err = stsafe_interface_read_device_certificate_raw(
+ pRawCertificate, (uint32_t*)pRawCertificateLen);
+ if (err == STSAFE_A_OK) {
+ #if 0
+ /* example for loading into WOLFSSL_CTX */
+ err = wolfSSL_CTX_use_certificate_buffer(ctx,
+ *pRawCertificate, *pRawCertificateLen, SSL_FILETYPE_ASN1);
+ if (err != WOLFSSL_SUCCESS) {
+ /* failed */
+ }
+ /* can free now */
+ XFREE(*pRawCertificate, NULL, DYNAMIC_TEMP_BUFFER);
+ *pRawCertificate = NULL;
+ #endif
+ }
+ else {
+ err = WC_HW_E;
+ }
+
+ return err;
+}
+
+#ifdef HAVE_PK_CALLBACKS
+
+/**
+ * \brief Key Gen Callback (used by TLS server)
+ */
+int SSL_STSAFE_CreateKeyCb(WOLFSSL* ssl, ecc_key* key, word32 keySz,
+ int ecc_curve, void* ctx)
+{
+ int err;
+ byte pubKeyRaw[STSAFE_MAX_PUBKEY_RAW_LEN];
+ StSafeA_KeySlotNumber slot;
+ StSafeA_CurveId curve_id;
+
+ (void)ssl;
+ (void)ctx;
+
+#ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("CreateKeyCb: STSAFE");
+#endif
+
+ /* get curve */
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+
+ /* generate new ephemeral key on device */
+ err = stsafe_interface_create_key(&slot, curve_id, (uint8_t*)&pubKeyRaw[0]);
+ if (err != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_create_key error: %d\n", err);
+ #endif
+ err = WC_HW_E;
+ return err;
+ }
+
+ /* load generated public key into key, used by wolfSSL */
+ err = wc_ecc_import_unsigned(key, &pubKeyRaw[0], &pubKeyRaw[keySz],
+ NULL, ecc_curve);
+
+ return err;
+}
+
+/**
+ * \brief Verify Peer Cert Callback.
+ */
+int SSL_STSAFE_VerifyPeerCertCb(WOLFSSL* ssl,
+ const unsigned char* sig, unsigned int sigSz,
+ const unsigned char* hash, unsigned int hashSz,
+ const unsigned char* keyDer, unsigned int keySz,
+ int* result, void* ctx)
+{
+ int err;
+ byte sigRS[STSAFE_MAX_SIG_LEN];
+ byte *r, *s;
+ word32 r_len = STSAFE_MAX_SIG_LEN/2, s_len = STSAFE_MAX_SIG_LEN/2;
+ byte pubKeyX[STSAFE_MAX_PUBKEY_RAW_LEN/2];
+ byte pubKeyY[STSAFE_MAX_PUBKEY_RAW_LEN/2];
+ word32 pubKeyX_len = sizeof(pubKeyX);
+ word32 pubKeyY_len = sizeof(pubKeyY);
+ ecc_key key;
+ word32 inOutIdx = 0;
+ StSafeA_CurveId curve_id;
+ int ecc_curve;
+
+ (void)ssl;
+ (void)ctx;
+
+#ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("VerifyPeerCertCB: STSAFE");
+#endif
+
+ err = wc_ecc_init(&key);
+ if (err != 0) {
+ return err;
+ }
+
+ /* Decode the public key */
+ err = wc_EccPublicKeyDecode(keyDer, &inOutIdx, &key, keySz);
+ if (err == 0) {
+ /* Extract Raw X and Y coordinates of the public key */
+ err = wc_ecc_export_public_raw(&key, pubKeyX, &pubKeyX_len,
+ pubKeyY, &pubKeyY_len);
+ }
+ if (err == 0) {
+ int key_sz;
+
+ /* determine curve */
+ ecc_curve = key.dp->id;
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* Extract R and S from signature */
+ XMEMSET(sigRS, 0, sizeof(sigRS));
+ r = &sigRS[0];
+ s = &sigRS[key_sz];
+ err = wc_ecc_sig_to_rs(sig, sigSz, r, &r_len, s, &s_len);
+ (void)r_len;
+ (void)s_len;
+ }
+
+ if (err == 0) {
+ /* Verify signature */
+ err = stsafe_interface_verify(curve_id, (uint8_t*)hash, sigRS,
+ pubKeyX, pubKeyY, (int32_t*)result);
+ if (err != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_verify error: %d\n", err);
+ #endif
+ err = WC_HW_E;
+ }
+ }
+
+ wc_ecc_free(&key);
+ return err;
+}
+
+/**
+ * \brief Sign Certificate Callback.
+ */
+int SSL_STSAFE_SignCertificateCb(WOLFSSL* ssl, const byte* in,
+ word32 inSz, byte* out, word32* outSz,
+ const byte* key, word32 keySz, void* ctx)
+{
+ int err;
+ byte digest[STSAFE_MAX_KEY_LEN];
+ byte sigRS[STSAFE_MAX_SIG_LEN];
+ byte *r, *s;
+ StSafeA_CurveId curve_id;
+ int key_sz;
+
+ (void)ssl;
+ (void)ctx;
+
+#ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("SignCertificateCb: STSAFE");
+#endif
+
+ curve_id = stsafe_get_curve_mode();
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* Build input digest */
+    if (inSz > (word32)key_sz)
+ inSz = key_sz;
+ XMEMSET(&digest[0], 0, sizeof(digest));
+ XMEMCPY(&digest[key_sz - inSz], in, inSz);
+
+ /* Sign using slot 0: Result is R then S */
+ /* Sign will always use the curve type in slot 0 (the TLS curve needs to match) */
+ XMEMSET(sigRS, 0, sizeof(sigRS));
+ err = stsafe_interface_sign(STSAFE_A_SLOT_0, curve_id, digest, sigRS);
+ if (err != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_sign error: %d\n", err);
+ #endif
+ err = WC_HW_E;
+ return err;
+ }
+
+ /* Convert R and S to signature */
+ r = &sigRS[0];
+ s = &sigRS[key_sz];
+ err = wc_ecc_rs_raw_to_sig((const byte*)r, key_sz, (const byte*)s, key_sz,
+ out, outSz);
+ if (err != 0) {
+ #ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("Error converting RS to Signature");
+ #endif
+ }
+
+ return err;
+}
+
+
+/**
+ * \brief Create pre master secret using peer's public key and self private key.
+ */
+int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, ecc_key* otherKey,
+ unsigned char* pubKeyDer, unsigned int* pubKeySz,
+ unsigned char* out, unsigned int* outlen,
+ int side, void* ctx)
+{
+ int err;
+ byte otherKeyX[STSAFE_MAX_KEY_LEN];
+ byte otherKeyY[STSAFE_MAX_KEY_LEN];
+ word32 otherKeyX_len = sizeof(otherKeyX);
+ word32 otherKeyY_len = sizeof(otherKeyY);
+ byte pubKeyRaw[STSAFE_MAX_PUBKEY_RAW_LEN];
+ StSafeA_KeySlotNumber slot;
+ StSafeA_CurveId curve_id;
+ ecc_key tmpKey;
+ int ecc_curve;
+ int key_sz;
+
+ (void)ssl;
+ (void)ctx;
+
+#ifdef USE_STSAFE_VERBOSE
+ WOLFSSL_MSG("SharedSecretCb: STSAFE");
+#endif
+
+ err = wc_ecc_init(&tmpKey);
+ if (err != 0) {
+ return err;
+ }
+
+ /* set curve */
+ ecc_curve = otherKey->dp->id;
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* for client: create and export public key */
+ if (side == WOLFSSL_CLIENT_END) {
+ /* Export otherKey raw X and Y */
+ err = wc_ecc_export_public_raw(otherKey,
+ &otherKeyX[0], (word32*)&otherKeyX_len,
+ &otherKeyY[0], (word32*)&otherKeyY_len);
+        if (err != 0) {
+            wc_ecc_free(&tmpKey);
+            return err;
+        }
+
+ err = stsafe_interface_create_key(&slot, curve_id, (uint8_t*)&pubKeyRaw[0]);
+ if (err != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_create_key error: %d\n", err);
+ #endif
+            err = WC_HW_E;
+            wc_ecc_free(&tmpKey);
+            return err;
+ }
+
+        /* convert raw unsigned public key to X.963 format for TLS */
+        /* tmpKey was already initialized above; do not re-init or
+         * free it here, it is freed once after the if/else below */
+        err = wc_ecc_import_unsigned(&tmpKey, &pubKeyRaw[0], &pubKeyRaw[key_sz],
+            NULL, ecc_curve);
+        if (err == 0) {
+            err = wc_ecc_export_x963(&tmpKey, pubKeyDer, pubKeySz);
+        }
+ }
+ /* for server: import public key */
+ else if (side == WOLFSSL_SERVER_END) {
+ /* import peer's key and export as raw unsigned for hardware */
+ err = wc_ecc_import_x963_ex(pubKeyDer, *pubKeySz, &tmpKey, ecc_curve);
+ if (err == 0) {
+ err = wc_ecc_export_public_raw(&tmpKey, otherKeyX, &otherKeyX_len,
+ otherKeyY, &otherKeyY_len);
+ }
+ }
+ else {
+ err = BAD_FUNC_ARG;
+ }
+
+ wc_ecc_free(&tmpKey);
+
+ if (err != 0) {
+ return err;
+ }
+
+ /* Compute shared secret */
+ err = stsafe_interface_shared_secret(curve_id, &otherKeyX[0], &otherKeyY[0],
+ out, (int32_t*)outlen);
+ if (err != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_shared_secret error: %d\n", err);
+ #endif
+ err = WC_HW_E;
+ }
+
+ return err;
+}
+
+int SSL_STSAFE_SetupPkCallbacks(WOLFSSL_CTX* ctx)
+{
+ wolfSSL_CTX_SetEccKeyGenCb(ctx, SSL_STSAFE_CreateKeyCb);
+ wolfSSL_CTX_SetEccSignCb(ctx, SSL_STSAFE_SignCertificateCb);
+ wolfSSL_CTX_SetEccVerifyCb(ctx, SSL_STSAFE_VerifyPeerCertCb);
+ wolfSSL_CTX_SetEccSharedSecretCb(ctx, SSL_STSAFE_SharedSecretCb);
+ wolfSSL_CTX_SetDevId(ctx, 0); /* enables wolfCrypt `wc_ecc_*` ST-Safe use */
+ return 0;
+}
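+
+/* Usage sketch (illustrative only, not built): wiring the STSAFE
+ * callbacks into a TLS client context. The TLS method choice is
+ * arbitrary; certificate loading is elided. */
+#if 0
+static void example_stsafe_tls_setup(void)
+{
+    WOLFSSL_CTX* ctx = wolfSSL_CTX_new(wolfTLSv1_2_client_method());
+    if (ctx != NULL) {
+        SSL_STSAFE_SetupPkCallbacks(ctx);
+        /* per session: SSL_STSAFE_SetupPkCallbackCtx(ssl, user_ctx); */
+    }
+}
+#endif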
+
+int SSL_STSAFE_SetupPkCallbackCtx(WOLFSSL* ssl, void* user_ctx)
+{
+ wolfSSL_SetEccKeyGenCtx(ssl, user_ctx);
+ wolfSSL_SetEccSharedSecretCtx(ssl, user_ctx);
+ wolfSSL_SetEccSignCtx(ssl, user_ctx);
+ wolfSSL_SetEccVerifyCtx(ssl, user_ctx);
+ return 0;
+}
+
+
+#endif /* HAVE_PK_CALLBACKS */
+
+#ifdef WOLF_CRYPTO_CB
+
+int wolfSSL_STSAFE_CryptoDevCb(int devId, wc_CryptoInfo* info, void* ctx)
+{
+ int rc = CRYPTOCB_UNAVAILABLE;
+ wolfSTSAFE_CryptoCb_Ctx* stsCtx = (wolfSTSAFE_CryptoCb_Ctx*)ctx;
+
+ if (info == NULL || ctx == NULL)
+ return BAD_FUNC_ARG;
+
+ (void)devId;
+ (void)stsCtx;
+
+ if (info->algo_type == WC_ALGO_TYPE_SEED) {
+ /* use the STSAFE hardware for RNG seed */
+ #if !defined(WC_NO_RNG) && defined(USE_STSAFE_RNG_SEED)
+ while (info->seed.sz > 0) {
+ rc = stsafe_interface_getrandom(info->seed.seed, info->seed.sz);
+ if (rc < 0) {
+ return rc;
+ }
+ info->seed.seed += rc;
+ info->seed.sz -= rc;
+ }
+ rc = 0;
+ #else
+ rc = CRYPTOCB_UNAVAILABLE;
+ #endif
+ }
+#ifdef HAVE_ECC
+ else if (info->algo_type == WC_ALGO_TYPE_PK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("STSAFE Pk: Type %d\n", info->pk.type);
+ #endif
+
+ if (info->pk.type == WC_PK_TYPE_EC_KEYGEN) {
+ byte pubKeyRaw[STSAFE_MAX_PUBKEY_RAW_LEN];
+ StSafeA_KeySlotNumber slot;
+ StSafeA_CurveId curve_id;
+ int ecc_curve, key_sz;
+
+ WOLFSSL_MSG("STSAFE: ECC KeyGen");
+
+ /* get curve */
+ ecc_curve = info->pk.eckg.curveId;
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* generate new ephemeral key on device */
+ rc = stsafe_interface_create_key(&slot, curve_id,
+ (uint8_t*)pubKeyRaw);
+ if (rc != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_create_key error: %d\n", rc);
+ #endif
+ rc = WC_HW_E;
+ return rc;
+ }
+
+ /* load generated public key into key, used by wolfSSL */
+ rc = wc_ecc_import_unsigned(info->pk.eckg.key, pubKeyRaw,
+ &pubKeyRaw[key_sz], NULL, ecc_curve);
+ }
+ else if (info->pk.type == WC_PK_TYPE_ECDSA_SIGN) {
+ byte digest[STSAFE_MAX_KEY_LEN];
+ byte sigRS[STSAFE_MAX_SIG_LEN];
+ byte *r, *s;
+ StSafeA_CurveId curve_id;
+ word32 inSz = info->pk.eccsign.inlen;
+ int key_sz;
+
+ WOLFSSL_MSG("STSAFE: ECC Sign");
+
+ curve_id = stsafe_get_curve_mode();
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* truncate input to match key size */
+            if (inSz > (word32)key_sz)
+ inSz = key_sz;
+
+ /* Build input digest */
+ XMEMSET(&digest[0], 0, sizeof(digest));
+ XMEMCPY(&digest[key_sz - inSz], info->pk.eccsign.in, inSz);
+
+ /* Sign using slot 0: Result is R then S */
+ /* Sign will always use the curve type in slot 0
+ (the TLS curve needs to match) */
+ XMEMSET(sigRS, 0, sizeof(sigRS));
+            rc = stsafe_interface_sign(STSAFE_A_SLOT_0, curve_id,
+                (uint8_t*)digest, sigRS);
+ if (rc != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_sign error: %d\n", rc);
+ #endif
+ rc = WC_HW_E;
+ return rc;
+ }
+
+ /* Convert R and S to signature */
+ r = &sigRS[0];
+ s = &sigRS[key_sz];
+ rc = wc_ecc_rs_raw_to_sig((const byte*)r, key_sz, (const byte*)s,
+ key_sz, info->pk.eccsign.out, info->pk.eccsign.outlen);
+ if (rc != 0) {
+ WOLFSSL_MSG("Error converting RS to Signature");
+ }
+ }
+ else if (info->pk.type == WC_PK_TYPE_ECDSA_VERIFY) {
+ byte sigRS[STSAFE_MAX_SIG_LEN];
+ byte *r, *s;
+ word32 r_len = STSAFE_MAX_SIG_LEN/2, s_len = STSAFE_MAX_SIG_LEN/2;
+ byte pubKeyX[STSAFE_MAX_PUBKEY_RAW_LEN/2];
+ byte pubKeyY[STSAFE_MAX_PUBKEY_RAW_LEN/2];
+ word32 pubKeyX_len = sizeof(pubKeyX);
+ word32 pubKeyY_len = sizeof(pubKeyY);
+ StSafeA_CurveId curve_id;
+ int ecc_curve, key_sz;
+
+ WOLFSSL_MSG("STSAFE: ECC Verify");
+
+ if (info->pk.eccverify.key == NULL)
+ return BAD_FUNC_ARG;
+
+ /* determine curve */
+ ecc_curve = info->pk.eccverify.key->dp->id;
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+ key_sz = stsafe_get_key_size(curve_id);
+
+ /* Extract Raw X and Y coordinates of the public key */
+ rc = wc_ecc_export_public_raw(info->pk.eccverify.key,
+ pubKeyX, &pubKeyX_len,
+ pubKeyY, &pubKeyY_len);
+ if (rc == 0) {
+ /* Extract R and S from signature */
+ XMEMSET(sigRS, 0, sizeof(sigRS));
+ r = &sigRS[0];
+ s = &sigRS[key_sz];
+ rc = wc_ecc_sig_to_rs(info->pk.eccverify.sig,
+ info->pk.eccverify.siglen, r, &r_len, s, &s_len);
+ (void)r_len;
+ (void)s_len;
+ }
+ if (rc == 0) {
+ /* Verify signature */
+ rc = stsafe_interface_verify(curve_id,
+ (uint8_t*)info->pk.eccverify.hash, sigRS, pubKeyX, pubKeyY,
+ (int32_t*)info->pk.eccverify.res);
+ if (rc != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_verify error: %d\n", rc);
+ #endif
+ rc = WC_HW_E;
+ }
+ }
+ }
+ else if (info->pk.type == WC_PK_TYPE_ECDH) {
+ byte otherKeyX[STSAFE_MAX_KEY_LEN];
+ byte otherKeyY[STSAFE_MAX_KEY_LEN];
+ word32 otherKeyX_len = sizeof(otherKeyX);
+ word32 otherKeyY_len = sizeof(otherKeyY);
+ StSafeA_CurveId curve_id;
+ int ecc_curve;
+
+ WOLFSSL_MSG("STSAFE: PMS");
+
+ if (info->pk.ecdh.public_key == NULL)
+ return BAD_FUNC_ARG;
+
+ /* get curve */
+ ecc_curve = info->pk.ecdh.public_key->dp->id;
+ curve_id = stsafe_get_ecc_curve_id(ecc_curve);
+
+ /* Export otherKey raw X and Y */
+ rc = wc_ecc_export_public_raw(info->pk.ecdh.public_key,
+ &otherKeyX[0], (word32*)&otherKeyX_len,
+ &otherKeyY[0], (word32*)&otherKeyY_len);
+ if (rc == 0) {
+ /* Compute shared secret */
+ *info->pk.ecdh.outlen = 0;
+ rc = stsafe_interface_shared_secret(curve_id,
+ otherKeyX, otherKeyY,
+ info->pk.ecdh.out, (int32_t*)info->pk.ecdh.outlen);
+ if (rc != STSAFE_A_OK) {
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("stsafe_interface_shared_secret error: %d\n", rc);
+ #endif
+ rc = WC_HW_E;
+ }
+ }
+ }
+ }
+#endif /* HAVE_ECC */
+
+ /* need to return negative here for error */
+ if (rc != 0 && rc != CRYPTOCB_UNAVAILABLE) {
+ WOLFSSL_MSG("STSAFE: CryptoCb failed");
+ #ifdef USE_STSAFE_VERBOSE
+ STSAFE_INTERFACE_PRINTF("STSAFE: CryptoCb failed %d\n", rc);
+ #endif
+ rc = WC_HW_E;
+ }
+
+ return rc;
+}
+
+#endif /* WOLF_CRYPTO_CB */
+
+#endif /* WOLFSSL_STSAFEA100 */
diff --git a/wolfcrypt/src/port/ti/ti-aes.c b/wolfcrypt/src/port/ti/ti-aes.c
new file mode 100644
index 0000000..52f2ceb
--- /dev/null
+++ b/wolfcrypt/src/port/ti/ti-aes.c
@@ -0,0 +1,569 @@
+/* port/ti/ti-aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifndef NO_AES
+
+
+#if defined(WOLFSSL_TI_CRYPT)
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/port/ti/ti-ccm.h>
+
+#include "inc/hw_aes.h"
+#include "inc/hw_memmap.h"
+#include "inc/hw_ints.h"
+#include "driverlib/aes.h"
+#include "driverlib/sysctl.h"
+#include "driverlib/rom_map.h"
+#include "driverlib/rom.h"
+
+static int AesSetIV(Aes* aes, const byte* iv)
+{
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ if (iv)
+ XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE);
+ else
+ XMEMSET(aes->reg, 0, AES_BLOCK_SIZE);
+
+ return 0;
+}
+
+WOLFSSL_API int wc_AesSetKey(Aes* aes, const byte* key, word32 len, const byte* iv,
+ int dir)
+{
+ if(!wolfSSL_TI_CCMInit())return 1 ;
+ if ((aes == NULL) || (key == NULL) || (iv == NULL))
+ return BAD_FUNC_ARG;
+ if(!((dir == AES_ENCRYPTION) || (dir == AES_DECRYPTION)))
+ return BAD_FUNC_ARG;
+
+ switch(len) {
+ case 16: aes->keylen = AES_CFG_KEY_SIZE_128BIT ; break ;
+ case 24: aes->keylen = AES_CFG_KEY_SIZE_192BIT ; break ;
+ case 32: aes->keylen = AES_CFG_KEY_SIZE_256BIT ; break ;
+ default: return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY(aes->key, key, len) ;
+ #ifdef WOLFSSL_AES_COUNTER
+ aes->left = 0;
+ #endif /* WOLFSSL_AES_COUNTER */
+ return AesSetIV(aes, iv);
+}
+
+#define AES_CFG_MODE_CTR_NOCTR (AES_CFG_MODE_CTR+100)
+#define IS_ALIGN16(p) (((unsigned int)(p)&0xf) == 0)
+
+static int AesAlign16(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode)
+{
+ wolfSSL_TI_lockCCM() ;
+ ROM_AESReset(AES_BASE);
+ ROM_AESConfigSet(AES_BASE, (aes->keylen | dir |
+ (mode==AES_CFG_MODE_CTR_NOCTR ? AES_CFG_MODE_CTR : mode)));
+ ROM_AESIVSet(AES_BASE, (uint32_t *)aes->reg);
+ ROM_AESKey1Set(AES_BASE, (uint32_t *)aes->key, aes->keylen);
+ if((dir == AES_CFG_DIR_DECRYPT)&& (mode == AES_CFG_MODE_CBC))
+ /* if input and output same will overwrite input iv */
+ XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ ROM_AESDataProcess(AES_BASE, (uint32_t *)in, (uint32_t *)out, sz);
+ wolfSSL_TI_unlockCCM() ;
+
+ /* store iv for next call */
+ if(mode == AES_CFG_MODE_CBC){
+ if(dir == AES_CFG_DIR_ENCRYPT)
+ XMEMCPY(aes->reg, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ else
+ XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+ }
+
+ if(mode == AES_CFG_MODE_CTR) {
+ do {
+ int i ;
+ for (i = AES_BLOCK_SIZE - 1; i >= 0; i--) {
+ if (++((byte *)aes->reg)[i])
+ break ;
+ }
+ sz -= AES_BLOCK_SIZE ;
+ } while((int)sz > 0) ;
+ }
+
+ return 0 ;
+}
+
+static int AesProcess(Aes* aes, byte* out, const byte* in, word32 sz, word32 dir, word32 mode)
+{
+ const byte * in_p ; byte * out_p ;
+ word32 size ;
+ #define TI_BUFFSIZE 1024
+ byte buff[TI_BUFFSIZE] ;
+
+ if ((aes == NULL) || (in == NULL) || (out == NULL))
+ return BAD_FUNC_ARG;
+ if(sz % AES_BLOCK_SIZE)
+ return BAD_FUNC_ARG;
+
+ while(sz > 0) {
+ size = sz ; in_p = in ; out_p = out ;
+ if(!IS_ALIGN16(in)){
+ size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ;
+ XMEMCPY(buff, in, size) ;
+ in_p = (const byte *)buff ;
+ }
+ if(!IS_ALIGN16(out)){
+ size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ;
+ out_p = buff ;
+ }
+
+ AesAlign16(aes, out_p, in_p, size, dir, mode) ;
+
+ if(!IS_ALIGN16(out)){
+ XMEMCPY(out, buff, size) ;
+ }
+ sz -= size ; in += size ; out += size ;
+ }
+
+ return 0 ;
+}
+
+WOLFSSL_API int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return AesProcess(aes, out, in, sz, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ;
+}
+
+WOLFSSL_API int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ return AesProcess(aes, out, in, sz, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ;
+}
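+
+/* Usage sketch (illustrative only, not built): one AES-128-CBC block
+ * through the CCM accelerator. All-zero key/IV/plaintext values are
+ * placeholders. */
+#if 0
+static void example_ti_aes_cbc(void)
+{
+    Aes  aes;
+    byte key[16] = { 0 }, iv[16] = { 0 };
+    byte plain[16] = { 0 }, cipher[16], decrypted[16];
+
+    wc_AesSetKey(&aes, key, sizeof(key), iv, AES_ENCRYPTION);
+    wc_AesCbcEncrypt(&aes, cipher, plain, sizeof(plain));
+
+    wc_AesSetKey(&aes, key, sizeof(key), iv, AES_DECRYPTION);
+    wc_AesCbcDecrypt(&aes, decrypted, cipher, sizeof(cipher));
+    /* decrypted now matches plain */
+}
+#endif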
+
+#ifdef WOLFSSL_AES_COUNTER
+WOLFSSL_API void wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+ char out_block[AES_BLOCK_SIZE] ;
+ int odd ;
+ int even ;
+ char *tmp ; /* (char *)aes->tmp, for short */
+
+ tmp = (char *)aes->tmp ;
+ if(aes->left) {
+ if((aes->left + sz) >= AES_BLOCK_SIZE){
+ odd = AES_BLOCK_SIZE - aes->left ;
+ } else {
+ odd = sz ;
+ }
+ XMEMCPY(tmp+aes->left, in, odd) ;
+ if((odd+aes->left) == AES_BLOCK_SIZE){
+ AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE,
+ AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR) ;
+ XMEMCPY(out, out_block+aes->left, odd) ;
+ aes->left = 0 ;
+ XMEMSET(tmp, 0x0, AES_BLOCK_SIZE) ;
+ }
+ in += odd ;
+ out+= odd ;
+ sz -= odd ;
+ }
+    odd = sz % AES_BLOCK_SIZE ; /* if there is a tail fragment */
+ if(sz / AES_BLOCK_SIZE) {
+ even = (sz/AES_BLOCK_SIZE)*AES_BLOCK_SIZE ;
+ AesProcess(aes, out, in, even, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CTR);
+ out += even ;
+ in += even ;
+ }
+ if(odd) {
+ XMEMSET(tmp+aes->left, 0x0, AES_BLOCK_SIZE - aes->left) ;
+ XMEMCPY(tmp+aes->left, in, odd) ;
+ AesProcess(aes, (byte *)out_block, (byte const *)tmp, AES_BLOCK_SIZE,
+ AES_CFG_DIR_ENCRYPT,
+ AES_CFG_MODE_CTR_NOCTR /* Counter mode without counting IV */
+ );
+ XMEMCPY(out, out_block+aes->left,odd) ;
+ aes->left += odd ;
+ }
+}
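+
+/* Usage sketch (illustrative only, not built): CTR mode accepts
+ * arbitrary lengths; the partial-block tail is buffered in aes->tmp and
+ * tracked via aes->left, so two 10-byte calls produce the same stream
+ * as one 20-byte call. Values are placeholders. */
+#if 0
+static void example_ti_aes_ctr(void)
+{
+    Aes  aes;
+    byte key[16] = { 0 }, iv[16] = { 0 };
+    byte msg[20] = { 0 }, enc[20];
+
+    wc_AesSetKey(&aes, key, sizeof(key), iv, AES_ENCRYPTION);
+    wc_AesCtrEncrypt(&aes, enc, msg, 10);           /* partial block buffered */
+    wc_AesCtrEncrypt(&aes, enc + 10, msg + 10, 10); /* continues the keystream */
+}
+#endif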
+#endif
+
+/* AES-DIRECT */
+#if defined(WOLFSSL_AES_DIRECT)
+WOLFSSL_API void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_ENCRYPT, AES_CFG_MODE_CBC) ;
+}
+WOLFSSL_API void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+{
+ AesProcess(aes, out, in, AES_BLOCK_SIZE, AES_CFG_DIR_DECRYPT, AES_CFG_MODE_CBC) ;
+}
+WOLFSSL_API int wc_AesSetKeyDirect(Aes* aes, const byte* key, word32 len,
+ const byte* iv, int dir)
+{
+ return(wc_AesSetKey(aes, key, len, iv, dir)) ;
+}
+#endif
+
+
+#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
+
+static int AesAuthSetKey(Aes* aes, const byte* key, word32 keySz)
+{
+ byte nonce[AES_BLOCK_SIZE];
+
+ if ((aes == NULL) || (key == NULL))
+ return BAD_FUNC_ARG ;
+ if (!((keySz == 16) || (keySz == 24) || (keySz == 32)))
+ return BAD_FUNC_ARG ;
+
+ XMEMSET(nonce, 0, sizeof(nonce));
+ return wc_AesSetKey(aes, key, keySz, nonce, AES_ENCRYPTION);
+}
+
+
+static int AesAuthArgCheck(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz, word32 *M, word32 *L)
+{
+ (void) authInSz ;
+ if((aes == NULL)||(nonce == NULL)||(authTag== NULL)||(authIn == NULL))
+ return BAD_FUNC_ARG;
+ if((inSz != 0) && ((out == NULL)||(in == NULL)))
+ return BAD_FUNC_ARG;
+
+ switch(authTagSz){
+ case 4:
+ *M = AES_CFG_CCM_M_4; break ;
+ case 6:
+ *M = AES_CFG_CCM_M_6; break ;
+ case 8:
+ *M = AES_CFG_CCM_M_8; break ;
+ case 10:
+ *M = AES_CFG_CCM_M_10; break ;
+ case 12:
+ *M = AES_CFG_CCM_M_12; break ;
+ case 14:
+ *M = AES_CFG_CCM_M_14; break ;
+ case 16:
+ *M = AES_CFG_CCM_M_16; break ;
+ default:
+ return 1 ;
+ }
+
+ switch(nonceSz){
+ case 7:
+ *L = AES_CFG_CCM_L_8; break ;
+ case 8:
+ *L = AES_CFG_CCM_L_7; break ;
+ case 9:
+ *L = AES_CFG_CCM_L_6; break ;
+ case 10:
+ *L = AES_CFG_CCM_L_5; break ;
+ case 11:
+ *L = AES_CFG_CCM_L_4; break ;
+ case 12:
+ *L = AES_CFG_CCM_L_3; break ;
+ case 13:
+ *L = AES_CFG_CCM_L_2; break ;
+ case 14:
+ *L = AES_CFG_CCM_L_1; break ;
+ default:
+ return 1;
+ }
+ return 0 ;
+}
+
+static void AesAuthSetIv(Aes *aes, const byte *nonce, word32 len, word32 L, int mode) {
+
+ if(mode == AES_CFG_MODE_CCM){
+ XMEMSET(aes->reg, 0, 16) ;
+ switch(L){
+ case AES_CFG_CCM_L_8:
+ aes->reg[0] = 0x7; break ;
+ case AES_CFG_CCM_L_7:
+ aes->reg[0] = 0x6; break ;
+ case AES_CFG_CCM_L_6:
+ aes->reg[0] = 0x5; break ;
+ case AES_CFG_CCM_L_5:
+ aes->reg[0] = 0x4; break ;
+ case AES_CFG_CCM_L_4:
+ aes->reg[0] = 0x3; break ;
+ case AES_CFG_CCM_L_3:
+ aes->reg[0] = 0x2; break ;
+ case AES_CFG_CCM_L_2:
+ aes->reg[0] = 0x1; break ;
+ case AES_CFG_CCM_L_1:
+ aes->reg[0] = 0x0; break ;
+ }
+ XMEMCPY(((byte *)aes->reg)+1, nonce, len) ;
+ } else {
+ byte *b = (byte *)aes->reg ;
+ XMEMSET(aes->reg, 0, AES_BLOCK_SIZE);
+ XMEMCPY(aes->reg, nonce, len);
+ b[AES_BLOCK_SIZE-4] = 0 ;
+ b[AES_BLOCK_SIZE-3] = 0 ;
+ b[AES_BLOCK_SIZE-2] = 0 ;
+ b[AES_BLOCK_SIZE-1] = 1 ;
+ }
+}
+
+#define RoundUp16(n) ((((n)+15))&0xfffffff0)
+#define FREE_ALL \
+ if(in_save) XFREE(in_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\
+ if(out_save) XFREE(out_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\
+ if(authIn_save)XFREE(authIn_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);\
+ if(nonce_save) XFREE(nonce_save, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+
+static int AesAuthEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz, int mode)
+{
+ word32 M, L ;
+ byte *in_a, *in_save ;
+ byte *out_a, *out_save ;
+ byte *authIn_a, *authIn_save ;
+ byte *nonce_a, *nonce_save ;
+ word32 tmpTag[4] ;
+ int ret ;
+
+    if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L)
+        != 0)return BAD_FUNC_ARG ;
+
+ /* 16 byte padding */
+ in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ;
+ if((inSz%16)==0){
+ in_save = NULL ; in_a = (byte *)in ;
+ out_save = NULL ; out_a = out ;
+ } else {
+ if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E ; }
+ in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ;
+
+ if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E ; }
+ out_a = out_save ;
+ }
+
+ if((authInSz%16)==0){
+ authIn_save = NULL ; authIn_a = (byte *)authIn ;
+ } else {
+ if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E ; }
+ authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ;
+ }
+
+ if((nonceSz%16)==0){
+ nonce_save = NULL ; nonce_a = (byte *)nonce ;
+ } else {
+ if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E; }
+ nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ;
+ }
+
+ /* do aes-ccm */
+ AesAuthSetIv(aes, nonce, nonceSz, L, mode) ;
+ ROM_AESReset(AES_BASE);
+ ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_ENCRYPT |
+ AES_CFG_CTR_WIDTH_128 |
+ mode | ((mode== AES_CFG_MODE_CCM) ? (L | M) : 0 ))) ;
+ ROM_AESIVSet(AES_BASE, aes->reg);
+ ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen);
+ ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz,
+ (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag);
+ if(ret == false){
+ XMEMSET(out, 0, inSz) ;
+ XMEMSET(authTag, 0, authTagSz) ;
+ } else {
+ XMEMCPY(out, out_a, inSz) ;
+ XMEMCPY(authTag, tmpTag, authTagSz) ;
+ }
+
+ FREE_ALL;
+    return ret==false ? 1 : 0 ;
+}
+
+static int AesAuthDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz, int mode)
+{
+ word32 M, L ;
+ byte *in_a, *in_save ;
+ byte *out_a, *out_save ;
+ byte *authIn_a, *authIn_save ;
+ byte *nonce_a, *nonce_save ;
+ word32 tmpTag[4] ;
+ bool ret ;
+
+    if(AesAuthArgCheck(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz, authIn, authInSz, &M, &L)
+        != 0)return BAD_FUNC_ARG ;
+
+ /* 16 byte padding */
+ in_save = NULL ; out_save = NULL ; authIn_save = NULL ; nonce_save = NULL ;
+ if((inSz%16)==0){
+ in_save = NULL ; in_a = (byte *)in ;
+ out_save = NULL ; out_a = out ;
+ } else {
+ if((in_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E;}
+ in_a = in_save ; XMEMSET(in_a, 0, RoundUp16(inSz)) ; XMEMCPY(in_a, in, inSz) ;
+
+ if((out_save = XMALLOC(RoundUp16(inSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E;}
+ out_a = out_save ;
+ }
+
+ if((authInSz%16)==0){
+ authIn_save = NULL ; authIn_a = (byte *)authIn ;
+ } else {
+ if((authIn_save = XMALLOC(RoundUp16(authInSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E; }
+ authIn_a = authIn_save ; XMEMSET(authIn_a, 0, RoundUp16(authInSz)) ; XMEMCPY(authIn_a, authIn, authInSz) ;
+ }
+
+ if((nonceSz%16)==0){
+ nonce_save = NULL ; nonce_a = (byte *)nonce ;
+ } else {
+ if((nonce_save = XMALLOC(RoundUp16(nonceSz), NULL, DYNAMIC_TYPE_TMP_BUFFER)) == NULL){
+ FREE_ALL; return MEMORY_E; }
+ nonce_a = nonce_save ; XMEMSET(nonce_a, 0, RoundUp16(nonceSz)) ; XMEMCPY(nonce_a, nonce, nonceSz) ;
+ }
+
+ /* do aes-ccm */
+ AesAuthSetIv(aes, nonce, nonceSz, L, mode) ;
+ ROM_AESReset(AES_BASE);
+ ROM_AESConfigSet(AES_BASE, (aes->keylen | AES_CFG_DIR_DECRYPT |
+ AES_CFG_CTR_WIDTH_128 |
+ mode | ((mode== AES_CFG_MODE_CCM) ? (L | M) : 0 ))) ;
+ ROM_AESIVSet(AES_BASE, aes->reg);
+ ROM_AESKey1Set(AES_BASE, aes->key, aes->keylen);
+ ret = ROM_AESDataProcessAuth(AES_BASE, (unsigned int*)in_a, (unsigned int *)out_a, inSz,
+ (unsigned int*)authIn_a, authInSz, (unsigned int *)tmpTag);
+ if((ret == false) || (XMEMCMP(authTag, tmpTag, authTagSz) != 0)){
+ XMEMSET(out, 0, inSz) ;
+ ret = false ;
+ } else {
+ XMEMCPY(out, out_a, inSz) ;
+ }
+
+ FREE_ALL ;
+ return ret==true ? 0 : 1 ;
+}
+#endif
+
+
+#ifdef HAVE_AESGCM
+WOLFSSL_API int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
+{
+ return AesAuthSetKey(aes, key, len) ;
+}
+
+WOLFSSL_API int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
+ return BAD_FUNC_ARG;
+ }
+ return AesAuthEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz,
+ authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ;
+}
+WOLFSSL_API int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return AesAuthDecrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz,
+ authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ;
+}
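+
+/* Usage sketch (illustrative only, not built): an AES-GCM round trip
+ * with a 12-byte IV and 16-byte tag. Note the argument checks above
+ * require a non-NULL AAD pointer. Values are placeholders. */
+#if 0
+static void example_ti_aes_gcm(void)
+{
+    Aes  aes;
+    byte key[16] = { 0 }, iv[12] = { 0 }, aad[16] = { 0 };
+    byte plain[16] = { 0 }, cipher[16], tag[16];
+
+    wc_AesGcmSetKey(&aes, key, sizeof(key));
+    wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain), iv, sizeof(iv),
+                     tag, sizeof(tag), aad, sizeof(aad));
+    wc_AesGcmDecrypt(&aes, plain, cipher, sizeof(cipher), iv, sizeof(iv),
+                     tag, sizeof(tag), aad, sizeof(aad));
+}
+#endif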
+
+WOLFSSL_API int wc_GmacSetKey(Gmac* gmac, const byte* key, word32 len)
+{
+ return AesAuthSetKey(&gmac->aes, key, len) ;
+}
+
+WOLFSSL_API int wc_GmacUpdate(Gmac* gmac, const byte* iv, word32 ivSz,
+ const byte* authIn, word32 authInSz,
+ byte* authTag, word32 authTagSz)
+{
+ return AesAuthEncrypt(&gmac->aes, NULL, NULL, 0, iv, ivSz, authTag, authTagSz,
+ authIn, authInSz, AES_CFG_MODE_GCM_HY0CALC) ;
+}
+
+#endif /* HAVE_AESGCM */
+
+#ifdef HAVE_AESCCM
+WOLFSSL_API int wc_AesCcmSetKey(Aes* aes, const byte* key, word32 keySz)
+{
+ return AesAuthSetKey(aes, key, keySz) ;
+}
+
+WOLFSSL_API int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return AesAuthEncrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz,
+ authIn, authInSz, AES_CFG_MODE_CCM) ;
+}
+
+WOLFSSL_API int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+ const byte* nonce, word32 nonceSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+ return AesAuthDecrypt(aes, out, in, inSz, nonce, nonceSz, authTag, authTagSz,
+ authIn, authInSz, AES_CFG_MODE_CCM) ;
+}
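+
+/* Usage sketch (illustrative only, not built): AES-CCM with a 13-byte
+ * nonce (L = 2) and an 8-byte tag (M = 8), matching the mappings in
+ * AesAuthArgCheck. Values are placeholders. */
+#if 0
+static void example_ti_aes_ccm(void)
+{
+    Aes  aes;
+    byte key[16] = { 0 }, nonce[13] = { 0 }, aad[16] = { 0 };
+    byte plain[16] = { 0 }, cipher[16], tag[8];
+
+    wc_AesCcmSetKey(&aes, key, sizeof(key));
+    wc_AesCcmEncrypt(&aes, cipher, plain, sizeof(plain), nonce,
+                     sizeof(nonce), tag, sizeof(tag), aad, sizeof(aad));
+    wc_AesCcmDecrypt(&aes, plain, cipher, sizeof(cipher), nonce,
+                     sizeof(nonce), tag, sizeof(tag), aad, sizeof(aad));
+}
+#endif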
+#endif /* HAVE_AESCCM */
+
+WOLFSSL_API int wc_AesInit(Aes* aes, void* heap, int devId)
+{
+ if (aes == NULL)
+ return BAD_FUNC_ARG;
+
+ aes->heap = heap;
+ (void)devId;
+
+ return 0;
+}
+
+WOLFSSL_API void wc_AesFree(Aes* aes)
+{
+ (void)aes;
+}
+
+#endif /* WOLFSSL_TI_CRYPT */
+
+#endif /* NO_AES */
+
+
+
diff --git a/wolfcrypt/src/port/ti/ti-ccm.c b/wolfcrypt/src/port/ti/ti-ccm.c
new file mode 100644
index 0000000..5c0051e
--- /dev/null
+++ b/wolfcrypt/src/port/ti/ti-ccm.c
@@ -0,0 +1,94 @@
+/* port/ti/ti-ccm.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(WOLFSSL_TI_CRYPT) || defined(WOLFSSL_TI_HASH)
+
+#include "wolfssl/wolfcrypt/port/ti/ti-ccm.h"
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifndef TI_DUMMY_BUILD
+#include "driverlib/sysctl.h"
+#include "driverlib/rom_map.h"
+#include "driverlib/rom.h"
+
+#ifndef SINGLE_THREADED
+#include <wolfssl/wolfcrypt/wc_port.h>
+ static wolfSSL_Mutex TI_CCM_Mutex;
+#endif
+#endif /* TI_DUMMY_BUILD */
+
+#define TIMEOUT 500000
+#define WAIT(stat) { volatile int i; for(i=0; i<TIMEOUT; i++)if(stat)break; if(i==TIMEOUT)return(false); }
+
+static bool ccm_init = false;
+int wolfSSL_TI_CCMInit(void)
+{
+ if (ccm_init)
+ return true;
+
+#ifndef TI_DUMMY_BUILD
+ SysCtlClockFreqSet((SYSCTL_XTAL_25MHZ |
+ SYSCTL_OSC_MAIN |
+ SYSCTL_USE_PLL |
+ SYSCTL_CFG_VCO_480), 120000000);
+
+ if (!ROM_SysCtlPeripheralPresent(SYSCTL_PERIPH_CCM0))
+ return false;
+
+ ROM_SysCtlPeripheralEnable(SYSCTL_PERIPH_CCM0);
+ WAIT(ROM_SysCtlPeripheralReady(SYSCTL_PERIPH_CCM0));
+ ROM_SysCtlPeripheralReset(SYSCTL_PERIPH_CCM0);
+ WAIT(ROM_SysCtlPeripheralReady(SYSCTL_PERIPH_CCM0));
+
+#ifndef SINGLE_THREADED
+ if (wc_InitMutex(&TI_CCM_Mutex))
+ return false;
+#endif
+#endif /* !TI_DUMMY_BUILD */
+
+    ccm_init = true;
+    return true;
+}
+
+#ifndef SINGLE_THREADED
+void wolfSSL_TI_lockCCM(void)
+{
+#ifndef TI_DUMMY_BUILD
+ wc_LockMutex(&TI_CCM_Mutex);
+#endif
+}
+
+void wolfSSL_TI_unlockCCM(void){
+#ifndef TI_DUMMY_BUILD
+ wc_UnLockMutex(&TI_CCM_Mutex);
+#endif
+}
+#endif /* !SINGLE_THREADED */
+
+#endif /* WOLFSSL_TI_CRYPT || WOLFSSL_TI_HASH */
diff --git a/wolfcrypt/src/port/ti/ti-des3.c b/wolfcrypt/src/port/ti/ti-des3.c
new file mode 100644
index 0000000..0e3c81d
--- /dev/null
+++ b/wolfcrypt/src/port/ti/ti-des3.c
@@ -0,0 +1,204 @@
+/* port/ti/ti-des3.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifndef NO_DES
+
+#if defined(WOLFSSL_TI_CRYPT)
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <wolfssl/wolfcrypt/des3.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/port/ti/ti-ccm.h>
+
+#include "inc/hw_des.h"
+#include "inc/hw_memmap.h"
+#include "inc/hw_ints.h"
+#include "driverlib/des.h"
+#include "driverlib/sysctl.h"
+#include "driverlib/rom_map.h"
+#include "driverlib/rom.h"
+
+static int DesSetIV(Des* des, const byte* iv, int tri)
+{
+ if (des == NULL)
+ return BAD_FUNC_ARG;
+
+ if (iv)
+ XMEMCPY(des->reg, iv, tri == DES_CFG_TRIPLE ? DES3_IVLEN : DES_IVLEN);
+ else
+ XMEMSET(des->reg, 0, tri == DES_CFG_TRIPLE ? DES3_IVLEN : DES_IVLEN);
+
+ return 0;
+}
+
+static int DesSetKey(Des* des, const byte* key, const byte* iv,int dir, int tri)
+{
+ if(!wolfSSL_TI_CCMInit())return 1 ;
+ if ((des == NULL) || (key == NULL) || (iv == NULL))
+ return BAD_FUNC_ARG;
+ if(!((dir == DES_ENCRYPTION) || (dir == DES_DECRYPTION)))
+ return BAD_FUNC_ARG;
+
+ XMEMCPY(des->key, key, tri == DES_CFG_SINGLE ? DES_KEYLEN : DES3_KEYLEN) ;
+ return DesSetIV(des, iv, tri);
+}
+
+static int DesCbcAlign16(Des* des, byte* out, const byte* in, word32 sz, word32 dir, word32 tri)
+{
+
+ wolfSSL_TI_lockCCM() ;
+ ROM_DESReset(DES_BASE);
+ ROM_DESConfigSet(DES_BASE, (dir | DES_CFG_MODE_CBC | tri));
+ ROM_DESIVSet(DES_BASE, (uint32_t*)des->reg);
+ ROM_DESKeySet(DES_BASE,(uint32_t*)des->key);
+ if(dir == DES_CFG_DIR_DECRYPT)
+ /* if input and output same will overwrite input iv */
+ XMEMCPY(des->tmp, in + sz - DES_BLOCK_SIZE, DES_BLOCK_SIZE);
+ ROM_DESDataProcess(DES_BASE, (uint32_t *)in, (uint32_t *)out, sz);
+ wolfSSL_TI_unlockCCM() ;
+
+ /* store iv for next call */
+ if(dir == DES_CFG_DIR_ENCRYPT)
+ XMEMCPY(des->reg, out + sz - DES_BLOCK_SIZE, DES_BLOCK_SIZE);
+ else
+ XMEMCPY(des->reg, des->tmp, DES_BLOCK_SIZE);
+
+ return 0 ;
+}
+
+#define IS_ALIGN16(p) (((unsigned int)(p)&0xf) == 0)
+
+static int DesCbc(Des* des, byte* out, const byte* in, word32 sz, word32 dir, word32 tri)
+{
+ const byte * in_p ; byte * out_p ;
+ word32 size ;
+ #define TI_BUFFSIZE 1024
+ byte buff[TI_BUFFSIZE] ;
+ if ((des == NULL) || (in == NULL) || (out == NULL))
+ return BAD_FUNC_ARG;
+ if(sz % DES_BLOCK_SIZE)
+ return BAD_FUNC_ARG;
+
+ while(sz > 0) {
+ size = sz ; in_p = in ; out_p = out ;
+ if(!IS_ALIGN16(in)){
+ size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ;
+ XMEMCPY(buff, in, size) ;
+ in_p = (const byte *)buff ;
+ }
+ if(!IS_ALIGN16(out)){
+ size = sz>TI_BUFFSIZE ? TI_BUFFSIZE : sz ;
+ out_p = (byte *)buff ;
+ }
+
+ DesCbcAlign16(des, out_p, in_p, size, dir, tri) ;
+
+ if(!IS_ALIGN16(out)){
+ XMEMCPY(out, buff, size) ;
+ }
+ sz -= size ; in += size ; out += size ;
+ }
+ return 0 ;
+}
+
+WOLFSSL_API int wc_Des_SetKey(Des* des, const byte* key, const byte* iv,int dir)
+{
+ return DesSetKey(des, key, iv, dir, DES_CFG_SINGLE) ;
+}
+
+WOLFSSL_API void wc_Des_SetIV(Des* des, const byte* iv)
+{
+ DesSetIV(des, iv, DES_CFG_SINGLE) ;
+}
+
+WOLFSSL_API int wc_Des3_SetKey(Des3* des, const byte* key, const byte* iv,int dir)
+{
+ return DesSetKey((Des *)des, key, iv, dir, DES_CFG_TRIPLE) ;
+}
+
+WOLFSSL_API int wc_Des3_SetIV(Des3* des, const byte* iv)
+{
+ return DesSetIV((Des *)des, iv, DES_CFG_TRIPLE) ;
+}
+
+
+WOLFSSL_API int wc_Des_CbcEncrypt(Des* des, byte* out, const byte* in, word32 sz)
+{
+ return DesCbc(des, out, in, sz, DES_CFG_DIR_ENCRYPT, DES_CFG_SINGLE) ;
+}
+
+WOLFSSL_API int wc_Des_CbcDecrypt(Des* des, byte* out, const byte* in, word32 sz)
+{
+ return DesCbc(des, out, in, sz, DES_CFG_DIR_DECRYPT, DES_CFG_SINGLE) ;
+}
+
+WOLFSSL_API int wc_Des_CbcDecryptWithKey(byte* out, const byte* in, word32 sz,
+ const byte* key, const byte* iv)
+{
+ (void)out; (void)in; (void)sz; (void)key; (void)iv ;
+ return -1 ;
+}
+
+WOLFSSL_API int wc_Des3_CbcEncrypt(Des3* des, byte* out, const byte* in, word32 sz)
+{
+ return DesCbc((Des *)des, out, in, sz, DES_CFG_DIR_ENCRYPT, DES_CFG_TRIPLE) ;
+}
+
+WOLFSSL_API int wc_Des3_CbcDecrypt(Des3* des, byte* out, const byte* in, word32 sz)
+{
+ return DesCbc((Des *)des, out, in, sz, DES_CFG_DIR_DECRYPT, DES_CFG_TRIPLE) ;
+}
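+
+/* Usage sketch (illustrative only, not built): one 3DES-CBC block.
+ * All-zero key/IV values are placeholders. */
+#if 0
+static void example_ti_des3_cbc(void)
+{
+    Des3 des3;
+    byte key[24] = { 0 }, iv[8] = { 0 };
+    byte plain[8] = { 0 }, cipher[8];
+
+    wc_Des3_SetKey(&des3, key, iv, DES_ENCRYPTION);
+    wc_Des3_CbcEncrypt(&des3, cipher, plain, sizeof(plain));
+
+    wc_Des3_SetKey(&des3, key, iv, DES_DECRYPTION);
+    wc_Des3_CbcDecrypt(&des3, plain, cipher, sizeof(cipher));
+}
+#endif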
+
+WOLFSSL_API int wc_Des3_CbcDecryptWithKey(byte* out, const byte* in, word32 sz,
+ const byte* key, const byte* iv)
+{
+ (void)out; (void)in; (void)sz; (void)key; (void)iv ;
+ return -1 ;
+}
+
+WOLFSSL_API int wc_Des3Init(Des3* des, void* heap, int devId)
+{
+ if (des == NULL)
+ return BAD_FUNC_ARG;
+
+ des->heap = heap;
+ (void)devId;
+
+ return 0;
+}
+
+WOLFSSL_API void wc_Des3Free(Des3* des)
+{
+ (void)des;
+}
+
+
+#endif /* WOLFSSL_TI_CRYPT */
+
+#endif /* NO_DES */
diff --git a/wolfcrypt/src/port/ti/ti-hash.c b/wolfcrypt/src/port/ti/ti-hash.c
new file mode 100644
index 0000000..ab8f2cc
--- /dev/null
+++ b/wolfcrypt/src/port/ti/ti-hash.c
@@ -0,0 +1,338 @@
+/* port/ti/ti-hash.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#include <wolfssl/wolfcrypt/types.h>
+
+#if defined(WOLFSSL_TI_HASH)
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/md5.h>
+#include <wolfssl/wolfcrypt/sha.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+#include <wolfssl/wolfcrypt/port/ti/ti-hash.h>
+#include <wolfssl/wolfcrypt/port/ti/ti-ccm.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/hash.h>
+
+#ifndef TI_DUMMY_BUILD
+#include "inc/hw_memmap.h"
+#include "inc/hw_shamd5.h"
+#include "inc/hw_ints.h"
+#include "driverlib/shamd5.h"
+#include "driverlib/sysctl.h"
+#include "driverlib/rom_map.h"
+#include "driverlib/rom.h"
+#else
+#define SHAMD5_ALGO_MD5 1
+#define SHAMD5_ALGO_SHA1 2
+#define SHAMD5_ALGO_SHA256 3
+#define SHAMD5_ALGO_SHA224 4
+#endif
+
+static int hashInit(wolfssl_TI_Hash *hash)
+{
+    if (!wolfSSL_TI_CCMInit())
+        return 1;
+    hash->used = 0;
+    hash->msg  = NULL;
+    hash->len  = 0;
+    return 0;
+}
+
+static int hashUpdate(wolfssl_TI_Hash *hash, const byte* data, word32 len)
+{
+    void *p;
+
+    if ((hash == NULL) || (data == NULL))
+        return BAD_FUNC_ARG;
+
+    /* grow the message buffer; the SHAMD5 engine consumes the whole
+     * buffered message in one pass at finalization */
+    if (hash->len < hash->used + len) {
+        if (hash->msg == NULL) {
+            p = XMALLOC(hash->used + len, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        } else {
+            p = XREALLOC(hash->msg, hash->used + len, NULL,
+                         DYNAMIC_TYPE_TMP_BUFFER);
+        }
+        if (p == NULL)
+            return MEMORY_E;
+        hash->msg = p;
+        hash->len = hash->used + len;
+    }
+    XMEMCPY(hash->msg + hash->used, data, len);
+    hash->used += len;
+    return 0;
+}
+
+static int hashGetHash(wolfssl_TI_Hash *hash, byte* result, word32 algo, word32 hsize)
+{
+ uint32_t h[16];
+#ifndef TI_DUMMY_BUILD
+ wolfSSL_TI_lockCCM();
+ ROM_SHAMD5Reset(SHAMD5_BASE);
+ ROM_SHAMD5ConfigSet(SHAMD5_BASE, algo);
+ ROM_SHAMD5DataProcess(SHAMD5_BASE,
+ (uint32_t *)hash->msg, hash->used, h);
+ wolfSSL_TI_unlockCCM();
+#else
+ (void) hash;
+ (void) algo;
+
+ XMEMSET(h, 0, sizeof(h));
+#endif
+ XMEMCPY(result, h, hsize);
+
+ return 0;
+}
+
+static int hashCopy(wolfssl_TI_Hash *src, wolfssl_TI_Hash *dst)
+{
+    XMEMCPY(dst, src, sizeof(wolfssl_TI_Hash));
+    return 0;
+}
+
+static int hashFinal(wolfssl_TI_Hash *hash, byte* result, word32 algo, word32 hsize)
+{
+ hashGetHash(hash, result, algo, hsize);
+ XFREE(hash->msg, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+ hashInit(hash);
+ return 0;
+}
+
+static int hashHash(const byte* data, word32 len, byte* hash, word32 algo, word32 hsize)
+{
+ int ret = 0;
+#ifdef WOLFSSL_SMALL_STACK
+ wolfssl_TI_Hash* hash_desc;
+#else
+ wolfssl_TI_Hash hash_desc[1];
+#endif
+
+#ifdef WOLFSSL_SMALL_STACK
+ hash_desc = (wolfssl_TI_Hash*)XMALLOC(sizeof(wolfssl_TI_Hash), NULL, DYNAMIC_TYPE_TMP_BUFFER);
+ if (hash_desc == NULL)
+ return MEMORY_E;
+#endif
+
+ if ((ret = hashInit(hash_desc)) != 0) {
+ WOLFSSL_MSG("Hash Init failed");
+ }
+ else {
+ hashUpdate(hash_desc, data, len);
+ hashFinal(hash_desc, hash, algo, hsize);
+ }
+
+#ifdef WOLFSSL_SMALL_STACK
+ XFREE(hash_desc, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+
+ return ret;
+}
+
+static int hashFree(wolfssl_TI_Hash *hash)
+{
+ XFREE(hash->msg, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+ hashInit(hash);
+ return 0;
+}
+
+#if !defined(NO_MD5)
+WOLFSSL_API int wc_InitMd5_ex(Md5* md5, void* heap, int devId)
+{
+    if (md5 == NULL)
+        return BAD_FUNC_ARG;
+    (void)heap;
+    (void)devId;
+    return hashInit((wolfssl_TI_Hash *)md5);
+}
+WOLFSSL_API int wc_InitMd5(Md5* md5)
+{
+ return wc_InitMd5_ex(md5, NULL, INVALID_DEVID);
+}
+
+WOLFSSL_API int wc_Md5Update(Md5* md5, const byte* data, word32 len)
+{
+ return hashUpdate((wolfssl_TI_Hash *)md5, data, len);
+}
+
+WOLFSSL_API int wc_Md5Final(Md5* md5, byte* hash)
+{
+ return hashFinal((wolfssl_TI_Hash *)md5, hash, SHAMD5_ALGO_MD5, MD5_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Md5GetHash(Md5* md5, byte* hash)
+{
+ return hashGetHash((wolfssl_TI_Hash *)md5, hash, SHAMD5_ALGO_MD5, MD5_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Md5Copy(Md5* src, Md5* dst)
+{
+    return hashCopy((wolfssl_TI_Hash *)src, (wolfssl_TI_Hash *)dst);
+}
+
+WOLFSSL_API int wc_Md5Hash(const byte* data, word32 len, byte* hash)
+{
+    return hashHash(data, len, hash, SHAMD5_ALGO_MD5, MD5_DIGEST_SIZE);
+}
+
+WOLFSSL_API void wc_Md5Free(Md5* md5)
+{
+ hashFree((wolfssl_TI_Hash *)md5);
+}
+
+#endif /* !NO_MD5 */
+
+#if !defined(NO_SHA)
+WOLFSSL_API int wc_InitSha_ex(Sha* sha, void* heap, int devId)
+{
+    if (sha == NULL)
+        return BAD_FUNC_ARG;
+    (void)heap;
+    (void)devId;
+    return hashInit((wolfssl_TI_Hash *)sha);
+}
+WOLFSSL_API int wc_InitSha(Sha* sha)
+{
+ return wc_InitSha_ex(sha, NULL, INVALID_DEVID);
+}
+
+WOLFSSL_API int wc_ShaUpdate(Sha* sha, const byte* data, word32 len)
+{
+ return hashUpdate((wolfssl_TI_Hash *)sha, data, len);
+}
+
+WOLFSSL_API int wc_ShaFinal(Sha* sha, byte* hash)
+{
+ return hashFinal((wolfssl_TI_Hash *)sha, hash, SHAMD5_ALGO_SHA1, SHA_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_ShaGetHash(Sha* sha, byte* hash)
+{
+    return hashGetHash((wolfssl_TI_Hash *)sha, hash, SHAMD5_ALGO_SHA1, SHA_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_ShaCopy(Sha* src, Sha* dst)
+{
+    return hashCopy((wolfssl_TI_Hash *)src, (wolfssl_TI_Hash *)dst);
+}
+
+WOLFSSL_API int wc_ShaHash(const byte* data, word32 len, byte* hash)
+{
+    return hashHash(data, len, hash, SHAMD5_ALGO_SHA1, SHA_DIGEST_SIZE);
+}
+
+WOLFSSL_API void wc_ShaFree(Sha* sha)
+{
+ hashFree((wolfssl_TI_Hash *)sha);
+}
+
+#endif /* !NO_SHA */
+
+#if defined(WOLFSSL_SHA224)
+WOLFSSL_API int wc_InitSha224_ex(Sha224* sha224, void* heap, int devId)
+{
+    if (sha224 == NULL)
+        return BAD_FUNC_ARG;
+    (void)heap;
+    (void)devId;
+    return hashInit((wolfssl_TI_Hash *)sha224);
+}
+WOLFSSL_API int wc_InitSha224(Sha224* sha224)
+{
+ return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
+}
+
+WOLFSSL_API int wc_Sha224Update(Sha224* sha224, const byte* data, word32 len)
+{
+ return hashUpdate((wolfssl_TI_Hash *)sha224, data, len);
+}
+
+WOLFSSL_API int wc_Sha224Final(Sha224* sha224, byte* hash)
+{
+ return hashFinal((wolfssl_TI_Hash *)sha224, hash, SHAMD5_ALGO_SHA224, SHA224_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Sha224GetHash(Sha224* sha224, byte* hash)
+{
+    return hashGetHash((wolfssl_TI_Hash *)sha224, hash, SHAMD5_ALGO_SHA224, SHA224_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Sha224Hash(const byte* data, word32 len, byte* hash)
+{
+    return hashHash(data, len, hash, SHAMD5_ALGO_SHA224, SHA224_DIGEST_SIZE);
+}
+
+WOLFSSL_API void wc_Sha224Free(Sha224* sha224)
+{
+ hashFree((wolfssl_TI_Hash *)sha224);
+}
+
+#endif /* WOLFSSL_SHA224 */
+
+#if !defined(NO_SHA256)
+WOLFSSL_API int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
+{
+    if (sha256 == NULL)
+        return BAD_FUNC_ARG;
+    (void)heap;
+    (void)devId;
+    return hashInit((wolfssl_TI_Hash *)sha256);
+}
+
+WOLFSSL_API int wc_InitSha256(Sha256* sha256)
+{
+ return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
+}
+
+WOLFSSL_API int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
+{
+ return hashUpdate((wolfssl_TI_Hash *)sha256, data, len);
+}
+
+WOLFSSL_API int wc_Sha256Final(Sha256* sha256, byte* hash)
+{
+ return hashFinal((wolfssl_TI_Hash *)sha256, hash, SHAMD5_ALGO_SHA256, SHA256_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Sha256GetHash(Sha256* sha256, byte* hash)
+{
+    return hashGetHash((wolfssl_TI_Hash *)sha256, hash, SHAMD5_ALGO_SHA256, SHA256_DIGEST_SIZE);
+}
+
+WOLFSSL_API int wc_Sha256Hash(const byte* data, word32 len, byte* hash)
+{
+    return hashHash(data, len, hash, SHAMD5_ALGO_SHA256, SHA256_DIGEST_SIZE);
+}
+
+WOLFSSL_API void wc_Sha256Free(Sha256* sha256)
+{
+ hashFree((wolfssl_TI_Hash *)sha256);
+}
+
+#endif /* !NO_SHA256 */
+
+#ifdef __cplusplus
+    }  /* extern "C" */
+#endif
+
+#endif /* WOLFSSL_TI_HASH */
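
A minimal sketch of the streaming flow these wrappers expose, shown here for SHA-256 with standard wolfCrypt calls (note that this port buffers the whole message in hashUpdate and runs the SHAMD5 engine once at finalization, so Update itself never touches hardware):

    #include <wolfssl/wolfcrypt/sha256.h>

    int ti_sha256_demo(const byte* data, word32 len,
                       byte digest[SHA256_DIGEST_SIZE])
    {
        Sha256 sha;
        int    ret;

        ret = wc_InitSha256(&sha);
        if (ret == 0)
            ret = wc_Sha256Update(&sha, data, len); /* may be repeated */
        if (ret == 0)
            ret = wc_Sha256Final(&sha, digest);     /* single HW pass */
        wc_Sha256Free(&sha);
        return ret;
    }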
diff --git a/wolfcrypt/src/port/xilinx/xil-aesgcm.c b/wolfcrypt/src/port/xilinx/xil-aesgcm.c
new file mode 100644
index 0000000..6af4b31
--- /dev/null
+++ b/wolfcrypt/src/port/xilinx/xil-aesgcm.c
@@ -0,0 +1,202 @@
+/* xil-aesgcm.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if !defined(NO_AES) && defined(WOLFSSL_XILINX_CRYPT)
+
+#include <wolfssl/wolfcrypt/aes.h>
+
+
+#ifdef HAVE_AESGCM
+/* Make calls to Xilinx hardened AES-GCM crypto */
+
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+
+#ifdef NO_INLINE
+ #include <wolfssl/wolfcrypt/misc.h>
+#else
+ #define WOLFSSL_MISC_INCLUDED
+ #include <wolfcrypt/src/misc.c>
+#endif
+
+#include "xparameters.h"
+
+enum {
+ AEAD_NONCE_SZ = 12,
+ AES_GCM_AUTH_SZ = 16, /* AES-GCM Auth Tag length */
+};
+
+
+int wc_AesGcmSetKey_ex(Aes* aes, const byte* key, word32 len, word32 kup)
+{
+ XCsuDma_Config* con;
+
+ if (aes == NULL || key == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (len != 32) {
+ WOLFSSL_MSG("Expecting a 256 bit key");
+ return BAD_FUNC_ARG;
+ }
+
+ if ((con = XCsuDma_LookupConfig(0)) == NULL) {
+ WOLFSSL_MSG("Failed to look up config");
+ return MEMORY_E;
+ }
+
+ /* XST_SUCCESS comes from Xilinx header file */
+ if (XCsuDma_CfgInitialize(&(aes->dma), con, con->BaseAddress) !=
+ XST_SUCCESS) {
+ WOLFSSL_MSG("Failed to initialize hardware");
+ return MEMORY_E;
+ }
+
+ aes->keylen = len;
+ aes->kup = kup;
+ XMEMCPY((byte*)(aes->key_init), key, len);
+
+ return 0;
+}
+
+
+
+int wc_AesGcmEncrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+    byte* tmp;
+    byte scratch[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+
+ if ((in == NULL && sz > 0) || iv == NULL || authTag == NULL ||
+ authTagSz > AES_GCM_AUTH_SZ) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (ivSz != AEAD_NONCE_SZ) {
+ WOLFSSL_MSG("Expecting an IV size of 12");
+ return BAD_FUNC_ARG;
+ }
+
+ /* API expects that output is size of input + 16 byte tag. A temporary
+ * buffer is created to keep AES encrypt from writing over the end of
+ * out buffer. */
+ if (in != NULL) {
+ if (aes->keylen != 32) {
+ WOLFSSL_MSG("Expecting 256 bit AES key");
+ return BAD_FUNC_ARG;
+ }
+
+ tmp = (byte*)XMALLOC(sz + AES_GCM_AUTH_SZ, aes->heap,
+ DYNAMIC_TYPE_TMP_BUFFER);
+ if (tmp == NULL) {
+ return MEMORY_E;
+ }
+
+ XSecure_AesInitialize(&(aes->xilAes), &(aes->dma), aes->kup, (word32*)iv,
+ aes->key_init);
+ XSecure_AesEncryptData(&(aes->xilAes), tmp, in, sz);
+ XMEMCPY(out, tmp, sz);
+ XMEMCPY(authTag, tmp + sz, authTagSz);
+ XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+ }
+
+    /* handle completing tag with any additional data */
+    if (authIn != NULL) {
+        /* TODO: avoid hashing out again since the Xilinx call already does */
+        XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+        GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+        wc_AesEncryptDirect(aes, scratch, initialCounter);
+        xorbuf(authTag, scratch, authTagSz);
+    }
+
+ return 0;
+}
+
+
+int wc_AesGcmDecrypt(Aes* aes, byte* out,
+ const byte* in, word32 sz,
+ const byte* iv, word32 ivSz,
+ const byte* authTag, word32 authTagSz,
+ const byte* authIn, word32 authInSz)
+{
+    byte* tag;
+    byte buf[AES_GCM_AUTH_SZ];
+    byte scratch[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+
+ if (in == NULL || iv == NULL || authTag == NULL ||
+ authTagSz < AES_GCM_AUTH_SZ) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (ivSz != AEAD_NONCE_SZ) {
+ WOLFSSL_MSG("Expecting an IV size of 12");
+ return BAD_FUNC_ARG;
+ }
+
+    /* start the tag over the ciphertext; additional data is folded
+     * in after decryption */
+    if (authIn != NULL && authInSz > 0) {
+        XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+        tag = buf;
+        GHASH(aes, NULL, 0, in, sz, tag, AES_GCM_AUTH_SZ);
+        wc_AesEncryptDirect(aes, scratch, initialCounter);
+        xorbuf(tag, scratch, AES_GCM_AUTH_SZ);
+    }
+ else {
+ tag = authTag;
+ }
+
+ /* calls to hardened crypto */
+ XSecure_AesInitialize(&(aes->xilAes), &(aes->dma), aes->kup,
+ (word32*)iv, aes->key_init);
+ XSecure_AesDecryptData(&(aes->xilAes), out, in, sz, tag);
+
+    /* fold in the additional data and verify the tag */
+    if (authIn != NULL && authInSz > 0) {
+        GHASH(aes, authIn, authInSz, in, sz, tag, AES_GCM_AUTH_SZ);
+        wc_AesEncryptDirect(aes, scratch, initialCounter);
+        xorbuf(tag, scratch, AES_GCM_AUTH_SZ);
+        if (ConstantCompare(authTag, tag, authTagSz) != 0) {
+            return AES_GCM_AUTH_E;
+        }
+    }
+
+    return 0;
+}
+#endif /* HAVE_AESGCM */
+
+#endif /* !NO_AES && WOLFSSL_XILINX_CRYPT */
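
A minimal caller-side sketch for the hardened AES-GCM path above (this port requires a 256-bit key and a 12-byte IV; the kup argument is passed straight through to XSecure_AesInitialize as the key selection, so the 0 used here is purely a placeholder, and the key, IV, and buffer names are assumptions):

    #include <wolfssl/wolfcrypt/aes.h>

    int xil_aesgcm_demo(Aes* aes, const byte key[32], const byte iv[12],
                        const byte* plain, word32 sz, byte* cipher,
                        byte tag[16], const byte* aad, word32 aadSz)
    {
        int ret = wc_AesGcmSetKey_ex(aes, key, 32, 0 /* kup: placeholder */);
        if (ret == 0)
            ret = wc_AesGcmEncrypt(aes, cipher, plain, sz, iv, 12,
                                   tag, 16, aad, aadSz);
        return ret;
    }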
diff --git a/wolfcrypt/src/port/xilinx/xil-sha3.c b/wolfcrypt/src/port/xilinx/xil-sha3.c
new file mode 100644
index 0000000..a9db6b9
--- /dev/null
+++ b/wolfcrypt/src/port/xilinx/xil-sha3.c
@@ -0,0 +1,158 @@
+/* xil-sha3.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+ #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+
+#if defined(WOLFSSL_SHA3) && defined(WOLFSSL_XILINX_CRYPT)
+
+#include <wolfssl/wolfcrypt/sha3.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+
+#if !defined(WOLFSSL_NOSHA3_224) || !defined(WOLFSSL_NOSHA3_256) \
+        || !defined(WOLFSSL_NOSHA3_512)
+    #error SHA3 digest sizes other than 384 are not supported by this port
+#endif
+
+/* Initialize hardware for SHA3 operations
+ *
+ * sha SHA3 structure to initialize
+ * heap memory heap hint to use
+ * devId used for async operations (currently not supported here)
+ */
+int wc_InitSha3_384(wc_Sha3* sha, void* heap, int devId)
+{
+ XCsuDma_Config* con;
+
+ (void)heap;
+ (void)devId;
+
+ if (sha == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if ((con = XCsuDma_LookupConfig(0)) == NULL) {
+ WOLFSSL_MSG("Unable to look up configure for SHA3");
+ return BAD_STATE_E;
+ }
+
+ /* XST_SUCCESS is success macro from Xilinx header */
+ if (XCsuDma_CfgInitialize(&(sha->dma), con, con->BaseAddress) !=
+ XST_SUCCESS) {
+ WOLFSSL_MSG("Unable to initialize CsuDma");
+ return BAD_STATE_E;
+ }
+
+ XSecure_Sha3Initialize(&(sha->hw), &(sha->dma));
+ XSecure_Sha3Start(&(sha->hw));
+
+ return 0;
+}
+
+
+/* Update SHA3 state
+ *
+ * sha SHA3 structure to update
+ * data message to update SHA3 state with
+ * len length of data buffer
+ */
+int wc_Sha3_384_Update(wc_Sha3* sha, const byte* data, word32 len)
+{
+ if (sha == NULL || (data == NULL && len > 0)) {
+ return BAD_FUNC_ARG;
+ }
+ XSecure_Sha3Update(&(sha->hw), (byte*)data, len);
+
+ return 0;
+}
+
+
+/* Finalize SHA3 state and get digest
+ *
+ * sha SHA3 structure to get hash
+ * out digest out, expected to be large enough to hold SHA3 digest
+ */
+int wc_Sha3_384_Final(wc_Sha3* sha, byte* out)
+{
+ if (sha == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+ XSecure_Sha3Finish(&(sha->hw), out);
+
+ return wc_InitSha3_384(sha, NULL, INVALID_DEVID);
+}
+
+
+/* Free SHA3 structure
+ *
+ * sha SHA3 structure to free
+ */
+void wc_Sha3_384_Free(wc_Sha3* sha)
+{
+ (void)sha;
+ /* nothing to free yet */
+}
+
+
+/* Get SHA3 digest without finalizing the SHA3 state
+ *
+ * sha SHA3 structure to get hash
+ * out digest out, expected to be large enough to hold SHA3 digest
+ */
+int wc_Sha3_384_GetHash(wc_Sha3* sha, byte* out)
+{
+ wc_Sha3 s;
+
+ if (sha == NULL || out == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ if (wc_Sha3_384_Copy(sha, &s) != 0) {
+ WOLFSSL_MSG("Unable to copy SHA3 structure");
+ return MEMORY_E;
+ }
+
+ return wc_Sha3_384_Final(&s, out);
+}
+
+
+/* Get copy of SHA3 structure
+ *
+ * src SHA3 structure to make copy of
+ * dst  [out] structure to hold the copy
+ */
+int wc_Sha3_384_Copy(wc_Sha3* src, wc_Sha3* dst)
+{
+    if (src == NULL || dst == NULL) {
+ return BAD_FUNC_ARG;
+ }
+
+ XMEMCPY((byte*)dst, (byte*)src, sizeof(wc_Sha3));
+ return 0;
+}
+
+#endif /* WOLFSSL_SHA3 && WOLFSSL_XILINX_CRYPT */
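
A minimal sketch of the SHA3-384 flow above, using the functions this file defines (note that wc_Sha3_384_Final re-initializes the state, so the context can be reused immediately after finalization):

    #include <wolfssl/wolfcrypt/sha3.h>

    int xil_sha3_384_demo(const byte* data, word32 len,
                          byte digest[WC_SHA3_384_DIGEST_SIZE])
    {
        wc_Sha3 sha;
        int     ret;

        ret = wc_InitSha3_384(&sha, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_Sha3_384_Update(&sha, data, len);
        if (ret == 0)
            ret = wc_Sha3_384_Final(&sha, digest); /* re-inits sha */
        wc_Sha3_384_Free(&sha);
        return ret;
    }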