1 files changed, 4653 insertions, 0 deletions
diff --git a/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c b/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c
new file mode 100644
index 0000000..d0f8a9c
--- /dev/null
+++ b/client/wolfssl/wolfcrypt/src/port/arm/armv8-aes.c
@@ -0,0 +1,4653 @@
+/* armv8-aes.c
+ *
+ * Copyright (C) 2006-2020 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+
+/*
+ * Two versions exist: one for 64-bit (AArch64) and one for 32-bit (AArch32).
+ * If changing one, check the other.
+ */
+
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if !defined(NO_AES) && defined(WOLFSSL_ARMASM)
+
+#include <wolfssl/wolfcrypt/aes.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #define WOLFSSL_MISC_INCLUDED
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef _MSC_VER
+    /* 4127 warning constant while(1)  */
+    #pragma warning(disable: 4127)
+#endif
+
+
+static const byte rcon[] = { /* round constants XORed in by wc_AesSetKey */
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,0x1B, 0x36
+    /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/* get table value from hardware: SBOX(x) runs the four bytes of x through AESE */
+#ifdef __aarch64__
+    #define SBOX(x)                      \
+        do {                             \
+            __asm__ volatile (           \
+                "DUP v1.4s, %w[in]  \n"  /* copy x into all four lanes */ \
+                "MOVI v0.16b, #0     \n" /* zero state */ \
+                "AESE v0.16b, v1.16b \n" /* state ^= v1, sub/shift bytes */ \
+                "UMOV %w[out], v0.s[0] \n" /* lane 0 back into x */ \
+                : [out] "=r"((x))        \
+                : [in] "r" ((x))         \
+                : "cc", "memory", "v0", "v1"\
+            ); \
+        } while(0)
+    /* IMIX(x): apply AESIMC in place to the 16 bytes that x points to */
+    #define IMIX(x) \
+        do {        \
+            __asm__ volatile (             \
+                "LD1 {v0.16b}, [%[in]] \n" /* load 16 bytes at x */ \
+                "AESIMC v0.16b, v0.16b \n" /* inverse mix columns */ \
+                "ST1 {v0.16b}, [%[out]]\n" /* store back, out is tied to in */ \
+                : [out] "=r" ((x))         \
+                : [in] "0" ((x))           \
+                : "cc", "memory", "v0"     \
+            );                             \
+        } while(0)
+#else /* if not defined __aarch64__ then use 32 bit version */
+    #define SBOX(x)                      \
+        do {                             \
+            __asm__ volatile (           \
+                "VDUP.32 q1, %[in]   \n" /* copy x into all four lanes */ \
+                "VMOV.i32 q0, #0     \n" /* zero state */ \
+                "AESE.8 q0, q1      \n" /* state ^= q1, sub/shift bytes */ \
+                "VMOV.32 %[out], d0[0] \n" /* lane 0 back into x */ \
+                : [out] "=r"((x))        \
+                : [in] "r" ((x))         \
+                : "cc", "memory", "q0", "q1"\
+            ); \
+        } while(0)
+    /* IMIX(x): apply AESIMC in place to the 16 bytes that x points to */
+    #define IMIX(x) \
+        do {        \
+            __asm__ volatile (           \
+                "VLD1.32 {q0}, [%[in]] \n" /* load 16 bytes at x */ \
+                "AESIMC.8 q0, q0    \n" /* inverse mix columns */ \
+                "VST1.32 {q0}, [%[out]] \n" /* store back, out is tied to in */ \
+                : [out] "=r" ((x))       \
+                : [in] "0" ((x))         \
+                : "cc", "memory", "q0"   \
+            );                           \
+        } while(0)
+#endif /* aarch64 */
+
+
+#ifdef HAVE_AESGCM
+
+static WC_INLINE void IncrementGcmCounter(byte* inOutCtr)
+{
+    /* big-endian increment limited to the trailing CTR_SZ bytes of the block */
+    int pos = AES_BLOCK_SIZE;
+
+    while (--pos >= AES_BLOCK_SIZE - CTR_SZ) {
+        if (++inOutCtr[pos] != 0)  /* byte did not wrap, so no carry left */
+            return;
+    }
+}
+
+
+static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz)
+{
+    /* sz * 8 kept as two 32-bit words: hiBits receives the three bits that
+     * the multiplication shifts out of the low word */
+    word32 hiBits = (sz >> (8*sizeof(sz) - 3));
+    word32 loBits = sz << 3;
+    int    n;
+
+    /* write both words big-endian: buf[0..3] = hiBits, buf[4..7] = loBits */
+    for (n = 0; n < 4; n++) {
+        const int shift = 8 * (3 - n);
+
+        buf[n]     = (hiBits >> shift) & 0xff;
+        buf[n + 4] = (loBits >> shift) & 0xff;
+    }
+}
+
+#endif /* HAVE_AESGCM */
+
+/* Expands userKey into aes->key the same way as the wolfSSL software key
+ * schedule, with the table look ups replaced by the SBOX()/IMIX() hardware
+ * macros; the decryption schedule is altered to match. Ends by setting the IV. */
+int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
+            const byte* iv, int dir)
+{
+    word32 temp;
+    word32 *rk;
+    unsigned int i = 0; /* indexes rcon[] during expansion */
+
+#if defined(AES_MAX_KEY_SIZE)
+    const word32 max_key_len = (AES_MAX_KEY_SIZE / 8); /* bits to bytes */
+#endif
+
+    if (!((keylen == 16) || (keylen == 24) || (keylen == 32)) ||
+           aes == NULL || userKey == NULL)
+        return BAD_FUNC_ARG; /* only 16, 24 or 32 byte keys are accepted */
+
+    rk = aes->key;
+#if defined(AES_MAX_KEY_SIZE)
+    /* Check key length against the largest size this build allows */
+    if (keylen > max_key_len) {
+        return BAD_FUNC_ARG;
+    }
+#endif
+
+    #ifdef WOLFSSL_AES_COUNTER
+        aes->left = 0; /* discard any unused CTR bytes kept in aes->tmp */
+    #endif /* WOLFSSL_AES_COUNTER */
+
+    aes->rounds = keylen/4 + 6; /* 16/24/32 byte keys give 10/12/14 rounds */
+    XMEMCPY(rk, userKey, keylen); /* user key becomes the first key words */
+
+    switch(keylen)
+    {
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \
+        defined(WOLFSSL_AES_128)
+    case 16:
+        while (1)
+        {
+            temp  = rk[3]; /* last word of the previous round key */
+            SBOX(temp); /* substitute its bytes using the AESE instruction */
+            temp = rotrFixed(temp, 8);
+            rk[4] = rk[0] ^ temp ^ rcon[i];
+            rk[5] = rk[4] ^ rk[1];
+            rk[6] = rk[5] ^ rk[2];
+            rk[7] = rk[6] ^ rk[3];
+            if (++i == 10) /* ten derived round keys for AES-128 */
+                break;
+            rk += 4;
+        }
+        break;
+#endif /* 128 */
+
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 192 && \
+        defined(WOLFSSL_AES_192)
+    case 24:
+        /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */
+        while (1)
+        {
+            temp  = rk[5]; /* last word of the previous six-word group */
+            SBOX(temp); /* substitute its bytes using the AESE instruction */
+            temp = rotrFixed(temp, 8);
+            rk[ 6] = rk[ 0] ^ temp ^ rcon[i];
+            rk[ 7] = rk[ 1] ^ rk[ 6];
+            rk[ 8] = rk[ 2] ^ rk[ 7];
+            rk[ 9] = rk[ 3] ^ rk[ 8];
+            if (++i == 8) /* the final pass only needs four more words */
+                break;
+            rk[10] = rk[ 4] ^ rk[ 9];
+            rk[11] = rk[ 5] ^ rk[10];
+            rk += 6;
+        }
+        break;
+#endif /* 192 */
+
+#if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \
+        defined(WOLFSSL_AES_256)
+    case 32:
+        while (1)
+        {
+            temp  = rk[7]; /* last word of the previous eight-word group */
+            SBOX(temp); /* substitute its bytes using the AESE instruction */
+            temp = rotrFixed(temp, 8);
+            rk[8] = rk[0] ^ temp ^ rcon[i];
+            rk[ 9] = rk[ 1] ^ rk[ 8];
+            rk[10] = rk[ 2] ^ rk[ 9];
+            rk[11] = rk[ 3] ^ rk[10];
+            if (++i == 7) /* the final pass only needs four more words */
+                break;
+            temp  = rk[11];
+            SBOX(temp); /* second half: substitution only, no rotate or rcon */
+            rk[12] = rk[ 4] ^ temp;
+            rk[13] = rk[ 5] ^ rk[12];
+            rk[14] = rk[ 6] ^ rk[13];
+            rk[15] = rk[ 7] ^ rk[14];
+
+            rk += 8;
+        }
+        break;
+#endif /* 256 */
+
+    default: /* also reached for key sizes whose case is compiled out above */
+        return BAD_FUNC_ARG;
+    }
+
+    if (dir == AES_DECRYPTION)
+    {
+#ifdef HAVE_AES_DECRYPT
+        unsigned int j;
+        rk = aes->key;
+
+        /* invert the order of the round keys: */
+        for (i = 0, j = 4* aes->rounds; i < j; i += 4, j -= 4) {
+            temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+            temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+            temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+            temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+        }
+        /* apply the inverse MixColumn transform to all round keys but the
+           first and the last: */
+        for (i = 1; i < aes->rounds; i++) {
+            rk += 4;
+            IMIX(rk); /* AESIMC applied in place to this 16-byte round key */
+        }
+#else
+    WOLFSSL_MSG("AES Decryption not compiled in");
+    return BAD_FUNC_ARG;
+#endif /* HAVE_AES_DECRYPT */
+    }
+
+    return wc_AesSetIV(aes, iv); /* a NULL iv zeroes aes->reg */
+}
+
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+    int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+                        const byte* iv, int dir)
+    {   /* forwards every argument unchanged to wc_AesSetKey */
+        return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+    }
+#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
+
+/* wc_AesSetIV is shared between software and hardware: copies iv into
+ * aes->reg, or zero-fills it when iv is NULL. BAD_FUNC_ARG if aes is NULL. */
+int wc_AesSetIV(Aes* aes, const byte* iv)
+{
+    if (aes != NULL) {
+        if (iv == NULL)
+            XMEMSET(aes->reg,  0, AES_BLOCK_SIZE);
+        else
+            XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE);
+        return 0;
+    }
+    return BAD_FUNC_ARG;
+}
+
+
+#ifdef __aarch64__
+/* AES CCM/GCM use encrypt direct but not decrypt. Encrypts one 16-byte block. */
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
+    defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+    static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+            word32* keyPt = aes->key; /* copy, the asm advances it */
+
+            /*
+              AESE exor's input with round key
+                   shift rows of exor'ed result
+                   sub bytes for shifted rows
+             */
+
+            __asm__ __volatile__ (
+                "LD1 {v0.16b}, [%[CtrIn]] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                /* rounds 1-4 */
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v3.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v4.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                /* rounds 5-8 */
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v3.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v4.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                /* rounds 9-10; the tenth AESE is not followed by AESMC */
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                /* rounds == 10 jumps straight to the final key addition */
+                "#subtract rounds done so far and see if should continue\n"
+                "MOV w12, %w[R]    \n"
+                "SUB w12, w12, #10 \n"
+                "CBZ w12, 1f       \n"
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                /* rounds == 12 is finished here, otherwise two more rounds */
+                "SUB w12, w12, #2 \n"
+                "CBZ w12, 1f      \n"
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                /* Key now points at the last round key */
+                "#Final AddRoundKey then store result \n"
+                "1: \n"
+                "LD1 {v1.2d}, [%[Key]], #16 \n"
+                "EOR v0.16b, v0.16b, v1.16b  \n"
+                "ST1 {v0.16b}, [%[CtrOut]]   \n"
+                /* each input below is tied to the output of the same index */
+                :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+                 "=r" (inBlock)
+                :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+                 [CtrIn] "3" (inBlock)
+                : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
+            );
+
+        return 0;
+    }
+#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+    #ifdef HAVE_AES_DECRYPT
+    static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+            word32* keyPt = aes->key; /* receives the advanced Key register */
+
+            /*
+              AESD exor's input with round key
+                   inverse shift rows of exor'ed result
+                   inverse sub bytes for shifted rows
+             */
+
+            __asm__ __volatile__ (
+                "LD1 {v0.16b}, [%[CtrIn]] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                /* rounds 1-4 */
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v3.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v4.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                /* rounds 5-8 */
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v3.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v4.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                /* rounds 9-10; the tenth AESD is not followed by AESIMC */
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                /* rounds == 10 jumps straight to the final key addition */
+                "#subtract rounds done so far and see if should continue\n"
+                "MOV w12, %w[R]    \n"
+                "SUB w12, w12, #10 \n"
+                "CBZ w12, 1f       \n"
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                /* rounds == 12 is finished here, otherwise two more rounds */
+                "SUB w12, w12, #2  \n"
+                "CBZ w12, 1f       \n"
+                "LD1 {v1.2d-v2.2d}, [%[Key]], #32  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                /* Key now points at the last round key */
+                "#Final AddRoundKey then store result \n"
+                "1: \n"
+                "LD1 {v1.2d}, [%[Key]], #16 \n"
+                "EOR v0.16b, v0.16b, v1.16b  \n"
+                "ST1 {v0.4s}, [%[CtrOut]]    \n"
+                /* each input below is tied to the output of the same index */
+                :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+                 "=r" (inBlock)
+                :[Key] "1" (aes->key), "0" (outBlock), [R] "2" (aes->rounds),
+                 [CtrIn] "3" (inBlock)
+                : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4"
+            );
+
+        return 0;
+}
+    #endif /* HAVE_AES_DECRYPT */
+#endif /* DIRECT or COUNTER */
+
+/* AES-CBC */
+#ifdef HAVE_AES_CBC
+    /* CBC-encrypts the sz / AES_BLOCK_SIZE whole blocks of in into out (any
+     * trailing partial block is ignored), chaining through aes->reg. Returns 0,
+     * or BAD_FUNC_ARG for NULL arguments or an unsupported aes->rounds. */
+    int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+        if (aes == NULL || out == NULL || (in == NULL && sz > 0)) {
+            return BAD_FUNC_ARG;
+        }
+
+        /* do as many block size ops as possible */
+        if (numBlocks > 0) {
+            word32* key = aes->key;
+            word32* reg = aes->reg;
+            /*
+            AESE exor's input with round key, then shift rows and sub bytes.
+            Grouping AESE & AESMC together as pairs reduces latency.
+
+            Every pointer the asm advances (Key, input, out) is an output
+            operand, so the compiler knows the register has been modified.
+            */
+            switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+            case 10: /* AES 128 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+                "LD1 {v9.2d-v11.2d},[%[Key]], #48  \n"
+                "LD1 {v0.2d}, [%[reg]] \n"
+                "LD1 {v12.2d}, [%[input]], #16 \n"
+                "1:\n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "EOR v0.16b, v0.16b, v12.16b \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v3.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v4.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v5.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v6.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v7.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v8.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v9.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v10.16b  \n"
+                "SUB w11, w11, #1 \n"
+                "EOR v0.16b, v0.16b, v11.16b  \n"
+                "ST1 {v0.2d}, [%[out]], #16   \n"
+
+                "CBZ w11, 2f \n"
+                "LD1 {v12.2d}, [%[input]], #16 \n"
+                "B 1b \n"
+                "2:\n"
+                "#store current counter value at the end \n"
+                "ST1 {v0.2d}, [%[regOut]] \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
+                );
+                break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+            case 12: /* AES 192 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+                "LD1 {v9.2d-v12.2d},[%[Key]], #64  \n"
+                "LD1 {v13.2d}, [%[Key]], #16 \n"
+                "LD1 {v0.2d}, [%[reg]] \n"
+                "LD1 {v14.2d}, [%[input]], #16  \n"
+                "1:\n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "EOR v0.16b, v0.16b, v14.16b \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v3.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v4.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v5.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v6.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v7.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v8.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v9.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v10.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v11.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v12.16b \n"
+                "EOR v0.16b, v0.16b, v13.16b  \n"
+                "SUB w11, w11, #1 \n"
+                "ST1 {v0.2d}, [%[out]], #16  \n"
+
+                "CBZ w11, 2f \n"
+                "LD1 {v14.2d}, [%[input]], #16\n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter value at the end \n"
+                "ST1 {v0.2d}, [%[regOut]]   \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+                );
+                break;
+#endif /* WOLFSSL_AES_192*/
+#ifdef WOLFSSL_AES_256
+            case 14: /* AES 256 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d},   [%[Key]], #64 \n"
+                "LD1 {v5.2d-v8.2d},   [%[Key]], #64 \n"
+                "LD1 {v9.2d-v12.2d},  [%[Key]], #64 \n"
+                "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+                "LD1 {v0.2d}, [%[reg]] \n"
+
+                "LD1 {v16.2d}, [%[input]], #16  \n"
+                "1: \n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "EOR v0.16b, v0.16b, v16.16b \n"
+                "AESE v0.16b, v1.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v2.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v3.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v4.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v5.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v6.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v7.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v8.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v9.16b  \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v10.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v11.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v12.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v13.16b \n"
+                "AESMC v0.16b, v0.16b \n"
+                "AESE v0.16b, v14.16b \n"
+                "EOR v0.16b, v0.16b, v15.16b \n"
+                "SUB w11, w11, #1     \n"
+                "ST1 {v0.2d}, [%[out]], #16  \n"
+
+                "CBZ w11, 2f \n"
+                "LD1 {v16.2d}, [%[input]], #16 \n"
+                "B 1b \n"
+
+                "2: \n"
+                "#store current counter value at the end \n"
+                "ST1 {v0.2d}, [%[regOut]]   \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15",
+                "v16"
+                );
+                break;
+#endif /* WOLFSSL_AES_256 */
+            default:
+                WOLFSSL_MSG("Bad AES-CBC round value");
+                return BAD_FUNC_ARG;
+            }
+        }
+
+        return 0;
+    }
+
+    #ifdef HAVE_AES_DECRYPT
+    /* CBC-decrypts sz bytes of in into out, chaining through aes->reg. Returns
+     * 0, or BAD_FUNC_ARG for NULL arguments, a sz that is not a multiple of
+     * AES_BLOCK_SIZE, or an aes->rounds value this build does not support. */
+    int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+        if (aes == NULL || out == NULL || (in == NULL && sz > 0)
+                || sz % AES_BLOCK_SIZE != 0) {
+            return BAD_FUNC_ARG;
+        }
+
+        /* do as many block size ops as possible */
+        if (numBlocks > 0) {
+            word32* key = aes->key; /* advanced by the asm, hence "+r" below */
+            word32* reg = aes->reg;
+
+            switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+            case 10: /* AES 128 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+                "LD1 {v9.2d-v11.2d},[%[Key]], #48  \n"
+                "LD1 {v13.2d}, [%[reg]] \n"
+                "1:\n"
+                "LD1 {v0.2d}, [%[input]], #16  \n"
+                "MOV v12.16b, v0.16b \n" /* keep ciphertext for chaining */
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v3.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v4.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v5.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v6.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v7.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v8.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v9.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v10.16b  \n"
+                "EOR v0.16b, v0.16b, v11.16b \n"
+
+                "EOR v0.16b, v0.16b, v13.16b \n"
+                "SUB w11, w11, #1            \n"
+                "ST1 {v0.2d}, [%[out]], #16  \n"
+                "MOV v13.16b, v12.16b        \n"
+
+                "CBZ w11, 2f \n"
+                "B 1b      \n"
+                "2: \n"
+                "#store current counter value at the end \n"
+                "ST1 {v13.2d}, [%[regOut]] \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
+                );
+                break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+            case 12: /* AES 192 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+                "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+                "LD1 {v9.2d-v12.2d},[%[Key]], #64  \n"
+                "LD1 {v13.16b}, [%[Key]], #16 \n"
+                "LD1 {v15.2d}, [%[reg]]       \n"
+                "LD1 {v0.2d}, [%[input]], #16  \n"
+                "1:    \n"
+                "MOV v14.16b, v0.16b   \n" /* keep ciphertext for chaining */
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v3.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v4.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v5.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v6.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v7.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v8.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v9.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v10.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v11.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v12.16b  \n"
+                "EOR v0.16b, v0.16b, v13.16b \n"
+
+                "EOR v0.16b, v0.16b, v15.16b \n"
+                "SUB w11, w11, #1            \n"
+                "ST1 {v0.2d}, [%[out]], #16  \n"
+                "MOV v15.16b, v14.16b        \n"
+
+                "CBZ w11, 2f \n"
+                "LD1 {v0.2d}, [%[input]], #16 \n"
+                "B 1b \n"
+                "2:\n"
+                "#store current counter value at the end \n"
+                "ST1 {v15.2d}, [%[regOut]] \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+                );
+                break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+            case 14: /* AES 256 BLOCK */
+                __asm__ __volatile__ (
+                "MOV w11, %w[blocks] \n"
+                "LD1 {v1.2d-v4.2d},   [%[Key]], #64  \n"
+                "LD1 {v5.2d-v8.2d},   [%[Key]], #64  \n"
+                "LD1 {v9.2d-v12.2d},  [%[Key]], #64  \n"
+                "LD1 {v13.2d-v15.2d}, [%[Key]], #48  \n"
+                "LD1 {v17.2d}, [%[reg]] \n"
+                "LD1 {v0.2d}, [%[input]], #16  \n"
+                "1:    \n"
+                "MOV v16.16b, v0.16b   \n" /* keep ciphertext for chaining */
+                "AESD v0.16b, v1.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v2.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v3.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v4.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v5.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v6.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v7.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v8.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v9.16b   \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v10.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v11.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v12.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v13.16b  \n"
+                "AESIMC v0.16b, v0.16b \n"
+                "AESD v0.16b, v14.16b  \n"
+                "EOR v0.16b, v0.16b, v15.16b \n"
+
+                "EOR v0.16b, v0.16b, v17.16b \n"
+                "SUB w11, w11, #1            \n"
+                "ST1 {v0.2d}, [%[out]], #16  \n"
+                "MOV v17.16b, v16.16b        \n"
+
+                "CBZ w11, 2f \n"
+                "LD1 {v0.2d}, [%[input]], #16  \n"
+                "B 1b \n"
+                "2:\n"
+                "#store current counter value at the end \n"
+                "ST1 {v17.2d}, [%[regOut]]   \n"
+
+                :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in),
+                 [Key] "+r" (key)
+                :"0" (out), [input] "2" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (reg)
+                : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15",
+                "v16", "v17"
+                );
+                break;
+#endif /* WOLFSSL_AES_256 */
+            default:
+                WOLFSSL_MSG("Bad AES-CBC round value");
+                return BAD_FUNC_ARG;
+            }
+        }
+
+        return 0;
+    }
+    #endif
+
+#endif /* HAVE_AES_CBC */
+
+/* AES-CTR */
+#ifdef WOLFSSL_AES_COUNTER
+
+        /* Add one to the whole block, treated as a big-endian integer */
+        static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+        {
+            byte* cur = inOutCtr + AES_BLOCK_SIZE;
+            /* walk from the last byte towards the first; the carry only
+             * continues while the incremented byte wraps around to zero */
+            while (cur != inOutCtr) {
+                if (++(*--cur) != 0)
+                    return;
+            }
+        }
+
+        /* AES-CTR: XOR sz bytes of in with the key stream and write the
+         * result to out. The same routine encrypts and decrypts.
+         *
+         * aes  AES object; aes->reg holds the current counter block, aes->tmp
+         *      the most recently generated key stream block and aes->left the
+         *      number of not yet used bytes at the end of aes->tmp
+         * out  destination buffer, at least sz bytes
+         * in   source buffer, at least sz bytes
+         * sz   number of bytes to process
+         *
+         * returns 0 on success, BAD_FUNC_ARG when a pointer is NULL or when at
+         * least one whole block has to be processed and aes->rounds is not a
+         * compiled-in value (10, 12 or 14).
+         */
+        int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+        {
+            byte* tmp;
+            word32 numBlocks;
+
+            if (aes == NULL || out == NULL || in == NULL) {
+                return BAD_FUNC_ARG;
+            }
+
+            tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+
+            /* consume any unused bytes left in aes->tmp */
+            while (aes->left && sz) {
+               *(out++) = *(in++) ^ *(tmp++);
+               aes->left--;
+               sz--;
+            }
+
+            /* do as many block size ops as possible */
+            numBlocks = sz/AES_BLOCK_SIZE;
+            sz       -= numBlocks * AES_BLOCK_SIZE;
+
+            /* The assembly below adds 1 only to the low 64 bits of the
+             * big-endian counter (ADD on .2d lanes with the vector {1, 0}) and
+             * never carries into the high 64 bits, unlike
+             * IncrementAesCounter(). To keep both paths consistent the work is
+             * split at the point where the low half wraps to zero and the
+             * carry is applied to the high half here in C. In the common case
+             * the loop body runs exactly once. */
+            while (numBlocks > 0) {
+                /* pointer needed because it is incremented when read, causing
+                 * an issue with call to encrypt/decrypt leftovers */
+                byte*  keyPt  = (byte*)aes->key;
+                byte*  ctr    = (byte*)aes->reg;
+                word32 chunk  = numBlocks; /* blocks handed to the assembly */
+                int    carry  = 0;         /* low 64 bits wrap in this chunk */
+
+                /* numBlocks is below 2^32, so the low 64 bits can only wrap
+                 * when their upper 32 bits are already all ones */
+                if (ctr[8] == 0xFF && ctr[9] == 0xFF && ctr[10] == 0xFF &&
+                        ctr[11] == 0xFF) {
+                    word32 low32 = ((word32)ctr[12] << 24) |
+                                   ((word32)ctr[13] << 16) |
+                                   ((word32)ctr[14] <<  8) |
+                                    (word32)ctr[15];
+                    /* 2^32 - low32; evaluates to 0 when low32 == 0, meaning
+                     * the wrap is 2^32 blocks away and out of reach */
+                    word32 untilWrap = (word32)0 - low32;
+
+                    if (untilWrap != 0 && untilWrap <= numBlocks) {
+                        chunk = untilWrap;
+                        carry = 1;
+                    }
+                }
+
+                switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+                case 10: /* AES 128 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV w11, %w[blocks] \n"
+                    "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+                    "#Create vector with the value 1  \n"
+                    "MOVI v15.16b, #1                 \n"
+                    "USHR v15.2d, v15.2d, #56         \n"
+                    "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+                    "EOR v14.16b, v14.16b, v14.16b    \n"
+                    "EXT v14.16b, v15.16b, v14.16b, #8\n"
+
+                    "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+                    "LD1 {v13.2d}, %[reg]             \n"
+
+                    /* double block */
+                    "1:      \n"
+                    "CMP w11, #1 \n"
+                    "BEQ 2f    \n"
+                    "CMP w11, #0 \n"
+                    "BEQ 3f    \n"
+
+                    "MOV v0.16b, v13.16b  \n"
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v13.16b, v13.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+                    "SUB w11, w11, #2     \n"
+                    "ADD v15.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+                    "ADD v13.2d, v15.2d, v14.2d \n" /* add 1 to counter */
+
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+                    "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v15.16b, v15.16b \n" /* revert from network order */
+                    "REV64 v13.16b, v13.16b \n" /* revert from network order */
+
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v15.16b, v1.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v15.16b, v2.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v15.16b, v3.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v15.16b, v4.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v15.16b, v5.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "AESE v0.16b, v10.16b  \n"
+                    "AESE v15.16b, v6.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "EOR v0.16b, v0.16b, v11.16b \n"
+                    "AESE v15.16b, v7.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "LD1 {v12.2d}, [%[input]], #16  \n"
+                    "AESE v15.16b, v8.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "EOR v0.16b, v0.16b, v12.16b \n"
+                    "AESE v15.16b, v9.16b  \n"
+                    "AESMC v15.16b, v15.16b \n"
+
+                    "LD1 {v12.2d}, [%[input]], #16  \n"
+                    "AESE v15.16b, v10.16b  \n"
+                    "ST1 {v0.2d}, [%[out]], #16  \n"
+                    "EOR v15.16b, v15.16b, v11.16b \n"
+                    "EOR v15.16b, v15.16b, v12.16b \n"
+                    "ST1 {v15.2d}, [%[out]], #16  \n"
+
+                    "B 1b \n"
+
+                    /* single block */
+                    "2: \n"
+                    "MOV v0.16b, v13.16b  \n"
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v13.16b, v13.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "SUB w11, w11, #1     \n"
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v13.16b, v13.16b \n" /* revert from network order */
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v10.16b \n"
+                    "EOR v0.16b, v0.16b, v11.16b \n"
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "LD1 {v12.2d}, [%[input]], #16  \n"
+                    "EOR v0.16b, v0.16b, v12.16b \n"
+                    "ST1 {v0.2d}, [%[out]], #16  \n"
+
+                    "3: \n"
+                    "#store current counter value at the end \n"
+                    "ST1 {v13.2d}, %[regOut]   \n"
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (chunk), [reg] "m" (aes->reg)
+                    : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                    "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15"
+                    );
+                    break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+                case 12: /* AES 192 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV w11, %w[blocks]              \n"
+                    "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+                    "#Create vector with the value 1  \n"
+                    "MOVI v16.16b, #1                 \n"
+                    "USHR v16.2d, v16.2d, #56         \n"
+                    "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+                    "EOR v14.16b, v14.16b, v14.16b    \n"
+                    "EXT v16.16b, v16.16b, v14.16b, #8\n"
+
+                    "LD1 {v9.2d-v12.2d}, [%[Key]], #64\n"
+                    "LD1 {v15.2d}, %[reg]             \n"
+                    "LD1 {v13.16b}, [%[Key]], #16     \n"
+
+                    /* double block */
+                    "1:      \n"
+                    "CMP w11, #1 \n"
+                    "BEQ 2f    \n"
+                    "CMP w11, #0 \n"
+                    "BEQ 3f    \n"
+
+                    "MOV v0.16b, v15.16b  \n"
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v15.16b, v15.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+                    "SUB w11, w11, #2     \n"
+                    "ADD v17.2d, v15.2d, v16.2d \n" /* add 1 to counter */
+                    "ADD v15.2d, v17.2d, v16.2d \n" /* add 1 to counter */
+
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+                    "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v17.16b, v17.16b \n" /* revert from network order */
+                    "REV64 v15.16b, v15.16b \n" /* revert from network order */
+
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v1.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v2.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v3.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v4.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v5.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v10.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v6.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v11.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v17.16b, v7.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "AESE v0.16b, v12.16b  \n"
+                    "AESE v17.16b, v8.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "EOR v0.16b, v0.16b, v13.16b \n"
+                    "AESE v17.16b, v9.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "LD1 {v14.2d}, [%[input]], #16  \n"
+                    "AESE v17.16b, v10.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "EOR v0.16b, v0.16b, v14.16b \n"
+                    "AESE v17.16b, v11.16b  \n"
+                    "AESMC v17.16b, v17.16b \n"
+
+                    "LD1 {v14.2d}, [%[input]], #16  \n"
+                    "AESE v17.16b, v12.16b  \n"
+                    "ST1 {v0.2d}, [%[out]], #16  \n"
+                    "EOR v17.16b, v17.16b, v13.16b \n"
+                    "EOR v17.16b, v17.16b, v14.16b \n"
+                    "ST1 {v17.2d}, [%[out]], #16  \n"
+
+                    "B 1b \n"
+
+                    "2:      \n"
+                    "LD1 {v14.2d}, [%[input]], #16    \n"
+                    "MOV v0.16b, v15.16b  \n"
+
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v15.16b, v15.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "ADD v15.2d, v15.2d, v16.2d \n" /* add 1 to counter */
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "SUB w11, w11, #1     \n"
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v15.16b, v15.16b, v15.16b, #8 \n"
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v15.16b, v15.16b \n" /* revert from network order */
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v10.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v11.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v12.16b \n"
+                    "EOR v0.16b, v0.16b, v13.16b \n"
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "EOR v0.16b, v0.16b, v14.16b \n"
+                    "ST1 {v0.2d}, [%[out]], #16  \n"
+
+                    "3: \n"
+                    "#store current counter value at the end \n"
+                    "ST1 {v15.2d}, %[regOut] \n"
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (chunk), [reg] "m" (aes->reg)
+                    : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                    "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
+                    "v16", "v17"
+                    );
+                    break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+                case 14: /* AES 256 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV w11, %w[blocks] \n"
+                    "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+
+                    "#Create vector with the value 1  \n"
+                    "MOVI v18.16b, #1                 \n"
+                    "USHR v18.2d, v18.2d, #56         \n"
+                    "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+                    "EOR v19.16b, v19.16b, v19.16b    \n"
+                    "EXT v18.16b, v18.16b, v19.16b, #8\n"
+
+                    "LD1 {v9.2d-v12.2d}, [%[Key]], #64  \n"
+                    "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+                    "LD1 {v17.2d}, %[reg]               \n"
+
+                    /* double block */
+                    "1:      \n"
+                    "CMP w11, #1 \n"
+                    "BEQ 2f    \n"
+                    "CMP w11, #0 \n"
+                    "BEQ 3f    \n"
+
+                    "MOV v0.16b, v17.16b  \n"
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v17.16b, v17.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+                    "SUB w11, w11, #2     \n"
+                    "ADD v19.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+                    "ADD v17.2d, v19.2d, v18.2d \n" /* add 1 to counter */
+
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v19.16b, v19.16b, v19.16b, #8 \n"
+                    "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v19.16b, v19.16b \n" /* revert from network order */
+                    "REV64 v17.16b, v17.16b \n" /* revert from network order */
+
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v1.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v2.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v3.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v4.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v5.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v10.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v6.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v11.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v7.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v12.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v8.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v13.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v19.16b, v9.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "AESE v0.16b, v14.16b  \n"
+                    "AESE v19.16b, v10.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "EOR v0.16b, v0.16b, v15.16b \n"
+                    "AESE v19.16b, v11.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "LD1 {v16.2d}, [%[input]], #16 \n"
+                    "AESE v19.16b, v12.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "EOR v0.16b, v0.16b, v16.16b \n"
+                    "AESE v19.16b, v13.16b  \n"
+                    "AESMC v19.16b, v19.16b \n"
+
+                    "LD1 {v16.2d}, [%[input]], #16 \n"
+                    "AESE v19.16b, v14.16b  \n"
+                    "ST1 {v0.2d}, [%[out]], #16  \n"
+                    "EOR v19.16b, v19.16b, v15.16b \n"
+                    "EOR v19.16b, v19.16b, v16.16b \n"
+                    "ST1 {v19.2d}, [%[out]], #16  \n"
+
+                    "B 1b \n"
+
+                    "2:      \n"
+                    "LD1 {v16.2d}, [%[input]], #16 \n"
+                    "MOV v0.16b, v17.16b  \n"
+                    "AESE v0.16b, v1.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v17.16b, v17.16b \n" /* network order */
+                    "AESE v0.16b, v2.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+                    "AESE v0.16b, v3.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+                    "AESE v0.16b, v4.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v5.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+                    "AESE v0.16b, v6.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "REV64 v17.16b, v17.16b \n" /* revert from network order */
+                    "AESE v0.16b, v7.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v8.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v9.16b  \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v10.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v11.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v12.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v13.16b \n"
+                    "AESMC v0.16b, v0.16b \n"
+                    "AESE v0.16b, v14.16b \n"
+                    "EOR v0.16b, v0.16b, v15.16b \n"
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "EOR v0.16b, v0.16b, v16.16b \n"
+                    "ST1 {v0.2d}, [%[out]], #16 \n"
+
+                    "3: \n"
+                    "#store current counter value at the end \n"
+                    "ST1 {v17.2d}, %[regOut] \n"
+
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (chunk), [reg] "m" (aes->reg)
+                    : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+                    "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
+                    "v16", "v17", "v18", "v19"
+                    );
+                    break;
+#endif /* WOLFSSL_AES_256 */
+                default:
+                    WOLFSSL_MSG("Bad AES-CTR round value");
+                    return BAD_FUNC_ARG;
+                }
+
+                if (carry) {
+                    int i;
+
+                    /* the low 64 bits (bytes 8..15) are now zero; propagate
+                     * the carry the assembly dropped into bytes 7..0 */
+                    for (i = (AES_BLOCK_SIZE / 2) - 1; i >= 0; i--) {
+                        if (++ctr[i])
+                            break;
+                    }
+                }
+
+                numBlocks -= chunk;
+                aes->left  = 0;
+            }
+
+            /* handle non block size remaining */
+            if (sz) {
+                /* generate one more key stream block into aes->tmp and
+                 * advance the counter; the unused tail is kept for the next
+                 * call via aes->left */
+                wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp);
+                IncrementAesCounter((byte*)aes->reg);
+
+                aes->left = AES_BLOCK_SIZE;
+                tmp = (byte*)aes->tmp;
+
+                while (sz--) {
+                    *(out++) = *(in++) ^ *(tmp++);
+                    aes->left--;
+                }
+            }
+            return 0;
+        }
+
+#endif /* WOLFSSL_AES_COUNTER */
+
+#ifdef HAVE_AESGCM
+
+/*
+ * Based on the GCM implementation in wolfcrypt/src/aes.c
+ */
+
+/* PMULL and RBIT only with AArch64 */
+/* Use ARM hardware for polynomial multiply */
+/* Multiply the 16-byte block X by the 16-byte block Y using carry-less
+ * (polynomial) multiplication followed by a reduction, and store the 16-byte
+ * result back over X.
+ *
+ * X  in/out: block to multiply; it is bit-reversed per byte (RBIT) on load
+ *    and the result is bit-reversed again before being written back
+ * Y  in: second operand, only read; per the note below it is expected to be
+ *    stored already bit-reflected by the set key code
+ */
+static void GMULT(byte* X, byte* Y)
+{
+    __asm__ volatile (
+        "LD1 {v0.16b}, [%[inX]] \n"
+        "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */
+        "RBIT v0.16b, v0.16b \n"
+
+
+        /* Algorithm 1 from Intel GCM white paper.
+           "Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
+         */
+        "PMULL  v3.1q, v0.1d, v1.1d \n"     /* a0 * b0 = C */
+        "PMULL2 v4.1q, v0.2d, v1.2d \n"     /* a1 * b1 = D */
+        "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */
+        "PMULL  v6.1q, v0.1d, v5.1d \n"     /* a0 * b1 = E */
+        "PMULL2 v5.1q, v0.2d, v5.2d \n"     /* a1 * b0 = F */
+
+        "#Set a register to all 0s using EOR \n"
+        "EOR v7.16b, v7.16b, v7.16b \n"
+        "EOR v5.16b, v5.16b, v6.16b \n"     /* F ^ E */
+        "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */
+        "EOR v3.16b, v3.16b, v6.16b \n"     /* low 128 bits in v3 */
+        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */
+        "EOR v4.16b, v4.16b, v6.16b \n"     /* high 128 bits in v4 */
+
+
+        /* Based on the white paper "Implementing GCM on ARMv8"
+           by Conrado P.L. Gouvea and Julio Lopez:
+           reduction of the 256-bit value (v4:v3) using Algorithm 5 */
+        "MOVI v8.16b, #0x87 \n"
+        "USHR v8.2d, v8.2d, #56 \n"
+        /* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/
+        "PMULL2 v5.1q, v4.2d, v8.2d \n"
+        "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */
+        "EOR v4.16b, v4.16b, v6.16b \n"
+        "EXT v6.16b, v7.16b, v5.16b, #8 \n"
+        "EOR v3.16b, v3.16b, v6.16b \n"
+        "PMULL v5.1q, v4.1d, v8.1d  \n"
+        "EOR v4.16b, v3.16b, v5.16b \n"
+
+        /* undo the bit reflection and write the result over X; [out] is tied
+           to the same register as [inX] by the constraints below */
+        "RBIT v4.16b, v4.16b \n"
+        "STR q4, [%[out]] \n"
+        : [out] "=r" (X), "=r" (Y)
+        : [inX] "0" (X), [inY] "1" (Y)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"
+    );
+}
+
+
+/* Run the GHASH accumulation over the additional data a and the ciphertext c
+ * with the hash key aes->H, then fold in the two bit lengths.
+ *
+ * aes  AES object supplying the hash key aes->H
+ * a    additional authenticated data, skipped when NULL or aSz is 0
+ * aSz  length of a in bytes
+ * c    ciphertext, skipped when NULL or cSz is 0
+ * cSz  length of c in bytes
+ * s    output buffer receiving the first sSz bytes of the accumulator
+ * sSz  number of bytes to copy out; the accumulator is AES_BLOCK_SIZE bytes,
+ *      so this should not exceed AES_BLOCK_SIZE
+ *
+ * The multiplication by H that would normally follow the length block is
+ * NOT performed here; the value written to s is the accumulator right after
+ * the lengths were XORed in.
+ */
+void GHASH(Aes* aes, const byte* a, word32 aSz,
+                                const byte* c, word32 cSz, byte* s, word32 sSz)
+{
+    byte   acc[AES_BLOCK_SIZE];    /* running hash value */
+    byte   block[AES_BLOCK_SIZE];  /* padded tail / length block */
+    byte*  hashKey = aes->H;
+    word32 whole;                  /* number of complete blocks */
+    word32 tail;                   /* bytes in the trailing short block */
+    word32 i;
+
+    XMEMSET(acc, 0, AES_BLOCK_SIZE);
+
+    /* absorb the additional authentication data */
+    if (a != NULL && aSz != 0) {
+        whole = aSz / AES_BLOCK_SIZE;
+        tail  = aSz % AES_BLOCK_SIZE;
+
+        for (i = 0; i < whole; i++) {
+            xorbuf(acc, a, AES_BLOCK_SIZE);
+            GMULT(acc, hashKey);
+            a += AES_BLOCK_SIZE;
+        }
+        if (tail > 0) {
+            /* zero-pad the final short block before mixing it in */
+            XMEMSET(block, 0, AES_BLOCK_SIZE);
+            XMEMCPY(block, a, tail);
+            xorbuf(acc, block, AES_BLOCK_SIZE);
+            GMULT(acc, hashKey);
+        }
+    }
+
+    /* absorb the ciphertext */
+    if (c != NULL && cSz != 0) {
+        whole = cSz / AES_BLOCK_SIZE;
+        tail  = cSz % AES_BLOCK_SIZE;
+
+        for (i = 0; i < whole; i++) {
+            xorbuf(acc, c, AES_BLOCK_SIZE);
+            GMULT(acc, hashKey);
+            c += AES_BLOCK_SIZE;
+        }
+        if (tail > 0) {
+            /* zero-pad the final short block before mixing it in */
+            XMEMSET(block, 0, AES_BLOCK_SIZE);
+            XMEMCPY(block, c, tail);
+            xorbuf(acc, block, AES_BLOCK_SIZE);
+            GMULT(acc, hashKey);
+        }
+    }
+
+    /* length block: size of a in bits, then size of c in bits */
+    FlattenSzInBits(&block[0], aSz);
+    FlattenSzInBits(&block[8], cSz);
+    xorbuf(acc, block, AES_BLOCK_SIZE);
+
+    /* hand back the accumulator without the last GMULT */
+    XMEMCPY(s, acc, sSz);
+}
+
+
+#ifdef WOLFSSL_AES_128
+/* internal function : see wc_AesGcmEncrypt */
+static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks;
+    word32 partial;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte x[AES_BLOCK_SIZE];
+    byte scratch[AES_BLOCK_SIZE];
+
+    /* Noticed different optimization levels treated head of array different.
+       Some cases was stack pointer plus offset others was a regester containing
+       address. To make uniform for passing in to inline assembly code am using
+       pointers to the head of each local array.
+     */
+    byte* ctr  = counter;
+    byte* iCtr = initialCounter;
+    byte* xPt  = x;
+    byte* sPt  = scratch;
+    byte* keyPt; /* pointer to handle pointer advencment */
+
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+        GMULT(initialCounter, aes->H);
+    }
+    XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+
+    /* Hash in the Additional Authentication Data */
+    XMEMSET(x, 0, AES_BLOCK_SIZE);
+    if (authInSz != 0 && authIn != NULL) {
+        blocks = authInSz / AES_BLOCK_SIZE;
+        partial = authInSz % AES_BLOCK_SIZE;
+        /* do as many blocks as possible */
+        while (blocks--) {
+            xorbuf(x, authIn, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+            authIn += AES_BLOCK_SIZE;
+        }
+        if (partial != 0) {
+            XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+            XMEMCPY(scratch, authIn, partial);
+            xorbuf(x, scratch, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+        }
+    }
+
+    /* do as many blocks as possible */
+    blocks = sz / AES_BLOCK_SIZE;
+    partial = sz % AES_BLOCK_SIZE;
+    if (blocks > 0) {
+        keyPt  = (byte*)aes->key;
+        __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v13.2d}, [%[ctr]] \n"
+
+            "#Create vector with the value 1  \n"
+            "MOVI v14.16b, #1                 \n"
+            "USHR v14.2d, v14.2d, #56         \n"
+            "EOR v22.16b, v22.16b, v22.16b    \n"
+            "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+
+            /***************************************************
+               Get first out block for GHASH using AES encrypt
+             ***************************************************/
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+            "MOV v0.16b, v13.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v16.2d}, %[inY] \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "MOVI v23.16b, #0x87 \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "USHR v23.2d, v23.2d, #56 \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b \n"
+            "EOR v0.16b, v0.16b, v11.16b \n"
+
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block jump to final GHASH */
+
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+
+            /***************************************************
+               Interweave GHASH and encrypt if more then 1 block
+             ***************************************************/
+            "2: \n"
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "MOV v0.16b, v13.16b  \n"
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "AESE v0.16b, v10.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v0.16b, v0.16b, v11.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "MOV v15.16b, v0.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+
+            "CBZ w11, 1f \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "B 2b \n"
+
+            /***************************************************
+               GHASH on last block
+             ***************************************************/
+            "1: \n"
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+            "#store current AES counter value \n"
+            "ST1 {v13.2d}, [%[ctrOut]] \n"
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+
+            "#Reduce product from multiplication \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+            "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+            :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+            ,[xOut] "=r" (xPt),"=m" (aes->H)
+            :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (in)
+            ,[inX] "4" (xPt), [inY] "m" (aes->H)
+            : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+            ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
+        );
+    }
+
+    /* take care of partial block sizes leftover */
+    if (partial != 0) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, in, partial);
+        XMEMCPY(out, scratch, partial);
+
+        XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+        XMEMCPY(scratch, out, partial);
+        xorbuf(x, scratch, AES_BLOCK_SIZE);
+        GMULT(x, aes->H);
+    }
+
+    /* Hash in the lengths of A and C in bits */
+    XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+    FlattenSzInBits(&scratch[0], authInSz);
+    FlattenSzInBits(&scratch[8], sz);
+    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+    keyPt  = (byte*)aes->key;
+    __asm__ __volatile__ (
+
+        "LD1 {v16.16b}, [%[tag]] \n"
+        "LD1 {v17.16b}, %[h] \n"
+        "RBIT v16.16b, v16.16b \n"
+
+        "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+        "PMULL  v18.1q, v16.1d, v17.1d \n"     /* a0 * b0 = C */
+        "PMULL2 v19.1q, v16.2d, v17.2d \n"     /* a1 * b1 = D */
+        "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+        "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+        "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+        "PMULL  v21.1q, v16.1d, v20.1d \n"     /* a0 * b1 = E */
+        "PMULL2 v20.1q, v16.2d, v20.2d \n"     /* a1 * b0 = F */
+        "LD1 {v0.2d}, [%[ctr]]             \n"
+
+        "#Set a register to all 0s using EOR \n"
+        "EOR v22.16b, v22.16b, v22.16b \n"
+        "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+        "AESE v0.16b, v1.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+        "AESE v0.16b, v2.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+        "AESE v0.16b, v3.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+        "MOVI v23.16b, #0x87 \n"
+        "AESE v0.16b, v4.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "USHR v23.2d, v23.2d, #56 \n"
+        "PMULL2 v20.1q, v19.2d, v23.2d \n"
+        "AESE v0.16b, v5.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+        "AESE v0.16b, v6.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"
+        "AESE v0.16b, v7.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+        "AESE v0.16b, v8.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"
+        "AESE v0.16b, v9.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "PMULL v20.1q, v19.1d, v23.1d  \n"
+        "EOR v19.16b, v18.16b, v20.16b \n"
+        "AESE v0.16b, v10.16b \n"
+        "RBIT v19.16b, v19.16b \n"
+        "EOR v0.16b, v0.16b, v11.16b \n"
+        "EOR v19.16b, v19.16b, v0.16b \n"
+        "STR q19, [%[out]] \n"
+
+        :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+        :[tag] "0" (sPt), [Key] "1" (keyPt),
+        [ctr] "2" (iCtr) , [h] "m" (aes->H)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14",
+        "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24"
+    );
+
+
+    if (authTagSz > AES_BLOCK_SIZE) {
+        XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
+    }
+    else {
+        /* authTagSz can be smaller than AES_BLOCK_SIZE */
+        XMEMCPY(authTag, scratch, authTagSz);
+    }
+    return 0;
+}
+#endif /* WOLFSSL_AES_128 */
+
+#ifdef WOLFSSL_AES_192
+/* AES-192-GCM encrypt built on ARMv8 AESE/AESMC and PMULL instructions.
+ * Internal function : see wc_AesGcmEncrypt
+ *
+ * aes      round keys (208 bytes read from aes->key) and GHASH key (aes->H)
+ * out      receives sz bytes of cipher text
+ * in       sz bytes of plain text
+ * iv       ivSz-byte nonce; used as-is when ivSz == GCM_NONCE_MID_SZ, else
+ *          the initial counter is derived from it with GHASH and GMULT
+ * authTag  receives min(authTagSz, AES_BLOCK_SIZE) bytes of tag
+ * authIn   additional data, hashed only if non-NULL and authInSz != 0
+ *
+ * returns 0; no failure path exists in this function
+ */
+static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks;
+    word32 partial;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte x[AES_BLOCK_SIZE];
+    byte scratch[AES_BLOCK_SIZE];
+
+    /* Different optimization levels addressed the local arrays differently
+       (stack pointer plus offset vs. a register holding the address).  Plain
+       pointers to each array keep the inline-assembly operands uniform. */
+    byte* ctr  = counter;
+    byte* iCtr = initialCounter;
+    byte* xPt  = x;
+    byte* sPt  = scratch;
+    byte* keyPt; /* walks the key schedule; the asm post-increments it */
+
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+        GMULT(initialCounter, aes->H);
+    }
+    XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+    /* Hash in the Additional Authentication Data */
+    XMEMSET(x, 0, AES_BLOCK_SIZE);
+    if (authInSz != 0 && authIn != NULL) {
+        blocks = authInSz / AES_BLOCK_SIZE;
+        partial = authInSz % AES_BLOCK_SIZE;
+        /* do as many blocks as possible */
+        while (blocks--) {
+            xorbuf(x, authIn, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+            authIn += AES_BLOCK_SIZE;
+        }
+        if (partial != 0) {
+            XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+            XMEMCPY(scratch, authIn, partial);
+            xorbuf(x, scratch, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+        }
+    }
+
+    /* do as many blocks as possible */
+    blocks = sz / AES_BLOCK_SIZE;
+    partial = sz % AES_BLOCK_SIZE;
+    if (blocks > 0) {
+        keyPt  = (byte*)aes->key;
+        __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v13.2d}, [%[ctr]] \n"
+            "#Create vector with the value 1  \n"
+            "MOVI v14.16b, #1                 \n"
+            "USHR v14.2d, v14.2d, #56         \n"
+            "EOR v22.16b, v22.16b, v22.16b    \n"
+            "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+            /* Get first out block for GHASH using AES encrypt */
+            /* NOTE(review): the counter ADD below (and in the loop) works on a
+               64-bit lane while the partial-block path further down calls
+               IncrementGcmCounter(); confirm both wrap the same way. */
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+            "MOV v0.16b, v13.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v16.2d}, %[inY] \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+            "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "MOVI v23.16b, #0x87 \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "USHR v23.2d, v23.2d, #56 \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v30.16b \n"
+            "EOR v0.16b, v0.16b, v31.16b \n"
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block jump to final GHASH */
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+
+            /* Interweave GHASH and encrypt if more than 1 block */
+            "2: \n"
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "MOV v0.16b, v13.16b  \n"
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v18 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v19 */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "AESE v0.16b, v30.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v0.16b, v0.16b, v31.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "MOV v15.16b, v0.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+
+            "CBZ w11, 1f \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "B 2b \n"
+
+            /* GHASH on last block */
+            "1: \n"
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+            "#store current AES counter value \n"
+            "ST1 {v13.2d}, [%[ctrOut]] \n"
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v18 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v19 */
+
+            "#Reduce product from multiplication \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+            "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+            :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+            ,[xOut] "=r" (xPt),"=m" (aes->H)
+            :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (in)
+            ,[inX] "4" (xPt), [inY] "m" (aes->H)
+            /* "memory": the ST1/STR above write through out, ctrOut and xOut,
+               which no output operand describes to the compiler */
+            : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+            ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+            "v24","v25","v26","v27","v28","v29","v30","v31"
+        );
+    }
+
+    /* take care of partial block sizes leftover */
+    if (partial != 0) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, in, partial);
+        XMEMCPY(out, scratch, partial);
+
+        XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+        XMEMCPY(scratch, out, partial);
+        xorbuf(x, scratch, AES_BLOCK_SIZE);
+        GMULT(x, aes->H);
+    }
+
+    /* Hash in the lengths of A and C in bits */
+    XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+    FlattenSzInBits(&scratch[0], authInSz);
+    FlattenSzInBits(&scratch[8], sz);
+    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+    keyPt  = (byte*)aes->key;
+    __asm__ __volatile__ (
+        "LD1 {v16.16b}, [%[tag]] \n"
+        "LD1 {v17.16b}, %[h] \n"
+        "RBIT v16.16b, v16.16b \n"
+
+        "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+        "PMULL  v18.1q, v16.1d, v17.1d \n"     /* a0 * b0 = C */
+        "PMULL2 v19.1q, v16.2d, v17.2d \n"     /* a1 * b1 = D */
+        "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+        "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+        "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+        "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n"
+        "PMULL  v21.1q, v16.1d, v20.1d \n"     /* a0 * b1 = E */
+        "PMULL2 v20.1q, v16.2d, v20.2d \n"     /* a1 * b0 = F */
+        "LD1 {v0.2d}, [%[ctr]]             \n"
+
+        "#Set a register to all 0s using EOR \n"
+        "EOR v22.16b, v22.16b, v22.16b \n"
+        "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+        "AESE v0.16b, v1.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+        "AESE v0.16b, v2.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v18 */
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+        "AESE v0.16b, v3.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v19 */
+        "MOVI v23.16b, #0x87 \n"
+        "AESE v0.16b, v4.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "USHR v23.2d, v23.2d, #56 \n"
+        "PMULL2 v20.1q, v19.2d, v23.2d \n"
+        "AESE v0.16b, v5.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+        "AESE v0.16b, v6.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"
+        "AESE v0.16b, v7.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+        "AESE v0.16b, v8.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"
+        "AESE v0.16b, v9.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v10.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v11.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "PMULL v20.1q, v19.1d, v23.1d  \n"
+        "EOR v19.16b, v18.16b, v20.16b \n"
+        "AESE v0.16b, v30.16b \n"
+        "RBIT v19.16b, v19.16b \n"
+        "EOR v0.16b, v0.16b, v31.16b \n"
+        "EOR v19.16b, v19.16b, v0.16b \n"
+        "STR q19, [%[out]] \n"
+
+        :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+        :[tag] "0" (sPt), [Key] "1" (keyPt),
+        [ctr] "2" (iCtr) , [h] "m" (aes->H)
+        /* v30/v31 hold the last two round keys loaded above, so they must be
+           listed as clobbered along with the rest */
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
+        "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24","v30","v31"
+    );
+
+    /* authTagSz can be smaller than AES_BLOCK_SIZE; copy at most one block */
+    XMEMCPY(authTag, scratch,
+            (authTagSz > AES_BLOCK_SIZE) ? AES_BLOCK_SIZE : authTagSz);
+
+    return 0;
+}
+#endif /* WOLFSSL_AES_192 */
+
+#ifdef WOLFSSL_AES_256
+/* internal function : see wc_AesGcmEncrypt */
+static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks;
+    word32 partial;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte x[AES_BLOCK_SIZE];
+    byte scratch[AES_BLOCK_SIZE];
+
+    /* Noticed different optimization levels treated head of array different.
+       Some cases was stack pointer plus offset others was a regester containing
+       address. To make uniform for passing in to inline assembly code am using
+       pointers to the head of each local array.
+     */
+    byte* ctr  = counter;
+    byte* iCtr = initialCounter;
+    byte* xPt  = x;
+    byte* sPt  = scratch;
+    byte* keyPt; /* pointer to handle pointer advencment */
+
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+        GMULT(initialCounter, aes->H);
+    }
+    XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+
+    /* Hash in the Additional Authentication Data */
+    XMEMSET(x, 0, AES_BLOCK_SIZE);
+    if (authInSz != 0 && authIn != NULL) {
+        blocks = authInSz / AES_BLOCK_SIZE;
+        partial = authInSz % AES_BLOCK_SIZE;
+        /* do as many blocks as possible */
+        while (blocks--) {
+            xorbuf(x, authIn, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+            authIn += AES_BLOCK_SIZE;
+        }
+        if (partial != 0) {
+            XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+            XMEMCPY(scratch, authIn, partial);
+            xorbuf(x, scratch, AES_BLOCK_SIZE);
+            GMULT(x, aes->H);
+        }
+    }
+
+    /* do as many blocks as possible */
+    blocks = sz / AES_BLOCK_SIZE;
+    partial = sz % AES_BLOCK_SIZE;
+    if (blocks > 0) {
+        keyPt  = (byte*)aes->key;
+        __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v13.2d}, [%[ctr]] \n"
+
+            "#Create vector with the value 1  \n"
+            "MOVI v14.16b, #1                 \n"
+            "USHR v14.2d, v14.2d, #56         \n"
+            "EOR v22.16b, v22.16b, v22.16b    \n"
+            "EXT v14.16b, v14.16b, v22.16b, #8\n"
+
+
+            /***************************************************
+               Get first out block for GHASH using AES encrypt
+             ***************************************************/
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+            "MOV v0.16b, v13.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v16.2d}, %[inY] \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+            "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "MOVI v23.16b, #0x87 \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "USHR v23.2d, v23.2d, #56 \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v28.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v29.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v30.16b \n"
+            "EOR v0.16b, v0.16b, v31.16b \n"
+
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "MOV v15.16b, v0.16b \n"
+
+            "CBZ w11, 1f \n" /* only one block jump to final GHASH */
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+
+            /***************************************************
+               Interweave GHASH and encrypt if more then 1 block
+             ***************************************************/
+            "2: \n"
+            "REV64 v13.16b, v13.16b \n" /* network order */
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "ADD v13.2d, v13.2d, v14.2d \n" /* add 1 to counter */
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */
+            "EXT v13.16b, v13.16b, v13.16b, #8 \n"
+            "REV64 v13.16b, v13.16b \n" /* revert from network order */
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "MOV v0.16b, v13.16b  \n"
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v28.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v29.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "AESE v0.16b, v30.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v0.16b, v0.16b, v31.16b \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+
+            "EOR v0.16b, v0.16b, v12.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "MOV v15.16b, v0.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+
+            "CBZ w11, 1f \n"
+            "LD1 {v12.2d}, [%[input]], #16 \n"
+            "B 2b \n"
+
+            /***************************************************
+               GHASH on last block
+             ***************************************************/
+            "1: \n"
+            "EOR v15.16b, v17.16b, v15.16b \n"
+            "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */
+
+            "#store current AES counter value \n"
+            "ST1 {v13.2d}, [%[ctrOut]] \n"
+            "PMULL  v18.1q, v15.1d, v16.1d \n"     /* a0 * b0 = C */
+            "PMULL2 v19.1q, v15.2d, v16.2d \n"     /* a1 * b1 = D */
+            "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */
+            "PMULL  v21.1q, v15.1d, v20.1d \n"     /* a0 * b1 = E */
+            "PMULL2 v20.1q, v15.2d, v20.2d \n"     /* a1 * b0 = F */
+            "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+            "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+            "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+
+            "#Reduce product from multiplication \n"
+            "PMULL2 v20.1q, v19.2d, v23.2d \n"
+            "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */
+            "EOR v19.16b, v19.16b, v21.16b \n"
+            "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+            "EOR v18.16b, v18.16b, v21.16b \n"
+            "PMULL v20.1q, v19.1d, v23.1d  \n"
+            "EOR v19.16b, v18.16b, v20.16b \n"
+            "RBIT v17.16b, v19.16b \n"
+            "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */
+
+            :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in)
+            ,[xOut] "=r" (xPt),"=m" (aes->H)
+            :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (in)
+            ,[inX] "4" (xPt), [inY] "m" (aes->H)
+            : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+            ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
+        );
+    }
+
+    /* take care of partial block sizes leftover */
+    if (partial != 0) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, in, partial);
+        XMEMCPY(out, scratch, partial);
+
+        XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+        XMEMCPY(scratch, out, partial);
+        xorbuf(x, scratch, AES_BLOCK_SIZE);
+        GMULT(x, aes->H);
+    }
+
+    /* Hash in the lengths of A and C in bits */
+    XMEMSET(scratch, 0, AES_BLOCK_SIZE);
+    FlattenSzInBits(&scratch[0], authInSz);
+    FlattenSzInBits(&scratch[8], sz);
+    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    XMEMCPY(scratch, x, AES_BLOCK_SIZE);
+
+    keyPt  = (byte*)aes->key;
+    __asm__ __volatile__ (
+
+        "LD1 {v16.16b}, [%[tag]] \n"
+        "LD1 {v17.16b}, %[h] \n"
+        "RBIT v16.16b, v16.16b \n"
+
+        "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n"
+        "PMULL  v18.1q, v16.1d, v17.1d \n"     /* a0 * b0 = C */
+        "PMULL2 v19.1q, v16.2d, v17.2d \n"     /* a1 * b1 = D */
+        "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n"
+        "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */
+        "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
+        "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n"
+        "PMULL  v21.1q, v16.1d, v20.1d \n"     /* a0 * b1 = E */
+        "PMULL2 v20.1q, v16.2d, v20.2d \n"     /* a1 * b0 = F */
+        "LD1 {v0.2d}, [%[ctr]]             \n"
+
+        "#Set a register to all 0s using EOR \n"
+        "EOR v22.16b, v22.16b, v22.16b \n"
+        "EOR v20.16b, v20.16b, v21.16b \n"     /* F ^ E */
+        "AESE v0.16b, v1.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */
+        "AESE v0.16b, v2.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"     /* low 128 bits in v3 */
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */
+        "AESE v0.16b, v3.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"     /* high 128 bits in v4 */
+        "MOVI v23.16b, #0x87 \n"
+        "AESE v0.16b, v4.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "USHR v23.2d, v23.2d, #56 \n"
+        "PMULL2 v20.1q, v19.2d, v23.2d \n"
+        "AESE v0.16b, v5.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v20.16b, v22.16b, #8 \n"
+        "AESE v0.16b, v6.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v19.16b, v19.16b, v21.16b \n"
+        "AESE v0.16b, v7.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EXT v21.16b, v22.16b, v20.16b, #8 \n"
+        "AESE v0.16b, v8.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "EOR v18.16b, v18.16b, v21.16b \n"
+        "AESE v0.16b, v9.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v10.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v11.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v28.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "AESE v0.16b, v29.16b  \n"
+        "AESMC v0.16b, v0.16b \n"
+        "PMULL v20.1q, v19.1d, v23.1d  \n"
+        "EOR v19.16b, v18.16b, v20.16b \n"
+        "AESE v0.16b, v30.16b \n"
+        "RBIT v19.16b, v19.16b \n"
+        "EOR v0.16b, v0.16b, v31.16b \n"
+        "EOR v19.16b, v19.16b, v0.16b \n"
+        "STR q19, [%[out]] \n"
+
+        :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr)
+        :[tag] "0" (sPt), [Key] "1" (keyPt),
+        [ctr] "2" (iCtr) , [h] "m" (aes->H)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14",
+        "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23",
+        "v24","v25","v26","v27","v28","v29","v30","v31"
+    );
+
+
+    if (authTagSz > AES_BLOCK_SIZE) {
+        XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
+    }
+    else {
+        /* authTagSz can be smaller than AES_BLOCK_SIZE */
+        XMEMCPY(authTag, scratch, authTagSz);
+    }
+
+    return 0;
+}
+#endif /* WOLFSSL_AES_256 */
+
+
+/* aarch64 with PMULL and PMULL2
+ * AES-GCM encrypt-and-tag entry point: checks the arguments, then hands the
+ * whole operation to the routine matching the key size in aes->rounds.
+ *
+ * aes:       Aes structure that has already had its key set
+ * out:       buffer receiving the cipher text
+ * in:        plain text to encrypt
+ * sz:        length in bytes of the plain text and of out
+ * iv:        initialization vector
+ * ivSz:      length in bytes of iv
+ * authTag:   buffer receiving the authentication tag
+ * authTagSz: tag length, WOLFSSL_MIN_AUTH_TAG_SZ up to AES_BLOCK_SIZE
+ * authIn:    additional authenticated data
+ * authInSz:  length in bytes of authIn
+ *
+ * returns BAD_FUNC_ARG for a bad pointer, tag size or round count, otherwise
+ * the result of the selected key-size specific routine.
+ *
+ * Notes:
+ * GHASH multiplication based from Algorithm 1 from Intel GCM white paper.
+ * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode"
+ *
+ * GHASH reduction Based from White Paper "Implementing GCM on ARMv8"
+ * by Conrado P.L. Gouvea and Julio Lopez reduction on 256bit value using
+ * Algorithm 5
+ */
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    /* a buffer is only allowed to be NULL when its length is zero */
+    const int ivMissing   = (ivSz > 0)     && (iv == NULL);
+    const int aadMissing  = (authInSz > 0) && (authIn == NULL);
+    const int dataMissing = (sz > 0)       && (in == NULL || out == NULL);
+
+    if (aes == NULL || authTag == NULL ||
+            ivMissing || aadMissing || dataMissing) {
+        WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+        return BAD_FUNC_ARG;
+    }
+
+    /* tag length has to lie in [WOLFSSL_MIN_AUTH_TAG_SZ, AES_BLOCK_SIZE] */
+    if (!(authTagSz >= WOLFSSL_MIN_AUTH_TAG_SZ &&
+          authTagSz <= AES_BLOCK_SIZE)) {
+        WOLFSSL_MSG("GcmEncrypt authTagSz error");
+        return BAD_FUNC_ARG;
+    }
+
+    /* the round count identifies the key size: 10/12/14 = 128/192/256 bit */
+#ifdef WOLFSSL_AES_128
+    if (aes->rounds == 10) {
+        return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz,
+                                authTag, authTagSz, authIn, authInSz);
+    }
+#endif
+#ifdef WOLFSSL_AES_192
+    if (aes->rounds == 12) {
+        return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz,
+                                authTag, authTagSz, authIn, authInSz);
+    }
+#endif
+#ifdef WOLFSSL_AES_256
+    if (aes->rounds == 14) {
+        return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz,
+                                authTag, authTagSz, authIn, authInSz);
+    }
+#endif
+
+    /* no compiled-in key size matches this round count */
+    WOLFSSL_MSG("AES-GCM invalid round number");
+    return BAD_FUNC_ARG;
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/*
+ * Check tag and decrypt data using AES with GCM mode.
+ *
+ * The tag is recomputed over authIn and the cipher text and checked with
+ * ConstantCompare before any plain text is written to out.
+ *
+ * aes: Aes structure having already been set with set key function
+ * out: decrypted data output buffer
+ * in:  cipher text buffer
+ * sz:  size of plain text and out buffer
+ * iv:  initialization vector
+ * ivSz:      size of iv buffer
+ * authTag:   buffer holding tag
+ * authTagSz: size of tag buffer, must be in the range 1..AES_BLOCK_SIZE
+ * authIn:    additional data buffer
+ * authInSz:  size of additional data buffer
+ *
+ * returns 0 on success, BAD_FUNC_ARG on an invalid argument or unsupported
+ * round count, AES_GCM_AUTH_E when the tag does not match and BAD_STATE_E if
+ * a data pointer is NULL when the trailing partial block is processed.
+ */
+int  wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   const byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks = sz / AES_BLOCK_SIZE;
+    word32 partial = sz % AES_BLOCK_SIZE;
+    const byte* c = in;
+    byte* p = out;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte* ctr = counter;
+    byte scratch[AES_BLOCK_SIZE];
+
+    /* sanity checks */
+    if (aes == NULL || (iv == NULL && ivSz > 0) ||
+                       (authTag == NULL) ||
+                       (authIn == NULL && authInSz > 0) ||
+                       (in  == NULL && sz > 0) ||
+                       (out == NULL && sz > 0)) {
+        WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+        return BAD_FUNC_ARG;
+    }
+
+    /* The recomputed tag below is only AES_BLOCK_SIZE bytes long, so a larger
+     * authTagSz would make the compare read past it. A zero length would
+     * compare nothing and accept any message as authentic. */
+    if (authTagSz == 0 || authTagSz > AES_BLOCK_SIZE) {
+        WOLFSSL_MSG("GcmDecrypt authTagSz error");
+        return BAD_FUNC_ARG;
+    }
+
+    /* Build the initial counter block: a GCM_NONCE_MID_SZ byte IV is used
+     * directly with the last byte set to 1, any other length is hashed. */
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+        GMULT(initialCounter, aes->H);
+    }
+    XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
+
+    /* Calculate the authTag again using the received auth data and the
+     * cipher text. */
+    {
+        byte Tprime[AES_BLOCK_SIZE];
+        byte EKY0[AES_BLOCK_SIZE];
+
+        GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime));
+        GMULT(Tprime, aes->H);
+        wc_AesEncrypt(aes, ctr, EKY0);
+        xorbuf(Tprime, EKY0, sizeof(Tprime));
+
+        /* authTagSz <= sizeof(Tprime) is guaranteed by the check above */
+        if (ConstantCompare(authTag, Tprime, authTagSz) != 0) {
+            return AES_GCM_AUTH_E;
+        }
+    }
+
+    /* do as many blocks as possible */
+    if (blocks > 0) {
+        /* pointer needed because it is incremented when read, causing
+         * an issue with call to encrypt/decrypt leftovers */
+        byte*  keyPt  = (byte*)aes->key;
+        switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+        case 10: /* AES 128 BLOCK */
+            __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+
+            "#Create vector with the value 1   \n"
+            "MOVI v14.16b, #1                  \n"
+            "USHR v14.2d, v14.2d, #56          \n"
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+            "EOR v13.16b, v13.16b, v13.16b     \n"
+            "EXT v14.16b, v14.16b, v13.16b, #8 \n"
+
+            "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n"
+            "LD1 {v12.2d}, [%[ctr]]            \n"
+            "LD1 {v13.2d}, [%[input]], #16     \n"
+
+            "1: \n"
+            "REV64 v12.16b, v12.16b \n" /* network order */
+            "EXT v12.16b, v12.16b, v12.16b, #8 \n"
+            "ADD v12.2d, v12.2d, v14.2d \n" /* add 1 to counter */
+            "EXT v12.16b, v12.16b, v12.16b, #8 \n"
+            "REV64 v12.16b, v12.16b \n" /* revert from network order */
+            "MOV v0.16b, v12.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b \n"
+            "EOR v0.16b, v0.16b, v11.16b \n"
+
+            "EOR v0.16b, v0.16b, v13.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+
+            "CBZ w11, 2f \n"
+            "LD1 {v13.2d}, [%[input]], #16 \n"
+            "B 1b \n"
+
+            "2: \n"
+            "#store current counter value at the end \n"
+            "ST1 {v12.16b}, [%[ctrOut]] \n"
+
+            :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+            :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (c)
+            : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
+            );
+            break;
+#endif
+#ifdef WOLFSSL_AES_192
+        case 12: /* AES 192 BLOCK */
+            __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+
+            "#Create vector with the value 1   \n"
+            "MOVI v16.16b, #1                  \n"
+            "USHR v16.2d, v16.2d, #56          \n"
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+            "EOR v14.16b, v14.16b, v14.16b     \n"
+            "EXT v16.16b, v16.16b, v14.16b, #8 \n"
+
+            "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n"
+            "LD1 {v13.2d}, [%[Key]], #16       \n"
+            "LD1 {v14.2d}, [%[ctr]]            \n"
+            "LD1 {v15.2d}, [%[input]], #16     \n"
+
+            "1: \n"
+            "REV64 v14.16b, v14.16b \n" /* network order */
+            "EXT v14.16b, v14.16b, v14.16b, #8 \n"
+            "ADD v14.2d, v14.2d, v16.2d \n" /* add 1 to counter */
+            "EXT v14.16b, v14.16b, v14.16b, #8 \n"
+            "REV64 v14.16b, v14.16b \n" /* revert from network order */
+            "MOV v0.16b, v14.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v12.16b \n"
+            "EOR v0.16b, v0.16b, v13.16b \n"
+
+            "EOR v0.16b, v0.16b, v15.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+
+            "CBZ w11, 2f \n"
+            "LD1 {v15.2d}, [%[input]], #16 \n"
+            "B 1b \n"
+
+            "2: \n"
+            "#store current counter value at the end \n"
+            "ST1 {v14.2d}, [%[ctrOut]]   \n"
+
+            :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+            :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (c)
+            : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16"
+            );
+            break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+        case 14: /* AES 256 BLOCK */
+            __asm__ __volatile__ (
+            "MOV w11, %w[blocks] \n"
+            "LD1 {v1.2d-v4.2d}, [%[Key]], #64  \n"
+
+            "#Create vector with the value 1   \n"
+            "MOVI v18.16b, #1                  \n"
+            "USHR v18.2d, v18.2d, #56          \n"
+            "LD1 {v5.2d-v8.2d}, [%[Key]], #64  \n"
+            "EOR v19.16b, v19.16b, v19.16b     \n"
+            "EXT v18.16b, v18.16b, v19.16b, #8 \n"
+
+            "LD1 {v9.2d-v12.2d},  [%[Key]], #64 \n"
+            "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
+            "LD1 {v17.2d}, [%[ctr]]             \n"
+            "LD1 {v16.2d}, [%[input]], #16      \n"
+
+            "1: \n"
+            "REV64 v17.16b, v17.16b \n" /* network order */
+            "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+            "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */
+            "EXT v17.16b, v17.16b, v17.16b, #8 \n"
+            "REV64 v17.16b, v17.16b \n" /* revert from network order */
+            "MOV v0.16b, v17.16b  \n"
+            "AESE v0.16b, v1.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v2.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v3.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v4.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "SUB w11, w11, #1     \n"
+            "AESE v0.16b, v5.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v6.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v7.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v8.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v9.16b  \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v10.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v11.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v12.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v13.16b \n"
+            "AESMC v0.16b, v0.16b \n"
+            "AESE v0.16b, v14.16b \n"
+            "EOR v0.16b, v0.16b, v15.16b \n"
+
+            "EOR v0.16b, v0.16b, v16.16b \n"
+            "ST1 {v0.2d}, [%[out]], #16  \n"
+
+            "CBZ w11, 2f \n"
+            "LD1 {v16.2d}, [%[input]], #16 \n"
+            "B 1b \n"
+
+            "2: \n"
+            "#store current counter value at the end \n"
+            "ST1 {v17.2d}, [%[ctrOut]] \n"
+
+            :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c)
+            :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks),
+             [input] "3" (c)
+            : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
+            "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+            "v16", "v17", "v18", "v19"
+            );
+            break;
+#endif /* WOLFSSL_AES_256 */
+        default:
+            WOLFSSL_MSG("Bad AES-GCM round value");
+            return BAD_FUNC_ARG;
+        }
+    }
+
+    /* handle the bytes left over after the last whole block; p and c were
+     * advanced past the whole blocks by the asm above */
+    if (partial != 0) {
+        IncrementGcmCounter(ctr);
+        wc_AesEncrypt(aes, ctr, scratch);
+
+        /* check if pointer is null after main AES-GCM blocks
+         * helps static analysis */
+        if (p == NULL || c == NULL) {
+            return BAD_STATE_E;
+        }
+        xorbuf(scratch, c, partial);
+        XMEMCPY(p, scratch, partial);
+    }
+    return 0;
+}
+
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+
+/***************************************
+ * not 64 bit so use 32 bit mode
+****************************************/
+#else
+
+/* AES CCM/GCM use encrypt direct but not decrypt */
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \
+    defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+    static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+            /*
+              Encrypt the single block at inBlock into outBlock using the
+              round keys read sequentially from aes->key. aes->rounds picks
+              the exit point: 10 and 12 leave early, any other value runs
+              the longest (14 round) path. No arguments are validated here
+              and the return value is always 0.
+
+              AESE exor's input with round key
+                   shift rows of exor'ed result
+                   sub bytes for shifted rows
+             */
+
+            word32* keyPt = aes->key; /* advanced by the asm as keys are read */
+            __asm__ __volatile__ (
+                "VLD1.32 {q0}, [%[CtrIn]] \n" /* q0 = block to encrypt */
+                "VLDM %[Key]!, {q1-q4}    \n" /* first four round keys */
+
+                "AESE.8 q0, q1\n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q1}, [%[Key]]!  \n" /* q1-q4 are reloaded with later keys */
+                "AESE.8 q0, q3\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESE.8 q0, q4\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "AESE.8 q0, q1\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "AESE.8 q0, q2\n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q3\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESE.8 q0, q4\n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESE.8 q0, q1\n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2\n" /* tenth AESE */
+
+                "MOV r12, %[R]    \n" /* r12 = aes->rounds */
+                "CMP r12, #10 \n"
+                "BEQ 1f    \n" /* 10 rounds: only the final key is left */
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESE.8 q0, q1\n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2\n" /* twelfth AESE */
+
+                "CMP r12, #12 \n"
+                "BEQ 1f    \n" /* 12 rounds: only the final key is left */
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESE.8 q0, q1\n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2\n" /* fourteenth AESE */
+
+                "#Final AddRoundKey then store result \n"
+                "1: \n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "VEOR.32 q0, q0, q1\n"
+                "VST1.32 {q0}, [%[CtrOut]]   \n"
+
+                :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+                 "=r" (inBlock)
+                :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+                 [CtrIn] "3" (inBlock) /* each input is tied to an output; of
+                                          these only Key is written above */
+                : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4"
+            );
+
+        return 0;
+    }
+#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */
+#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER)
+    #ifdef HAVE_AES_DECRYPT
+    static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+            /*
+              Decrypt the single block at inBlock into outBlock using the
+              round keys read sequentially from aes->key. The instruction
+              sequence mirrors wc_AesEncrypt with AESD/AESIMC in place of
+              AESE/AESMC. aes->rounds picks the exit point: 10 and 12 leave
+              early, any other value runs the longest (14 round) path. No
+              arguments are validated here and the return value is always 0.
+
+              NOTE(review): presumably aes->key holds a schedule prepared for
+              decryption by the key setup code - confirm against the caller.
+             */
+
+            word32* keyPt = aes->key; /* advanced by the asm as keys are read */
+            __asm__ __volatile__ (
+                "VLD1.32 {q0}, [%[CtrIn]] \n" /* q0 = block to decrypt */
+                "VLDM %[Key]!, {q1-q4}    \n" /* first four round keys */
+
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q1}, [%[Key]]!  \n" /* q1-q4 are reloaded with later keys */
+                "AESD.8 q0, q3\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESD.8 q0, q4\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "AESD.8 q0, q2\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q3\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESD.8 q0, q4\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n" /* tenth AESD */
+
+                "MOV r12, %[R] \n" /* r12 = aes->rounds */
+                "CMP r12, #10  \n"
+                "BEQ 1f \n" /* 10 rounds: only the final key is left */
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n" /* twelfth AESD */
+
+                "CMP r12, #12  \n"
+                "BEQ 1f \n" /* 12 rounds: only the final key is left */
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n" /* fourteenth AESD */
+
+                "#Final AddRoundKey then store result \n"
+                "1: \n"
+                "VLD1.32 {q1}, [%[Key]]! \n"
+                "VEOR.32 q0, q0, q1\n"
+                "VST1.32 {q0}, [%[CtrOut]]    \n"
+
+                :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds),
+                 "=r" (inBlock)
+                :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds),
+                 [CtrIn] "3" (inBlock) /* each input is tied to an output; of
+                                          these only Key is written above */
+                : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4"
+            );
+
+        return 0;
+}
+    #endif /* HAVE_AES_DECRYPT */
+#endif /* DIRECT or COUNTER */
+
+/* AES-CBC */
+#ifdef HAVE_AES_CBC
+    /* Encrypt data using AES in CBC mode (Aarch32).
+     *
+     * aes: Aes structure; aes->key holds the round keys and aes->reg the
+     *      chaining block, which is replaced by the last cipher text block
+     * out: cipher text output buffer
+     * in:  plain text input buffer
+     * sz:  number of bytes at in; only whole AES_BLOCK_SIZE blocks are
+     *      processed, any remainder is ignored
+     *
+     * returns 0 on success (also when sz holds no whole block) and
+     * BAD_FUNC_ARG for a NULL argument or an unsupported aes->rounds value.
+     *
+     * The asm statements advance the key and input pointers with
+     * post-increment addressing, so both are tied to output operands; an
+     * input-only operand must not be modified by the asm.
+     */
+    int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        word32 numBlocks = sz / AES_BLOCK_SIZE;
+
+        if (aes == NULL || out == NULL || (in == NULL && sz > 0)) {
+            return BAD_FUNC_ARG;
+        }
+
+        /* do as many block size ops as possible */
+        if (numBlocks > 0) {
+            word32* keyPt = aes->key;
+            word32* regPt = aes->reg;
+            /*
+            AESE exor's input with round key
+            shift rows of exor'ed result
+            sub bytes for shifted rows
+
+            note: grouping AESE & AESMC together as pairs reduces latency
+            */
+            switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+            case 10: /* AES 128 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q0}, [%[reg]]   \n"
+                "VLD1.32 {q12}, [%[input]]!\n"
+
+                "1:\n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "VEOR.32 q0, q0, q12 \n"
+                "AESE.8 q0, q1 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q3 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q4 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q5 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q6 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q7 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q8 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q9 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q10\n"
+                "VEOR.32 q0, q0, q11 \n"
+                "SUB r11, r11, #1    \n"
+                "VST1.32 {q0}, [%[out]]!   \n"
+
+                "CMP r11, #0   \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q12}, [%[input]]! \n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter value at the end \n"
+                "VST1.32 {q0}, [%[regOut]] \n" /* last cipher block -> aes->reg */
+
+                /* keyPt and in are written back by the loads above, so they
+                 * are outputs tied to their inputs */
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in)
+                :"0" (out), [reg] "1" (regPt), [Key] "2" (keyPt),
+                 [input] "3" (in), [blocks] "r" (numBlocks)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+                );
+                break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+            case 12: /* AES 192 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q0}, [%[reg]]   \n"
+                "VLD1.32 {q12}, [%[input]]!\n"
+                "VLD1.32 {q13}, [%[Key]]!  \n"
+                "VLD1.32 {q14}, [%[Key]]!  \n"
+
+                "1:\n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "VEOR.32 q0, q0, q12 \n"
+                "AESE.8 q0, q1 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q3 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q4 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q5 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q6 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q7 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q8 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q9 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q10 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q11 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q13\n"
+                "VEOR.32 q0, q0, q14 \n"
+                "SUB r11, r11, #1    \n"
+                "VST1.32 {q0}, [%[out]]!   \n"
+
+                "CMP r11, #0   \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q12}, [%[input]]! \n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter qalue at the end \n"
+                "VST1.32 {q0}, [%[regOut]] \n" /* last cipher block -> aes->reg */
+
+                /* keyPt and in are written back by the loads above, so they
+                 * are outputs tied to their inputs */
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in)
+                :"0" (out), [reg] "1" (regPt), [Key] "2" (keyPt),
+                 [input] "3" (in), [blocks] "r" (numBlocks)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
+                );
+                break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+            case 14: /* AES 256 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n"
+                "VLD1.32 {q1}, [%[Key]]!  \n"
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q0}, [%[reg]]   \n"
+                "VLD1.32 {q12}, [%[input]]!\n"
+                "VLD1.32 {q13}, [%[Key]]!  \n"
+                "VLD1.32 {q14}, [%[Key]]!  \n"
+
+                "1:\n"
+                "#CBC operations, xorbuf in with current aes->reg \n"
+                "VEOR.32 q0, q0, q12 \n"
+                "AESE.8 q0, q1 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q2 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q3 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q4 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q5 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q6 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q7 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q8 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q9 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q10 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q11 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q13 \n"
+                "AESMC.8 q0, q0\n"
+                "VLD1.32 {q15}, [%[Key]]!  \n" /* q15 serves the last two keys */
+                "AESE.8 q0, q14 \n"
+                "AESMC.8 q0, q0\n"
+                "AESE.8 q0, q15\n"
+                "VLD1.32 {q15}, [%[Key]]   \n"
+                "VEOR.32 q0, q0, q15 \n"
+                "SUB r11, r11, #1    \n"
+                "VST1.32 {q0}, [%[out]]!   \n"
+                "SUB %[Key], %[Key], #16   \n" /* rewind for the next block */
+
+                "CMP r11, #0   \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q12}, [%[input]]! \n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter qalue at the end \n"
+                "VST1.32 {q0}, [%[regOut]] \n" /* last cipher block -> aes->reg */
+
+                /* keyPt and in are written back by the loads above, so they
+                 * are outputs tied to their inputs */
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in)
+                :"0" (out), [reg] "1" (regPt), [Key] "2" (keyPt),
+                 [input] "3" (in), [blocks] "r" (numBlocks)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                break;
+#endif /* WOLFSSL_AES_256 */
+            default:
+                WOLFSSL_MSG("Bad AES-CBC round value");
+                return BAD_FUNC_ARG;
+            }
+        }
+
+        return 0;
+    }
+
+    #ifdef HAVE_AES_DECRYPT
+    int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        word32 numBlocks = sz / AES_BLOCK_SIZE;
+        /* CBC-decrypt whole blocks from in to out, chaining through aes->reg */
+        if (aes == NULL || out == NULL || (in == NULL && sz > 0)
+                || sz % AES_BLOCK_SIZE != 0) {
+            return BAD_FUNC_ARG;
+        }
+        /* 0 on success; BAD_FUNC_ARG for bad args or unsupported rounds */
+        /* do as many block size ops as possible */
+        if (numBlocks > 0) {
+            word32* keyPt = aes->key; /* local copy, advanced by the asm */
+            word32* regPt = aes->reg;
+            switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+            case 10: /* AES 128 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                "VLD1.32 {q1}, [%[Key]]!  \n" /* q1-q11 = round keys */
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q13}, [%[reg]]  \n" /* q13 = chaining block */
+                "VLD1.32 {q0}, [%[input]]!\n"
+
+                "1:\n"
+                "VMOV.32 q12, q0 \n" /* keep input block for chaining */
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q3\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q4\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q5\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q6\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q7\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q8\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q9\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q10\n"
+                "VEOR.32 q0, q0, q11\n" /* final round-key xor */
+                "VEOR.32 q0, q0, q13\n" /* xor with chaining block */
+                "SUB r11, r11, #1            \n"
+                "VST1.32 {q0}, [%[out]]!  \n"
+                "VMOV.32 q13, q12        \n"
+
+                "CMP r11, #0 \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q0}, [%[input]]!  \n"
+                "B 1b      \n"
+
+                "2: \n"
+                "#store current counter qalue at the end \n"
+                "VST1.32 {q13}, [%[regOut]] \n" /* update aes->reg */
+
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in) /* both are written by "!" writeback */
+                :"0" (out), [Key] "2" (keyPt), [input] "3" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (regPt)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13"
+                );
+                break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+            case 12: /* AES 192 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                "VLD1.32 {q1}, [%[Key]]!  \n" /* q1-q13 = round keys */
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q12}, [%[Key]]! \n"
+                "VLD1.32 {q13}, [%[Key]]! \n"
+                "VLD1.32 {q14}, [%[reg]]  \n" /* q14 = chaining block */
+                "VLD1.32 {q0}, [%[input]]!\n"
+
+                "1:    \n"
+                "VMOV.32 q15, q0 \n" /* keep input block for chaining */
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q3\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q4\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q5\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q6\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q7\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q8\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q9\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q10\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q11\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q12\n"
+                "VEOR.32 q0, q0, q13\n" /* final round-key xor */
+                "VEOR.32 q0, q0, q14\n" /* xor with chaining block */
+                "SUB r11, r11, #1        \n"
+                "VST1.32 {q0}, [%[out]]! \n"
+                "VMOV.32 q14, q15        \n"
+
+                "CMP r11, #0 \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q0}, [%[input]]!  \n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter value at the end \n"
+                "VST1.32 {q15}, [%[regOut]] \n" /* update aes->reg */
+
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in) /* both are written by "!" writeback */
+                :"0" (out), [Key] "2" (keyPt), [input] "3" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (regPt)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+            case 14: /* AES 256 BLOCK */
+                __asm__ __volatile__ (
+                "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                "VLD1.32 {q1}, [%[Key]]!  \n" /* q1-q12 = first 12 round keys */
+                "VLD1.32 {q2}, [%[Key]]!  \n"
+                "VLD1.32 {q3}, [%[Key]]!  \n"
+                "VLD1.32 {q4}, [%[Key]]!  \n"
+                "VLD1.32 {q5}, [%[Key]]!  \n"
+                "VLD1.32 {q6}, [%[Key]]!  \n"
+                "VLD1.32 {q7}, [%[Key]]!  \n"
+                "VLD1.32 {q8}, [%[Key]]!  \n"
+                "VLD1.32 {q9}, [%[Key]]!  \n"
+                "VLD1.32 {q10}, [%[Key]]! \n"
+                "VLD1.32 {q11}, [%[Key]]! \n"
+                "VLD1.32 {q12}, [%[Key]]! \n"
+                "VLD1.32 {q14}, [%[reg]]  \n" /* q14 = chaining block */
+                "VLD1.32 {q0}, [%[input]]!\n"
+
+                "1:\n"
+                "VMOV.32 q15, q0 \n" /* keep input block for chaining */
+                "AESD.8 q0, q1\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q2\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q3\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q4\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q5\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q6\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q7\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q8\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q9\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q10\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q11\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q13}, [%[Key]]!  \n" /* q13 is reloaded per block */
+                "AESD.8 q0, q12\n"
+                "AESIMC.8 q0, q0\n"
+                "AESD.8 q0, q13\n"
+                "AESIMC.8 q0, q0\n"
+                "VLD1.32 {q13}, [%[Key]]!  \n"
+                "AESD.8 q0, q13\n"
+                "VLD1.32 {q13}, [%[Key]]  \n"
+                "VEOR.32 q0, q0, q13\n" /* final round-key xor */
+                "SUB %[Key], %[Key], #32 \n" /* rewind the two "!" loads */
+                "VEOR.32 q0, q0, q14\n" /* xor with chaining block */
+                "SUB r11, r11, #1            \n"
+                "VST1.32 {q0}, [%[out]]!  \n"
+                "VMOV.32 q14, q15        \n"
+
+                "CMP r11, #0 \n"
+                "BEQ 2f \n"
+                "VLD1.32 {q0}, [%[input]]!  \n"
+                "B 1b \n"
+
+                "2:\n"
+                "#store current counter value at the end \n"
+                "VST1.32 {q15}, [%[regOut]] \n" /* update aes->reg */
+
+                :[out] "=r" (out), [regOut] "=r" (regPt), "=r" (keyPt),
+                 "=r" (in) /* both are written by "!" writeback */
+                :"0" (out), [Key] "2" (keyPt), [input] "3" (in),
+                 [blocks] "r" (numBlocks), [reg] "1" (regPt)
+                : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+                );
+                break;
+#endif /* WOLFSSL_AES_256 */
+            default:
+                WOLFSSL_MSG("Bad AES-CBC round value");
+                return BAD_FUNC_ARG;
+            }
+        }
+
+        return 0;
+    }
+    #endif
+
+#endif /* HAVE_AES_CBC */
+
+/* AES-CTR */
+#ifdef WOLFSSL_AES_COUNTER
+
+        /* Add one to the AES_BLOCK_SIZE-byte big-endian counter in place. */
+        static WC_INLINE void IncrementAesCounter(byte* inOutCtr)
+        {
+            int pos = AES_BLOCK_SIZE;
+            /* last byte is least significant; carry while a byte wraps to 0 */
+            while (--pos >= 0) {
+                inOutCtr[pos]++;
+                if (inOutCtr[pos] != 0)
+                    break;
+            }
+        }
+
+        int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+        {
+            byte* tmp;
+            word32 numBlocks;
+            /* XOR in with the AES-CTR keystream into out (sz bytes) */
+            if (aes == NULL || out == NULL || in == NULL) {
+                return BAD_FUNC_ARG;
+            }
+            /* unused keystream is the last aes->left bytes of aes->tmp */
+            tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
+
+            /* consume any unused bytes left in aes->tmp */
+            while (aes->left && sz) {
+               *(out++) = *(in++) ^ *(tmp++);
+               aes->left--;
+               sz--;
+            }
+            /* whole blocks use the asm; NOTE(review): its VADD.i32 counter add
+             * never carries past 32 bits, unlike IncrementAesCounter() */
+            numBlocks = sz/AES_BLOCK_SIZE;
+            if (numBlocks > 0) {
+                /* pointer needed because it is incremented when read, causing
+                 * an issue with call to encrypt/decrypt leftovers */
+                word32*  keyPt  = aes->key;
+                word32*  regPt  = aes->reg;
+                sz           -= numBlocks * AES_BLOCK_SIZE;
+                switch(aes->rounds) {
+#ifdef WOLFSSL_AES_128
+                case 10: /* AES 128 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                    "VLDM %[Key]!, {q1-q4} \n"
+
+                    "#Create vector with the value 1  \n"
+                    "VMOV.u32 q15, #1                 \n"
+                    "VSHR.u64 q15, q15, #32  \n"
+                    "VLDM %[Key]!, {q5-q8} \n"
+                    "VEOR.32 q14, q14, q14    \n"
+                    "VLDM %[Key]!, {q9-q11} \n"
+                    "VEXT.8 q14, q15, q14, #8\n" /* q14 = 1 in lowest lane */
+
+                    "VLD1.32 {q13}, [%[reg]]\n" /* q13 = counter block */
+
+                    /* double block */
+                    "1:      \n"
+                    "CMP r11, #1 \n"
+                    "BEQ 2f    \n"
+                    "CMP r11, #0 \n"
+                    "BEQ 3f    \n"
+
+                    "VMOV.32 q0, q13  \n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "SUB r11, r11, #2     \n"
+                    "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+                    "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q15, q15, q15, #8 \n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q15, q15\n" /* revert from network order */
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q1\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q2\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q3\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q4\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q5\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q10\n"
+                    "AESE.8 q15, q6\n"
+                    "AESMC.8 q15, q15\n"
+                    "VEOR.32 q0, q0, q11\n"
+
+                    "AESE.8 q15, q7\n"
+                    "AESMC.8 q15, q15\n"
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "AESE.8 q15, q8\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "VEOR.32 q0, q0, q12\n"
+                    "AESE.8 q15, q9\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "AESE.8 q15, q10\n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+                    "VEOR.32 q15, q15, q11\n"
+                    "VEOR.32 q15, q15, q12\n"
+                    "VST1.32 {q15}, [%[out]]!  \n"
+
+                    "B 1b \n"
+
+                    /* single block */
+                    "2:      \n"
+                    "VMOV.32 q0, q13  \n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "SUB r11, r11, #1     \n"
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q10\n"
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "VEOR.32 q0, q0, q11\n"
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "VEOR.32 q0, q0, q12\n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+
+                    "3: \n"
+                    "#store current counter qalue at the end \n"
+                    "VST1.32 {q13}, [%[regOut]]   \n"
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (numBlocks), [reg] "2" (regPt)
+                    : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                    "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15"
+                    );
+                    break;
+#endif /* WOLFSSL_AES_128 */
+#ifdef WOLFSSL_AES_192
+                case 12: /* AES 192 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                    "VLDM %[Key]!, {q1-q4} \n"
+
+                    "#Create vector with the value 1  \n"
+                    "VMOV.u32 q15, #1                 \n"
+                    "VSHR.u64 q15, q15, #32  \n"
+                    "VLDM %[Key]!, {q5-q8} \n"
+                    "VEOR.32 q14, q14, q14    \n"
+                    "VEXT.8 q14, q15, q14, #8\n" /* q14 = 1 in lowest lane */
+
+                    "VLDM %[Key]!, {q9-q10} \n"
+                    "VLD1.32 {q13}, [%[reg]]\n" /* q13 = counter block */
+
+                    /* double block */
+                    "1:   \n"
+                    "CMP r11, #1 \n"
+                    "BEQ 2f \n"
+                    "CMP r11, #0 \n"
+                    "BEQ 3f   \n"
+
+                    "VMOV.32 q0, q13\n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "SUB r11, r11, #2     \n"
+                    "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+                    "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q15, q15, q15, #8 \n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q15, q15\n" /* revert from network order */
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q1\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q2\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q3\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q4\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q5\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q10\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "AESE.8 q15, q6\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q11\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q7\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q15, q8\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "VLD1.32 {q12}, [%[Key]]! \n"
+                    "AESE.8 q15, q9\n"
+                    "AESMC.8 q15, q15\n"
+                    "AESE.8 q15, q10\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q15, q11\n"
+                    "AESMC.8 q15, q15\n"
+                    "VLD1.32 {q11}, [%[Key]] \n"
+                    "AESE.8 q0, q12\n"
+                    "AESE.8 q15, q12\n"
+
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "VEOR.32 q0, q0, q11\n"
+                    "VEOR.32 q15, q15, q11\n"
+                    "VEOR.32 q0, q0, q12\n"
+
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+                    "VEOR.32 q15, q15, q12\n"
+                    "VST1.32 {q15}, [%[out]]!  \n"
+                    "SUB %[Key], %[Key], #32 \n" /* rewind the two "!" loads */
+
+                    "B 1b \n"
+
+
+                    /* single block */
+                    "2:      \n"
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "VMOV.32 q0, q13  \n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "SUB r11, r11, #1     \n"
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q10\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q12}, [%[Key]]! \n"
+                    "AESE.8 q0, q11\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q11}, [%[Key]] \n"
+                    "AESE.8 q0, q12\n"
+                    "VLD1.32 {q12}, [%[input]]! \n"
+                    "VEOR.32 q0, q0, q11\n"
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "VEOR.32 q0, q0, q12\n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+
+                    "3: \n"
+                    "#store current counter qalue at the end \n"
+                    "VST1.32 {q13}, [%[regOut]]   \n"
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (numBlocks), [reg] "2" (regPt)
+                    : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                    "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15"
+                    ); /* q15 is written by this template, so it is listed */
+                    break;
+#endif /* WOLFSSL_AES_192 */
+#ifdef WOLFSSL_AES_256
+                case 14: /* AES 256 BLOCK */
+                    __asm__ __volatile__ (
+                    "MOV r11, %[blocks] \n" /* r11 = blocks left */
+                    "VLDM %[Key]!, {q1-q4} \n"
+
+                    "#Create vector with the value 1  \n"
+                    "VMOV.u32 q15, #1                 \n"
+                    "VSHR.u64 q15, q15, #32  \n"
+                    "VLDM %[Key]!, {q5-q8} \n"
+                    "VEOR.32 q14, q14, q14    \n"
+                    "VEXT.8 q14, q15, q14, #8\n" /* q14 = 1 in lowest lane */
+
+                    "VLDM %[Key]!, {q9-q10} \n"
+                    "VLD1.32 {q13}, [%[reg]]\n" /* q13 = counter block */
+
+                    /* double block */
+                    "1:      \n"
+                    "CMP r11, #1 \n"
+                    "BEQ 2f    \n"
+                    "CMP r11, #0 \n"
+                    "BEQ 3f    \n"
+
+                    "VMOV.32 q0, q13  \n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "SUB r11, r11, #2     \n"
+                    "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
+                    "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q15, q15, q15, #8 \n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q15, q15\n" /* revert from network order */
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q15, q1\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q2\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q3\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q4\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q5\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q10\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "AESE.8 q15, q6\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q0, q11\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q7\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q15, q8\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q15, q9\n"
+                    "AESMC.8 q15, q15\n"
+                    "VLD1.32 {q12}, [%[Key]]!  \n"
+                    "AESE.8 q15, q10\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "AESE.8 q15, q11\n"
+                    "AESMC.8 q15, q15\n"
+
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "AESE.8 q0, q12\n" /* rnd 12*/
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q12\n" /* rnd 12 */
+                    "AESMC.8 q15, q15\n"
+
+                    "VLD1.32 {q12}, [%[Key]]!  \n"
+                    "AESE.8 q0, q11\n" /* rnd 13 */
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q15, q11\n" /* rnd 13 */
+                    "AESMC.8 q15, q15\n"
+
+                    "VLD1.32 {q11}, [%[Key]] \n"
+                    "AESE.8 q0, q12\n" /* rnd 14 */
+                    "AESE.8 q15, q12\n" /* rnd 14 */
+
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "VEOR.32 q0, q0, q11\n" /* rnd 15 */
+                    "VEOR.32 q15, q15, q11\n" /* rnd 15 */
+                    "VEOR.32 q0, q0, q12\n"
+
+                    "VLD1.32 {q12}, [%[input]]!  \n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+                    "VEOR.32 q15, q15, q12\n"
+                    "VST1.32 {q15}, [%[out]]!  \n"
+                    "SUB %[Key], %[Key], #64 \n" /* rewind the four "!" loads */
+
+                    /* single block */
+                    "B 1b \n"
+
+                    "2:      \n"
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "VMOV.32 q0, q13  \n"
+                    "AESE.8 q0, q1\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13 \n" /* network order */
+                    "AESE.8 q0, q2\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q3\n"
+                    "AESMC.8 q0, q0\n"
+                    "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
+                    "AESE.8 q0, q4\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q5\n"
+                    "AESMC.8 q0, q0\n"
+                    "VEXT.8 q13, q13, q13, #8 \n"
+                    "AESE.8 q0, q6\n"
+                    "AESMC.8 q0, q0\n"
+                    "VREV64.8 q13, q13\n" /* revert from network order */
+                    "AESE.8 q0, q7\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q8\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q9\n"
+                    "AESMC.8 q0, q0\n"
+                    "AESE.8 q0, q10\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q12}, [%[Key]]! \n"
+                    "AESE.8 q0, q11\n"
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q11}, [%[Key]]! \n"
+                    "AESE.8 q0, q12\n" /* rnd 12 */
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q12}, [%[Key]]! \n"
+                    "AESE.8 q0, q11\n" /* rnd 13 */
+                    "AESMC.8 q0, q0\n"
+                    "VLD1.32 {q11}, [%[Key]] \n"
+                    "AESE.8 q0, q12\n" /* rnd 14 */
+                    "VLD1.32 {q12}, [%[input]]! \n"
+                    "VEOR.32 q0, q0, q11\n" /* rnd 15 */
+                    "#CTR operations, increment counter and xorbuf \n"
+                    "VEOR.32 q0, q0, q12\n"
+                    "VST1.32 {q0}, [%[out]]!  \n"
+
+                    "3: \n"
+                    "#store current counter qalue at the end \n"
+                    "VST1.32 {q13}, [%[regOut]]   \n"
+
+                    :[out] "=r" (out), "=r" (keyPt), [regOut] "=r" (regPt),
+                     "=r" (in)
+                    :"0" (out), [Key] "1" (keyPt), [input] "3" (in),
+                     [blocks] "r" (numBlocks), [reg] "2" (regPt)
+                    : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
+                    "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15"
+                    ); /* q15 is written by this template, so it is listed */
+                    break;
+#endif /* WOLFSSL_AES_256 */
+                default:
+                    WOLFSSL_MSG("Bad AES-CTR round qalue");
+                    return BAD_FUNC_ARG;
+                }
+
+                aes->left = 0;
+            }
+
+            /* handle non block size remaining */
+            if (sz) {
+                wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp);
+                IncrementAesCounter((byte*)aes->reg);
+                /* aes->tmp now holds one fresh keystream block */
+                aes->left = AES_BLOCK_SIZE;
+                tmp = (byte*)aes->tmp;
+
+                while (sz--) {
+                    *(out++) = *(in++) ^ *(tmp++);
+                    aes->left--;
+                }
+            }
+
+            return 0;
+        }
+
+#endif /* WOLFSSL_AES_COUNTER */
+
+#ifdef HAVE_AESGCM
+/* Multiply X by Y in place: the product is stored back to X, Y is only read.
+ * Uses Karatsuba algorithm. Reduction algorithm is based on "Implementing GCM
+ * on ARMv8". Shifting left to account for bit reflection is based on
+ * "Carry-Less Multiplication and Its Usage for Computing the GCM mode"
+ */
+static void GMULT(byte* X, byte* Y)
+{
+    __asm__ __volatile__ (
+        "VLD1.32 {q0}, [%[x]] \n" /* q0 = X */
+
+        /* In GCM format bits are big endian, switch location of bytes to
+         * allow for logical shifts and carries.
+         */
+        "VREV64.8 q0, q0 \n"
+        "VLD1.32 {q1}, [%[y]] \n" /* converted on set key */
+        "VSWP.8 d0, d1 \n"
+        /* Karatsuba: q5 = d0*d2, q6 = d1*d3, q7 = (d0^d1)*(d2^d3) */
+        "VMULL.p64  q5, d0, d2 \n"
+        "VMULL.p64  q6, d1, d3 \n"
+        "VEOR d15, d2, d3 \n"
+        "VEOR d14, d0, d1 \n"
+        "VMULL.p64  q7, d15, d14 \n"
+        "VEOR q7, q5 \n"
+        "VEOR q7, q6 \n" /* q7 = middle term */
+        "VEOR d11, d14 \n" /* fold middle term into the q6:q5 product */
+        "VEOR d12, d15\n"
+
+        /* shift to left by 1 to account for reflection */
+        "VMOV q7, q6 \n"
+        "VSHL.u64 q6, q6, #1 \n"
+        "VSHR.u64 q7, q7, #63 \n" /* top bit of each 64-bit lane of q6 */
+        "VEOR d13, d14 \n" /* carry d12 -> d13 */
+        "VMOV q8, q5 \n"
+        "VSHL.u64 q5, q5, #1 \n"
+        "VSHR.u64 q8, q8, #63 \n" /* top bit of each 64-bit lane of q5 */
+        "VEOR d12, d17 \n" /* carry d11 -> d12 */
+        "VEOR d11, d16 \n" /* carry d10 -> d11 */
+
+        /* create constant 0xc200000000000000 */
+        "VMOV.i32 d16, 0xc2000000 \n"
+        "VSHL.u64 d16, d16, #32 \n"
+
+        /* reduce product of multiplication */
+        "VMULL.p64 q9, d10, d16 \n"
+        "VEOR d11, d18 \n"
+        "VEOR d12, d19 \n"
+        "VMULL.p64 q9, d11, d16 \n"
+        "VEOR q6, q9 \n"
+        "VEOR q10, q5, q6 \n" /* q10 = reduced result */
+
+        /* convert to GCM format */
+        "VREV64.8 q10, q10 \n"
+        "VSWP.8 d20, d21 \n"
+
+        "VST1.32 {q10}, [%[xOut]] \n" /* overwrite X with the result */
+
+        : [xOut] "=r" (X), [yOut] "=r" (Y) /* yOut is unused by the template */
+        : [x] "0" (X), [y] "1" (Y)
+        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6" ,"q7", "q8",
+        "q9", "q10", "q11" ,"q12", "q13", "q14", "q15"
+    );
+}
+
+
+void GHASH(Aes* aes, const byte* a, word32 aSz,
+                                const byte* c, word32 cSz, byte* s, word32 sSz)
+{
+    /* Computes the GHASH value over a then c using the hash key aes->H.
+     *   a, aSz: additional authentication data (skipped if NULL or empty)
+     *   c, cSz: ciphertext (skipped if NULL or empty)
+     *   s, sSz: output; the first sSz bytes of the result block are copied
+     *           (sSz is not checked against AES_BLOCK_SIZE here)
+     * aSz and cSz are hashed in as bit lengths even when a buffer is skipped.
+     */
+
+    byte  digest[AES_BLOCK_SIZE];   /* running hash value */
+    byte  block[AES_BLOCK_SIZE];    /* zero-padded tail / length block */
+    byte* hashKey = aes->H;
+    const byte* src;
+    word32 len, full, tail;
+    int pass;
+
+    XMEMSET(digest, 0, AES_BLOCK_SIZE);
+
+    /* pass 0 absorbs A (additional authentication data), pass 1 absorbs C
+     * (ciphertext); an empty or NULL buffer is skipped */
+    for (pass = 0; pass < 2; pass++) {
+        src = (pass == 0) ? a : c;
+        len = (pass == 0) ? aSz : cSz;
+        if (len == 0 || src == NULL) {
+            continue;
+        }
+
+        tail = len % AES_BLOCK_SIZE;
+        for (full = len / AES_BLOCK_SIZE; full > 0; full--) {
+            xorbuf(digest, src, AES_BLOCK_SIZE);
+            GMULT(digest, hashKey);
+            src += AES_BLOCK_SIZE;
+        }
+        if (tail != 0) {
+            /* pad the final partial block with zeros before hashing it */
+            XMEMSET(block, 0, AES_BLOCK_SIZE);
+            XMEMCPY(block, src, tail);
+            xorbuf(digest, block, AES_BLOCK_SIZE);
+            GMULT(digest, hashKey);
+        }
+    }
+
+    /* Hash in the lengths of A and C in bits */
+    FlattenSzInBits(&block[0], aSz);
+    FlattenSzInBits(&block[8], cSz);
+    xorbuf(digest, block, AES_BLOCK_SIZE);
+    GMULT(digest, hashKey);
+
+    /* Copy the result into s. */
+    XMEMCPY(s, digest, sSz);
+}
+
+
+/* Aarch32
+ * Encrypt and tag data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: encrypted data output buffer
+ * in:  plain text input buffer
+ * sz:  size of plain text and out buffer
+ * iv:  initialization vector
+ * ivSz:      size of iv buffer
+ * authTag:   buffer to hold tag
+ * authTagSz: size of tag buffer, WOLFSSL_MIN_AUTH_TAG_SZ to AES_BLOCK_SIZE
+ * authIn:    additional data buffer
+ * authInSz:  size of additional data buffer
+ *
+ * returns 0 on success and BAD_FUNC_ARG when a required pointer is NULL or
+ * authTagSz is out of range.
+ */
+int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks = sz / AES_BLOCK_SIZE;
+    word32 partial = sz % AES_BLOCK_SIZE;
+    const byte* p = in;
+    byte* c = out;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte scratch[AES_BLOCK_SIZE];
+
+    /* sanity checks */
+    if (aes == NULL || (iv == NULL && ivSz > 0) ||
+                       (authTag == NULL) ||
+                       (authIn == NULL && authInSz > 0) ||
+                       (in == NULL && sz > 0) ||
+                       (out == NULL && sz > 0)) {
+        WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+        return BAD_FUNC_ARG;
+    }
+
+    if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) {
+        WOLFSSL_MSG("GcmEncrypt authTagSz error");
+        return BAD_FUNC_ARG;
+    }
+
+    /* Build the initial counter block: a GCM_NONCE_MID_SZ byte IV is used
+     * directly with the last byte set to 1, any other length is hashed. */
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+    }
+    XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+    /* The key stream block is built in scratch and then copied, so the
+     * input is never read after the matching output bytes are written. */
+    while (blocks--) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, p, AES_BLOCK_SIZE);
+        XMEMCPY(c, scratch, AES_BLOCK_SIZE);
+        p += AES_BLOCK_SIZE;
+        c += AES_BLOCK_SIZE;
+    }
+
+    if (partial != 0) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, p, partial);
+        XMEMCPY(c, scratch, partial);
+    }
+
+    /* tag = GHASH(authIn, cipher text) xor E(K, initial counter);
+     * authTagSz <= AES_BLOCK_SIZE was enforced above, so no clamp is needed */
+    GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+    wc_AesEncrypt(aes, initialCounter, scratch);
+    xorbuf(authTag, scratch, authTagSz);
+
+    /* do not leave key stream or counter material on the stack */
+    ForceZero(scratch, AES_BLOCK_SIZE);
+    ForceZero(counter, AES_BLOCK_SIZE);
+    ForceZero(initialCounter, AES_BLOCK_SIZE);
+
+    return 0;
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/*
+ * Check tag and decrypt data using AES with GCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: decrypted data output buffer
+ * in:  cipher text buffer
+ * sz:  size of plain text and out buffer
+ * iv:  initialization vector
+ * ivSz:      size of iv buffer
+ * authTag:   buffer holding tag
+ * authTagSz: size of tag buffer, 1 to AES_BLOCK_SIZE
+ * authIn:    additional data buffer
+ * authInSz:  size of additional data buffer
+ *
+ * returns 0 on success, BAD_FUNC_ARG when a required pointer is NULL or
+ * authTagSz is out of range, and AES_GCM_AUTH_E when the tag does not match.
+ * The tag is checked before any decryption, so out is left untouched on an
+ * authentication failure.
+ */
+int  wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+                   const byte* iv, word32 ivSz,
+                   const byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    word32 blocks = sz / AES_BLOCK_SIZE;
+    word32 partial = sz % AES_BLOCK_SIZE;
+    const byte* c = in;
+    byte* p = out;
+    byte counter[AES_BLOCK_SIZE];
+    byte initialCounter[AES_BLOCK_SIZE];
+    byte scratch[AES_BLOCK_SIZE];
+    byte Tprime[AES_BLOCK_SIZE];
+    int tagMismatch;
+
+    /* sanity checks */
+    if (aes == NULL || (iv == NULL && ivSz > 0) ||
+                       (authTag == NULL) ||
+                       (authIn == NULL && authInSz > 0) ||
+                       (in  == NULL && sz > 0) ||
+                       (out == NULL && sz > 0)) {
+        WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
+        return BAD_FUNC_ARG;
+    }
+
+    /* A zero length tag would compare equal for any input, and a tag longer
+     * than one block would make the comparison read past Tprime. */
+    if (authTagSz == 0 || authTagSz > AES_BLOCK_SIZE) {
+        WOLFSSL_MSG("GcmDecrypt authTagSz error");
+        return BAD_FUNC_ARG;
+    }
+
+    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(initialCounter, iv, ivSz);
+        initialCounter[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+    }
+    XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE);
+
+    /* Calculate the authTag again using the received auth data and the
+     * cipher text. */
+    GHASH(aes, authIn, authInSz, in, sz, Tprime, sizeof(Tprime));
+    wc_AesEncrypt(aes, initialCounter, scratch);
+    xorbuf(Tprime, scratch, sizeof(Tprime));
+    tagMismatch = ConstantCompare(authTag, Tprime, authTagSz);
+
+    /* the expected tag must not stay on the stack, above all on failure */
+    ForceZero(Tprime, sizeof(Tprime));
+    ForceZero(scratch, AES_BLOCK_SIZE);
+
+    if (tagMismatch != 0) {
+        ForceZero(counter, AES_BLOCK_SIZE);
+        ForceZero(initialCounter, AES_BLOCK_SIZE);
+        return AES_GCM_AUTH_E;
+    }
+
+    while (blocks--) {
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, c, AES_BLOCK_SIZE);
+        XMEMCPY(p, scratch, AES_BLOCK_SIZE);
+        p += AES_BLOCK_SIZE;
+        c += AES_BLOCK_SIZE;
+    }
+    if (partial != 0) {
+        /* check if pointer is null after main AES-GCM blocks
+         * helps static analysis */
+        if (p == NULL || c == NULL) {
+            ForceZero(scratch, AES_BLOCK_SIZE);
+            ForceZero(counter, AES_BLOCK_SIZE);
+            ForceZero(initialCounter, AES_BLOCK_SIZE);
+            return BAD_STATE_E;
+        }
+        IncrementGcmCounter(counter);
+        wc_AesEncrypt(aes, counter, scratch);
+        xorbuf(scratch, c, partial);
+        XMEMCPY(p, scratch, partial);
+    }
+
+    ForceZero(scratch, AES_BLOCK_SIZE);
+    ForceZero(counter, AES_BLOCK_SIZE);
+    ForceZero(initialCounter, AES_BLOCK_SIZE);
+
+    return 0;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESGCM */
+
+#endif /* aarch64 */
+
+
+#ifdef HAVE_AESCCM
+/* Software implementation of AES-CCM, taken from wolfcrypt/src/aes.c.
+ * It gains some speedup because wc_AesEncrypt is hardware accelerated. */
+
+/* Fold inSz bytes of in into the running CBC-MAC block held in out.
+ * Each chunk is XORed into out and out is then encrypted in place. A final
+ * chunk shorter than a block only touches the leading bytes of out, which
+ * has the same effect as padding that chunk with zeros.
+ */
+static void roll_x(Aes* aes, const byte* in, word32 inSz, byte* out)
+{
+    word32 whole = inSz / AES_BLOCK_SIZE;
+    word32 tail  = inSz % AES_BLOCK_SIZE;
+
+    /* full blocks */
+    for (; whole != 0; whole--) {
+        xorbuf(out, in, AES_BLOCK_SIZE);
+        wc_AesEncrypt(aes, out, out);
+        in += AES_BLOCK_SIZE;
+    }
+
+    /* left over bytes, if any */
+    if (tail != 0) {
+        xorbuf(out, in, tail);
+        wc_AesEncrypt(aes, out, out);
+    }
+}
+
+
+/* Mix the additional authentication data into the CBC-MAC block in out.
+ * The length of the data is XORed in first, big endian: two bytes when
+ * inSz <= 0xFEFF, otherwise the marker 0xFF 0xFE followed by four bytes.
+ * The rest of that first block is filled with the start of in, the block is
+ * encrypted, and whatever data remains is handed to roll_x.
+ */
+static void roll_auth(Aes* aes, const byte* in, word32 inSz, byte* out)
+{
+    word32 hdrSz;   /* bytes used by the encoded length */
+    word32 firstSz; /* data bytes sharing the first block with the length */
+
+    if (inSz <= 0xFEFF) {
+        out[0] ^= (byte)(inSz >> 8);
+        out[1] ^= (byte)inSz;
+        hdrSz = 2;
+    }
+    else if (inSz <= 0xFFFFFFFF) {
+        out[0] ^= 0xFF;
+        out[1] ^= 0xFE;
+        out[2] ^= (byte)(inSz >> 24);
+        out[3] ^= (byte)(inSz >> 16);
+        out[4] ^= (byte)(inSz >>  8);
+        out[5] ^= (byte)inSz;
+        hdrSz = 6;
+    }
+    else {
+        /* Note, the protocol handles auth data up to 2^64, but we are
+         * using 32-bit sizes right now, so the bigger data isn't handled */
+        return;
+    }
+
+    /* take as much data as fits behind the length, at most all of it; a
+     * short input leaves the remaining bytes as they are (zero padding) */
+    firstSz = AES_BLOCK_SIZE - hdrSz;
+    if (firstSz > inSz)
+        firstSz = inSz;
+    xorbuf(out + hdrSz, in, firstSz);
+    inSz -= firstSz;
+
+    wc_AesEncrypt(aes, out, out);
+
+    if (inSz != 0)
+        roll_x(aes, in + firstSz, inSz, out);
+}
+
+
+/* Increment the big endian counter stored in the last lenSz bytes of the
+ * block B. The carry stops at the first byte that does not wrap to zero. */
+static WC_INLINE void AesCcmCtrInc(byte* B, word32 lenSz)
+{
+    word32 pos = AES_BLOCK_SIZE;
+
+    while (lenSz-- != 0) {
+        pos--;
+        B[pos]++;
+        if (B[pos] != 0)
+            break;
+    }
+}
+
+
+/* Encrypt and tag data using AES with CCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: encrypted data output buffer, may be NULL when inSz is 0
+ * in:  plain text input buffer, may be NULL when inSz is 0
+ * inSz:      size of plain text and out buffer
+ * nonce:     nonce buffer
+ * nonceSz:   size of nonce, 7 to 13 bytes
+ * authTag:   buffer to hold tag
+ * authTagSz: size of tag, one of 4, 6, 8, 10, 12, 14 or 16 bytes
+ * authIn:    additional data buffer, may be NULL when authInSz is 0
+ * authInSz:  size of additional data buffer
+ *
+ * return 0 on success, BAD_FUNC_ARG on an invalid argument
+ */
+int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+                   const byte* nonce, word32 nonceSz,
+                   byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    byte A[AES_BLOCK_SIZE];
+    byte B[AES_BLOCK_SIZE];
+    byte lenSz;
+    word32 i;
+    byte mask     = 0xFF;
+    word32 wordSz = (word32)sizeof(word32);
+
+    /* sanity check on arguments; data pointers are only required when the
+     * matching length is not zero */
+    if (aes == NULL || nonce == NULL || authTag == NULL
+            || nonceSz < 7 || nonceSz > 13
+            || (inSz > 0 && (in == NULL || out == NULL))
+            || (authInSz > 0 && authIn == NULL))
+        return BAD_FUNC_ARG;
+
+    /* Only even tag sizes from 4 to 16 can be encoded in the flags byte,
+     * and anything above AES_BLOCK_SIZE would read past the end of A. */
+    if (authTagSz < 4 || authTagSz > AES_BLOCK_SIZE || (authTagSz & 1) != 0)
+        return BAD_FUNC_ARG;
+
+    /* first MAC block: flags, nonce, then the message length big endian */
+    XMEMCPY(B+1, nonce, nonceSz);
+    lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+    B[0] = (authInSz > 0 ? 64 : 0)
+         + (8 * (((byte)authTagSz - 2) / 2))
+         + (lenSz - 1);
+    for (i = 0; i < lenSz; i++) {
+        /* length bytes beyond the width of word32 are zero; clearing the
+         * mask also keeps the shift count below the width of inSz */
+        if (mask && i >= wordSz)
+            mask = 0x00;
+        B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+    }
+
+    wc_AesEncrypt(aes, B, A);
+
+    /* the MAC is taken over the plain text before out is written, so the
+     * input has been fully read by the time encryption starts */
+    if (authInSz > 0)
+        roll_auth(aes, authIn, authInSz, A);
+    if (inSz > 0)
+        roll_x(aes, in, inSz, A);
+    XMEMCPY(authTag, A, authTagSz);
+
+    /* counter block 0 masks the tag */
+    B[0] = lenSz - 1;
+    for (i = 0; i < lenSz; i++)
+        B[AES_BLOCK_SIZE - 1 - i] = 0;
+    wc_AesEncrypt(aes, B, A);
+    xorbuf(authTag, A, authTagSz);
+
+    /* counter blocks 1.. encrypt the payload */
+    B[15] = 1;
+    while (inSz >= AES_BLOCK_SIZE) {
+        wc_AesEncrypt(aes, B, A);
+        xorbuf(A, in, AES_BLOCK_SIZE);
+        XMEMCPY(out, A, AES_BLOCK_SIZE);
+
+        AesCcmCtrInc(B, lenSz);
+        inSz -= AES_BLOCK_SIZE;
+        in += AES_BLOCK_SIZE;
+        out += AES_BLOCK_SIZE;
+    }
+    if (inSz > 0) {
+        wc_AesEncrypt(aes, B, A);
+        xorbuf(A, in, inSz);
+        XMEMCPY(out, A, inSz);
+    }
+
+    ForceZero(A, AES_BLOCK_SIZE);
+    ForceZero(B, AES_BLOCK_SIZE);
+
+    return 0;
+}
+
+#ifdef HAVE_AES_DECRYPT
+/* Decrypt data and check its tag using AES with CCM mode.
+ * aes: Aes structure having already been set with set key function
+ * out: decrypted data output buffer, may be NULL when inSz is 0
+ * in:  cipher text buffer, may be NULL when inSz is 0
+ * inSz:      size of cipher text and out buffer
+ * nonce:     nonce buffer
+ * nonceSz:   size of nonce, 7 to 13 bytes
+ * authTag:   buffer holding the tag to verify
+ * authTagSz: size of tag, one of 4, 6, 8, 10, 12, 14 or 16 bytes
+ * authIn:    additional data buffer, may be NULL when authInSz is 0
+ * authInSz:  size of additional data buffer
+ *
+ * return 0 on success, BAD_FUNC_ARG on an invalid argument and
+ * AES_CCM_AUTH_E when the tag does not match, in which case out is zeroed.
+ */
+int  wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
+                   const byte* nonce, word32 nonceSz,
+                   const byte* authTag, word32 authTagSz,
+                   const byte* authIn, word32 authInSz)
+{
+    byte A[AES_BLOCK_SIZE];
+    byte B[AES_BLOCK_SIZE];
+    byte* o;
+    byte lenSz;
+    word32 i, oSz;
+    int result = 0;
+    byte mask     = 0xFF;
+    word32 wordSz = (word32)sizeof(word32);
+
+    /* sanity check on arguments; data pointers are only required when the
+     * matching length is not zero */
+    if (aes == NULL || nonce == NULL || authTag == NULL
+            || nonceSz < 7 || nonceSz > 13
+            || (inSz > 0 && (in == NULL || out == NULL))
+            || (authInSz > 0 && authIn == NULL))
+        return BAD_FUNC_ARG;
+
+    /* A tag longer than AES_BLOCK_SIZE would make the xorbuf into A below
+     * write past the end of A, and a zero length tag would compare equal
+     * for any input. Only even sizes from 4 to 16 fit the flags byte. */
+    if (authTagSz < 4 || authTagSz > AES_BLOCK_SIZE || (authTagSz & 1) != 0)
+        return BAD_FUNC_ARG;
+
+    o = out;
+    oSz = inSz;
+    XMEMCPY(B+1, nonce, nonceSz);
+    lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
+
+    /* counter blocks 1.. decrypt the payload */
+    B[0] = lenSz - 1;
+    for (i = 0; i < lenSz; i++)
+        B[AES_BLOCK_SIZE - 1 - i] = 0;
+    B[15] = 1;
+
+    while (oSz >= AES_BLOCK_SIZE) {
+        wc_AesEncrypt(aes, B, A);
+        xorbuf(A, in, AES_BLOCK_SIZE);
+        XMEMCPY(o, A, AES_BLOCK_SIZE);
+
+        AesCcmCtrInc(B, lenSz);
+        oSz -= AES_BLOCK_SIZE;
+        in += AES_BLOCK_SIZE;
+        o += AES_BLOCK_SIZE;
+    }
+    /* only a trailing partial block needs another key stream block */
+    if (oSz > 0) {
+        wc_AesEncrypt(aes, B, A);
+        xorbuf(A, in, oSz);
+        XMEMCPY(o, A, oSz);
+    }
+
+    /* first MAC block: flags, nonce, then the message length big endian;
+     * this loop rewrites every byte of the counter field */
+    B[0] = (authInSz > 0 ? 64 : 0)
+         + (8 * (((byte)authTagSz - 2) / 2))
+         + (lenSz - 1);
+    for (i = 0; i < lenSz; i++) {
+        /* length bytes beyond the width of word32 are zero; clearing the
+         * mask also keeps the shift count below the width of inSz */
+        if (mask && i >= wordSz)
+            mask = 0x00;
+        B[AES_BLOCK_SIZE - 1 - i] = (inSz >> ((8 * i) & mask)) & mask;
+    }
+
+    wc_AesEncrypt(aes, B, A);
+
+    /* the MAC is taken over the recovered plain text */
+    if (authInSz > 0)
+        roll_auth(aes, authIn, authInSz, A);
+    if (inSz > 0)
+        roll_x(aes, out, inSz, A);
+
+    /* counter block 0 masks the tag */
+    B[0] = lenSz - 1;
+    for (i = 0; i < lenSz; i++)
+        B[AES_BLOCK_SIZE - 1 - i] = 0;
+    wc_AesEncrypt(aes, B, B);
+    xorbuf(A, B, authTagSz);
+
+    if (ConstantCompare(A, authTag, authTagSz) != 0) {
+        /* If the authTag check fails, don't keep the decrypted data.
+         * Unfortunately, you need the decrypted data to calculate the
+         * check value. */
+        if (inSz > 0)
+            XMEMSET(out, 0, inSz);
+        result = AES_CCM_AUTH_E;
+    }
+
+    ForceZero(A, AES_BLOCK_SIZE);
+    ForceZero(B, AES_BLOCK_SIZE);
+
+    return result;
+}
+#endif /* HAVE_AES_DECRYPT */
+#endif /* HAVE_AESCCM */
+
+
+
+#ifdef HAVE_AESGCM /* common GCM functions 32 and 64 bit */
+/* Set the AES key for use with GCM and derive the hash key aes->H.
+ * aes: Aes structure to set up
+ * key: key buffer
+ * len: size of key in bytes, must be 16, 24 or 32
+ *
+ * H is the encryption of an all zero block. It is then rewritten in place:
+ * on Aarch64 the bits of each byte are reversed, on Aarch32 the order of
+ * the 16 bytes is reversed, which is the form the Aarch32 GMULT loads.
+ *
+ * returns BAD_FUNC_ARG for an unsupported key length, otherwise the result
+ * of wc_AesSetKey.
+ */
+int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
+{
+    int  status;
+    byte zeroBlock[AES_BLOCK_SIZE];
+    word32* hWords;
+
+    if (len != 16 && len != 24 && len != 32)
+        return BAD_FUNC_ARG;
+
+    /* the zero block serves both as the IV and as the input for H */
+    XMEMSET(zeroBlock, 0, AES_BLOCK_SIZE);
+    status = wc_AesSetKey(aes, key, len, zeroBlock, AES_ENCRYPTION);
+    if (status != 0)
+        return status;
+
+    wc_AesEncrypt(aes, zeroBlock, aes->H);
+    hWords = (word32*)aes->H;
+
+    #if defined(__aarch64__)
+    __asm__ volatile (
+        "LD1 {v0.16b}, [%[h]] \n"
+        "RBIT v0.16b, v0.16b \n"
+        "ST1 {v0.16b}, [%[out]] \n"
+        : [out] "=r" (hWords)
+        : [h] "0" (hWords)
+        : "cc", "memory", "v0"
+    );
+    #else
+    __asm__ volatile (
+        "VLD1.32 {q0}, [%[h]] \n"
+        "VREV64.8 q0, q0 \n"
+        "VSWP.8 d0, d1 \n"
+        "VST1.32 {q0}, [%[out]] \n"
+        : [out] "=r" (hWords)
+        : [h] "0" (hWords)
+        : "cc", "memory", "q0"
+    );
+    #endif
+
+    return status;
+}
+
+#endif /* HAVE_AESGCM */
+
+/* AES-DIRECT */
+#if defined(WOLFSSL_AES_DIRECT)
+        /* Encrypt the single block in into out. Nothing is written, and
+         * only a message is logged, when any argument is NULL. */
+        void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            if (aes != NULL && out != NULL && in != NULL) {
+                wc_AesEncrypt(aes, in, out);
+            }
+            else {
+                WOLFSSL_MSG("Invalid input to wc_AesEncryptDirect");
+            }
+        }
+    #ifdef HAVE_AES_DECRYPT
+        /* Decrypt the single block in into out. Nothing is written, and
+         * only a message is logged, when any argument is NULL. */
+        void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            if (aes != NULL && out != NULL && in != NULL) {
+                wc_AesDecrypt(aes, in, out);
+            }
+            else {
+                WOLFSSL_MSG("Invalid input to wc_AesDecryptDirect");
+            }
+        }
+    #endif /* HAVE_AES_DECRYPT */
+#endif /* WOLFSSL_AES_DIRECT */
+#endif /* !NO_AES && WOLFSSL_ARMASM */