| author | auth12 <[email protected]> | 2020-07-22 08:40:38 -0700 |
|---|---|---|
| committer | auth12 <[email protected]> | 2020-07-22 08:40:38 -0700 |
| commit | 4ff89e85e74884e8f04edb5c31a94b4323e895e9 | |
| tree | 65f98ebf9af0d0947e44bf397b1fac0f107d7a2f /client/wolfssl/wolfcrypt/src/poly1305_asm.S | |
| parent | Client injection. | |
| download | loader-4ff89e85e74884e8f04edb5c31a94b4323e895e9.tar.xz, loader-4ff89e85e74884e8f04edb5c31a94b4323e895e9.zip | |
Removed wolfssl
Diffstat (limited to 'client/wolfssl/wolfcrypt/src/poly1305_asm.S')
| -rw-r--r-- | client/wolfssl/wolfcrypt/src/poly1305_asm.S | 1105 |
1 file changed, 0 insertions, 1105 deletions
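
The deleted file is wolfSSL's x86-64 AVX/AVX2 Poly1305 implementation. For orientation while reading the diff below: its own comments describe the core loop as `# h += m` followed by `# h = (r * h) mod 2^130 - 5`, with the key's pad added at the end. A minimal pure-Python sketch of that arithmetic (mirroring RFC 8439; the function name is illustrative and not part of wolfSSL's API):

```python
P = (1 << 130) - 5  # the Poly1305 prime

def poly1305_mac(msg: bytes, key: bytes) -> bytes:
    """Compute a Poly1305 tag over msg with a 32-byte one-time key."""
    r = int.from_bytes(key[:16], "little")
    # The clamp; its two 64-bit halves are exactly the movabsq masks
    # (0xffffffc0fffffff, 0xffffffc0ffffffc) in poly1305_setkey_avx below.
    r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
    pad = int.from_bytes(key[16:32], "little")
    h = 0
    for i in range(0, len(msg), 16):
        block = msg[i:i + 16]
        # Append the high bit above the block, as the assembly's
        # L_poly1305_avx2_blocks_hibit constant does for full blocks.
        m = int.from_bytes(block, "little") + (1 << (8 * len(block)))
        h = ((h + m) * r) % P  # "h += m", then "h = (r * h) mod 2^130 - 5"
    h = (h + pad) % (1 << 128)  # "h += pad", truncated to the 16-byte tag
    return h.to_bytes(16, "little")
```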
```diff
diff --git a/client/wolfssl/wolfcrypt/src/poly1305_asm.S b/client/wolfssl/wolfcrypt/src/poly1305_asm.S
deleted file mode 100644
index 9571107..0000000
--- a/client/wolfssl/wolfcrypt/src/poly1305_asm.S
+++ /dev/null
@@ -1,1105 +0,0 @@
-/* poly1305_asm
- *
- * Copyright (C) 2006-2020 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifdef HAVE_INTEL_AVX1
-#ifndef __APPLE__
-.text
-.globl poly1305_setkey_avx
-.type poly1305_setkey_avx,@function
-.align 4
-poly1305_setkey_avx:
-#else
-.section __TEXT,__text
-.globl _poly1305_setkey_avx
-.p2align 2
-_poly1305_setkey_avx:
-#endif /* __APPLE__ */
-        movabsq $0xffffffc0fffffff, %r10
-        movabsq $0xffffffc0ffffffc, %r11
-        movq (%rsi), %rdx
-        movq 8(%rsi), %rax
-        movq 16(%rsi), %rcx
-        movq 24(%rsi), %r8
-        andq %r10, %rdx
-        andq %r11, %rax
-        movq %rdx, %r10
-        movq %rax, %r11
-        xorq %r9, %r9
-        movq %rdx, (%rdi)
-        movq %rax, 8(%rdi)
-        movq %r9, 24(%rdi)
-        movq %r9, 32(%rdi)
-        movq %r9, 40(%rdi)
-        movq %rcx, 48(%rdi)
-        movq %r8, 56(%rdi)
-        movq %r9, 352(%rdi)
-        movq %r9, 408(%rdi)
-        movq %rdx, 360(%rdi)
-        movq %rax, 416(%rdi)
-        addq %rdx, %r10
-        addq %rax, %r11
-        movq %r10, 368(%rdi)
-        movq %r11, 424(%rdi)
-        addq %rdx, %r10
-        addq %rax, %r11
-        movq %r10, 376(%rdi)
-        movq %r11, 432(%rdi)
-        addq %rdx, %r10
-        addq %rax, %r11
-        movq %r10, 384(%rdi)
-        movq %r11, 440(%rdi)
-        addq %rdx, %r10
-        addq %rax, %r11
-        movq %r10, 392(%rdi)
-        movq %r11, 448(%rdi)
-        addq %rdx, %r10
-        addq %rax, %r11
-        movq %r10, 400(%rdi)
-        movq %r11, 456(%rdi)
-        movq %r9, 608(%rdi)
-        movb $0x01, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size poly1305_setkey_avx,.-poly1305_setkey_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl poly1305_block_avx
-.type poly1305_block_avx,@function
-.align 4
-poly1305_block_avx:
-#else
-.section __TEXT,__text
-.globl _poly1305_block_avx
-.p2align 2
-_poly1305_block_avx:
-#endif /* __APPLE__ */
-        pushq %r15
-        pushq %rbx
-        pushq %r12
-        pushq %r13
-        pushq %r14
-        movq (%rdi), %r15
-        movq 8(%rdi), %rbx
-        movq 24(%rdi), %r8
-        movq 32(%rdi), %r9
-        movq 40(%rdi), %r10
-        xorq %r14, %r14
-        movb 616(%rdi), %r14b
-        # h += m
-        movq (%rsi), %r11
-        movq 8(%rsi), %r12
-        addq %r11, %r8
-        adcq %r12, %r9
-        movq %rbx, %rax
-        adcq %r14, %r10
-        # r[1] * h[0] => rdx, rax ==> t2, t1
-        mulq %r8
-        movq %rax, %r12
-        movq %rdx, %r13
-        # r[0] * h[1] => rdx, rax ++> t2, t1
-        movq %r15, %rax
-        mulq %r9
-        addq %rax, %r12
-        movq %r15, %rax
-        adcq %rdx, %r13
-        # r[0] * h[0] => rdx, rax ==> t4, t0
-        mulq %r8
-        movq %rax, %r11
-        movq %rdx, %r8
-        # r[1] * h[1] => rdx, rax =+> t3, t2
-        movq %rbx, %rax
-        mulq %r9
-        # r[0] * h[2] +> t2
-        addq 352(%rdi,%r10,8), %r13
-        movq %rdx, %r14
-        addq %r8, %r12
-        adcq %rax, %r13
-        # r[1] * h[2] +> t3
-        adcq 408(%rdi,%r10,8), %r14
-        # r * h in r14, r13, r12, r11
-        # h = (r * h) mod 2^130 - 5
-        movq %r13, %r10
-        andq $-4, %r13
-        andq $3, %r10
-        addq %r13, %r11
-        movq %r13, %r8
-        adcq %r14, %r12
-        adcq $0x00, %r10
-        shrdq $2, %r14, %r8
-        shrq $2, %r14
-        addq %r11, %r8
-        adcq %r14, %r12
-        movq %r12, %r9
-        adcq $0x00, %r10
-        # h in r10, r9, r8
-        # Store h to ctx
-        movq %r8, 24(%rdi)
-        movq %r9, 32(%rdi)
-        movq %r10, 40(%rdi)
-        popq %r14
-        popq %r13
-        popq %r12
-        popq %rbx
-        popq %r15
-        repz retq
-#ifndef __APPLE__
-.size poly1305_block_avx,.-poly1305_block_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl poly1305_blocks_avx
-.type poly1305_blocks_avx,@function
-.align 4
-poly1305_blocks_avx:
-#else
-.section __TEXT,__text
-.globl _poly1305_blocks_avx
-.p2align 2
-_poly1305_blocks_avx:
-#endif /* __APPLE__ */
-        pushq %r15
-        pushq %rbx
-        pushq %r12
-        pushq %r13
-        pushq %r14
-        movq %rdx, %rcx
-        movq (%rdi), %r15
-        movq 8(%rdi), %rbx
-        movq 24(%rdi), %r8
-        movq 32(%rdi), %r9
-        movq 40(%rdi), %r10
-L_poly1305_avx_blocks_start:
-        # h += m
-        movq (%rsi), %r11
-        movq 8(%rsi), %r12
-        addq %r11, %r8
-        adcq %r12, %r9
-        movq %rbx, %rax
-        adcq $0x00, %r10
-        # r[1] * h[0] => rdx, rax ==> t2, t1
-        mulq %r8
-        movq %rax, %r12
-        movq %rdx, %r13
-        # r[0] * h[1] => rdx, rax ++> t2, t1
-        movq %r15, %rax
-        mulq %r9
-        addq %rax, %r12
-        movq %r15, %rax
-        adcq %rdx, %r13
-        # r[0] * h[0] => rdx, rax ==> t4, t0
-        mulq %r8
-        movq %rax, %r11
-        movq %rdx, %r8
-        # r[1] * h[1] => rdx, rax =+> t3, t2
-        movq %rbx, %rax
-        mulq %r9
-        # r[0] * h[2] +> t2
-        addq 360(%rdi,%r10,8), %r13
-        movq %rdx, %r14
-        addq %r8, %r12
-        adcq %rax, %r13
-        # r[1] * h[2] +> t3
-        adcq 416(%rdi,%r10,8), %r14
-        # r * h in r14, r13, r12, r11
-        # h = (r * h) mod 2^130 - 5
-        movq %r13, %r10
-        andq $-4, %r13
-        andq $3, %r10
-        addq %r13, %r11
-        movq %r13, %r8
-        adcq %r14, %r12
-        adcq $0x00, %r10
-        shrdq $2, %r14, %r8
-        shrq $2, %r14
-        addq %r11, %r8
-        adcq %r14, %r12
-        movq %r12, %r9
-        adcq $0x00, %r10
-        # h in r10, r9, r8
-        # Next block from message
-        addq $16, %rsi
-        subq $16, %rcx
-        jg L_poly1305_avx_blocks_start
-        # Store h to ctx
-        movq %r8, 24(%rdi)
-        movq %r9, 32(%rdi)
-        movq %r10, 40(%rdi)
-        popq %r14
-        popq %r13
-        popq %r12
-        popq %rbx
-        popq %r15
-        repz retq
-#ifndef __APPLE__
-.size poly1305_blocks_avx,.-poly1305_blocks_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl poly1305_final_avx
-.type poly1305_final_avx,@function
-.align 4
-poly1305_final_avx:
-#else
-.section __TEXT,__text
-.globl _poly1305_final_avx
-.p2align 2
-_poly1305_final_avx:
-#endif /* __APPLE__ */
-        pushq %rbx
-        pushq %r12
-        movq %rsi, %rbx
-        movq 608(%rdi), %rax
-        testq %rax, %rax
-        je L_poly1305_avx_final_no_more
-        movb $0x01, 480(%rdi,%rax,1)
-        jmp L_poly1305_avx_final_cmp_rem
-L_poly1305_avx_final_zero_rem:
-        movb $0x00, 480(%rdi,%rax,1)
-L_poly1305_avx_final_cmp_rem:
-        incb %al
-        cmpq $16, %rax
-        jl L_poly1305_avx_final_zero_rem
-        movb $0x00, 616(%rdi)
-        leaq 480(%rdi), %rsi
-#ifndef __APPLE__
-        callq poly1305_block_avx@plt
-#else
-        callq _poly1305_block_avx
-#endif /* __APPLE__ */
-L_poly1305_avx_final_no_more:
-        movq 24(%rdi), %rax
-        movq 32(%rdi), %rdx
-        movq 40(%rdi), %rcx
-        movq 48(%rdi), %r11
-        movq 56(%rdi), %r12
-        # h %= p
-        # h = (h + pad)
-        # mod 2^130 - 5
-        movq %rcx, %r8
-        andq $3, %rcx
-        shrq $2, %r8
-        # Multiply by 5
-        leaq 0(%r8,%r8,4), %r8
-        addq %r8, %rax
-        adcq $0x00, %rdx
-        adcq $0x00, %rcx
-        # Fixup when between (1 << 130) - 1 and (1 << 130) - 5
-        movq %rax, %r8
-        movq %rdx, %r9
-        movq %rcx, %r10
-        addq $5, %r8
-        adcq $0x00, %r9
-        adcq $0x00, %r10
-        cmpq $4, %r10
-        cmoveq %r8, %rax
-        cmoveq %r9, %rdx
-        # h += pad
-        addq %r11, %rax
-        adcq %r12, %rdx
-        movq %rax, (%rbx)
-        movq %rdx, 8(%rbx)
-        # Zero out r
-        movq $0x00, (%rdi)
-        movq $0x00, 8(%rdi)
-        # Zero out h
-        movq $0x00, 24(%rdi)
-        movq $0x00, 32(%rdi)
-        movq $0x00, 40(%rdi)
-        # Zero out pad
-        movq $0x00, 48(%rdi)
-        movq $0x00, 56(%rdi)
-        popq %r12
-        popq %rbx
-        repz retq
-#ifndef __APPLE__
-.size poly1305_final_avx,.-poly1305_final_avx
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX1 */
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
-.text
-.globl poly1305_calc_powers_avx2
-.type poly1305_calc_powers_avx2,@function
-.align 4
-poly1305_calc_powers_avx2:
-#else
-.section __TEXT,__text
-.globl _poly1305_calc_powers_avx2
-.p2align 2
-_poly1305_calc_powers_avx2:
-#endif /* __APPLE__ */
-        pushq %r12
-        pushq %r13
-        pushq %r14
-        pushq %r15
-        pushq %rbx
-        pushq %rbp
-        movq (%rdi), %rcx
-        movq 8(%rdi), %r8
-        xorq %r9, %r9
-        # Convert to 26 bits in 32
-        movq %rcx, %rax
-        movq %rcx, %rdx
-        movq %rcx, %rsi
-        movq %r8, %rbx
-        movq %r8, %rbp
-        shrq $26, %rdx
-        shrdq $52, %r8, %rsi
-        shrq $14, %rbx
-        shrdq $40, %r9, %rbp
-        andq $0x3ffffff, %rax
-        andq $0x3ffffff, %rdx
-        andq $0x3ffffff, %rsi
-        andq $0x3ffffff, %rbx
-        andq $0x3ffffff, %rbp
-        movl %eax, 224(%rdi)
-        movl %edx, 228(%rdi)
-        movl %esi, 232(%rdi)
-        movl %ebx, 236(%rdi)
-        movl %ebp, 240(%rdi)
-        movl $0x00, 244(%rdi)
-        # Square 128-bit
-        movq %r8, %rax
-        mulq %rcx
-        xorq %r13, %r13
-        movq %rax, %r11
-        movq %rdx, %r12
-        addq %rax, %r11
-        adcq %rdx, %r12
-        adcq $0x00, %r13
-        movq %rcx, %rax
-        mulq %rax
-        movq %rax, %r10
-        movq %rdx, %r15
-        movq %r8, %rax
-        mulq %rax
-        addq %r15, %r11
-        adcq %rax, %r12
-        adcq %rdx, %r13
-        # Reduce 256-bit to 130-bit
-        movq %r12, %rax
-        movq %r13, %rdx
-        andq $-4, %rax
-        andq $3, %r12
-        addq %rax, %r10
-        adcq %rdx, %r11
-        adcq $0x00, %r12
-        shrdq $2, %rdx, %rax
-        shrq $2, %rdx
-        addq %rax, %r10
-        adcq %rdx, %r11
-        adcq $0x00, %r12
-        movq %r12, %rax
-        shrq $2, %rax
-        leaq 0(%rax,%rax,4), %rax
-        andq $3, %r12
-        addq %rax, %r10
-        adcq $0x00, %r11
-        adcq $0x00, %r12
-        # Convert to 26 bits in 32
-        movq %r10, %rax
-        movq %r10, %rdx
-        movq %r10, %rsi
-        movq %r11, %rbx
-        movq %r11, %rbp
-        shrq $26, %rdx
-        shrdq $52, %r11, %rsi
-        shrq $14, %rbx
-        shrdq $40, %r12, %rbp
-        andq $0x3ffffff, %rax
-        andq $0x3ffffff, %rdx
-        andq $0x3ffffff, %rsi
-        andq $0x3ffffff, %rbx
-        andq $0x3ffffff, %rbp
-        movl %eax, 256(%rdi)
-        movl %edx, 260(%rdi)
-        movl %esi, 264(%rdi)
-        movl %ebx, 268(%rdi)
-        movl %ebp, 272(%rdi)
-        movl $0x00, 276(%rdi)
-        # Multiply 128-bit by 130-bit
-        # r1[0] * r2[0]
-        movq %rcx, %rax
-        mulq %r10
-        movq %rax, %r13
-        movq %rdx, %r14
-        # r1[0] * r2[1]
-        movq %rcx, %rax
-        mulq %r11
-        movq $0x00, %r15
-        addq %rax, %r14
-        adcq %rdx, %r15
-        # r1[1] * r2[0]
-        movq %r8, %rax
-        mulq %r10
-        movq $0x00, %rsi
-        addq %rax, %r14
-        adcq %rdx, %r15
-        adcq $0x00, %rsi
-        # r1[0] * r2[2]
-        movq %rcx, %rax
-        mulq %r12
-        addq %rax, %r15
-        adcq %rdx, %rsi
-        # r1[1] * r2[1]
-        movq %r8, %rax
-        mulq %r11
-        movq $0x00, %rbx
-        addq %rax, %r15
-        adcq %rdx, %rsi
-        adcq $0x00, %rbx
-        # r1[1] * r2[2]
-        movq %r8, %rax
-        mulq %r12
-        addq %rax, %rsi
-        adcq %rdx, %rbx
-        # Reduce 260-bit to 130-bit
-        movq %r15, %rax
-        movq %rsi, %rdx
-        movq %rbx, %rbx
-        andq $-4, %rax
-        andq $3, %r15
-        addq %rax, %r13
-        adcq %rdx, %r14
-        adcq %rbx, %r15
-        shrdq $2, %rdx, %rax
-        shrdq $2, %rbx, %rdx
-        shrq $2, %rbx
-        addq %rax, %r13
-        adcq %rdx, %r14
-        adcq %rbx, %r15
-        movq %r15, %rax
-        andq $3, %r15
-        shrq $2, %rax
-        leaq 0(%rax,%rax,4), %rax
-        addq %rax, %r13
-        adcq $0x00, %r14
-        adcq $0x00, %r15
-        # Convert to 26 bits in 32
-        movq %r13, %rax
-        movq %r13, %rdx
-        movq %r13, %rsi
-        movq %r14, %rbx
-        movq %r14, %rbp
-        shrq $26, %rdx
-        shrdq $52, %r14, %rsi
-        shrq $14, %rbx
-        shrdq $40, %r15, %rbp
-        andq $0x3ffffff, %rax
-        andq $0x3ffffff, %rdx
-        andq $0x3ffffff, %rsi
-        andq $0x3ffffff, %rbx
-        andq $0x3ffffff, %rbp
-        movl %eax, 288(%rdi)
-        movl %edx, 292(%rdi)
-        movl %esi, 296(%rdi)
-        movl %ebx, 300(%rdi)
-        movl %ebp, 304(%rdi)
-        movl $0x00, 308(%rdi)
-        # Square 130-bit
-        movq %r11, %rax
-        mulq %r10
-        xorq %r13, %r13
-        movq %rax, %r8
-        movq %rdx, %r9
-        addq %rax, %r8
-        adcq %rdx, %r9
-        adcq $0x00, %r13
-        movq %r10, %rax
-        mulq %rax
-        movq %rax, %rcx
-        movq %rdx, %r15
-        movq %r11, %rax
-        mulq %rax
-        addq %r15, %r8
-        adcq %rax, %r9
-        adcq %rdx, %r13
-        movq %r12, %rax
-        mulq %rax
-        movq %rax, %r14
-        movq %r12, %rax
-        mulq %r10
-        addq %rax, %r9
-        adcq %rdx, %r13
-        adcq $0x00, %r14
-        addq %rax, %r9
-        adcq %rdx, %r13
-        adcq $0x00, %r14
-        movq %r12, %rax
-        mulq %r11
-        addq %rax, %r13
-        adcq %rdx, %r14
-        addq %rax, %r13
-        adcq %rdx, %r14
-        # Reduce 260-bit to 130-bit
-        movq %r9, %rax
-        movq %r13, %rdx
-        movq %r14, %r15
-        andq $-4, %rax
-        andq $3, %r9
-        addq %rax, %rcx
-        adcq %rdx, %r8
-        adcq %r15, %r9
-        shrdq $2, %rdx, %rax
-        shrdq $2, %r15, %rdx
-        shrq $2, %r15
-        addq %rax, %rcx
-        adcq %rdx, %r8
-        adcq %r15, %r9
-        movq %r9, %rax
-        andq $3, %r9
-        shrq $2, %rax
-        leaq 0(%rax,%rax,4), %rax
-        addq %rax, %rcx
-        adcq $0x00, %r8
-        adcq $0x00, %r9
-        # Convert to 26 bits in 32
-        movq %rcx, %rax
-        movq %rcx, %rdx
-        movq %rcx, %rsi
-        movq %r8, %rbx
-        movq %r8, %rbp
-        shrq $26, %rdx
-        shrdq $52, %r8, %rsi
-        shrq $14, %rbx
-        shrdq $40, %r9, %rbp
-        andq $0x3ffffff, %rax
-        andq $0x3ffffff, %rdx
-        andq $0x3ffffff, %rsi
-        andq $0x3ffffff, %rbx
-        andq $0x3ffffff, %rbp
-        movl %eax, 320(%rdi)
-        movl %edx, 324(%rdi)
-        movl %esi, 328(%rdi)
-        movl %ebx, 332(%rdi)
-        movl %ebp, 336(%rdi)
-        movl $0x00, 340(%rdi)
-        popq %rbp
-        popq %rbx
-        popq %r15
-        popq %r14
-        popq %r13
-        popq %r12
-        repz retq
-#ifndef __APPLE__
-.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl poly1305_setkey_avx2
-.type poly1305_setkey_avx2,@function
-.align 4
-poly1305_setkey_avx2:
-#else
-.section __TEXT,__text
-.globl _poly1305_setkey_avx2
-.p2align 2
-_poly1305_setkey_avx2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        callq poly1305_setkey_avx@plt
-#else
-        callq _poly1305_setkey_avx
-#endif /* __APPLE__ */
-        vpxor %ymm0, %ymm0, %ymm0
-        vmovdqu %ymm0, 64(%rdi)
-        vmovdqu %ymm0, 96(%rdi)
-        vmovdqu %ymm0, 128(%rdi)
-        vmovdqu %ymm0, 160(%rdi)
-        vmovdqu %ymm0, 192(%rdi)
-        movq $0x00, 608(%rdi)
-        movw $0x00, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-#else
-.section __DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align 32
-#else
-.p2align 5
-#endif /* __APPLE__ */
-L_poly1305_avx2_blocks_mask:
-.quad 0x3ffffff, 0x3ffffff
-.quad 0x3ffffff, 0x3ffffff
-#ifndef __APPLE__
-.data
-#else
-.section __DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align 32
-#else
-.p2align 5
-#endif /* __APPLE__ */
-L_poly1305_avx2_blocks_hibit:
-.quad 0x1000000, 0x1000000
-.quad 0x1000000, 0x1000000
-#ifndef __APPLE__
-.text
-.globl poly1305_blocks_avx2
-.type poly1305_blocks_avx2,@function
-.align 4
-poly1305_blocks_avx2:
-#else
-.section __TEXT,__text
-.globl _poly1305_blocks_avx2
-.p2align 2
-_poly1305_blocks_avx2:
-#endif /* __APPLE__ */
-        pushq %r12
-        pushq %rbx
-        subq $0x140, %rsp
-        movq %rsp, %rcx
-        andq $-32, %rcx
-        addq $32, %rcx
-        vpxor %ymm15, %ymm15, %ymm15
-        movq %rcx, %rbx
-        leaq 64(%rdi), %rax
-        addq $0xa0, %rbx
-        cmpw $0x00, 616(%rdi)
-        jne L_poly1305_avx2_blocks_begin_h
-        # Load the message data
-        vmovdqu (%rsi), %ymm0
-        vmovdqu 32(%rsi), %ymm1
-        vperm2i128 $32, %ymm1, %ymm0, %ymm2
-        vperm2i128 $49, %ymm1, %ymm0, %ymm0
-        vpunpckldq %ymm0, %ymm2, %ymm1
-        vpunpckhdq %ymm0, %ymm2, %ymm3
-        vpunpckldq %ymm15, %ymm1, %ymm0
-        vpunpckhdq %ymm15, %ymm1, %ymm1
-        vpunpckldq %ymm15, %ymm3, %ymm2
-        vpunpckhdq %ymm15, %ymm3, %ymm3
-        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
-        vpsllq $6, %ymm1, %ymm1
-        vpsllq $12, %ymm2, %ymm2
-        vpsllq $18, %ymm3, %ymm3
-        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
-        # Reduce, in place, the message data
-        vpsrlq $26, %ymm0, %ymm10
-        vpsrlq $26, %ymm3, %ymm11
-        vpand %ymm14, %ymm0, %ymm0
-        vpand %ymm14, %ymm3, %ymm3
-        vpaddq %ymm1, %ymm10, %ymm1
-        vpaddq %ymm4, %ymm11, %ymm4
-        vpsrlq $26, %ymm1, %ymm10
-        vpsrlq $26, %ymm4, %ymm11
-        vpand %ymm14, %ymm1, %ymm1
-        vpand %ymm14, %ymm4, %ymm4
-        vpaddq %ymm2, %ymm10, %ymm2
-        vpslld $2, %ymm11, %ymm12
-        vpaddd %ymm12, %ymm11, %ymm12
-        vpsrlq $26, %ymm2, %ymm10
-        vpaddq %ymm0, %ymm12, %ymm0
-        vpsrlq $26, %ymm0, %ymm11
-        vpand %ymm14, %ymm2, %ymm2
-        vpand %ymm14, %ymm0, %ymm0
-        vpaddq %ymm3, %ymm10, %ymm3
-        vpaddq %ymm1, %ymm11, %ymm1
-        vpsrlq $26, %ymm3, %ymm10
-        vpand %ymm14, %ymm3, %ymm3
-        vpaddq %ymm4, %ymm10, %ymm4
-        addq $0x40, %rsi
-        subq $0x40, %rdx
-        jz L_poly1305_avx2_blocks_store
-        jmp L_poly1305_avx2_blocks_load_r4
-L_poly1305_avx2_blocks_begin_h:
-        # Load the H values.
-        vmovdqu (%rax), %ymm0
-        vmovdqu 32(%rax), %ymm1
-        vmovdqu 64(%rax), %ymm2
-        vmovdqu 96(%rax), %ymm3
-        vmovdqu 128(%rax), %ymm4
-        # Check if there is a power of r to load - otherwise use r^4.
-        cmpb $0x00, 616(%rdi)
-        je L_poly1305_avx2_blocks_load_r4
-        # Load the 4 powers of r - r^4, r^3, r^2, r^1.
-        vmovdqu 224(%rdi), %ymm8
-        vmovdqu 256(%rdi), %ymm7
-        vmovdqu 288(%rdi), %ymm6
-        vmovdqu 320(%rdi), %ymm5
-        vpermq $0xd8, %ymm5, %ymm5
-        vpermq $0xd8, %ymm6, %ymm6
-        vpermq $0xd8, %ymm7, %ymm7
-        vpermq $0xd8, %ymm8, %ymm8
-        vpunpcklqdq %ymm6, %ymm5, %ymm10
-        vpunpckhqdq %ymm6, %ymm5, %ymm11
-        vpunpcklqdq %ymm8, %ymm7, %ymm12
-        vpunpckhqdq %ymm8, %ymm7, %ymm13
-        vperm2i128 $32, %ymm12, %ymm10, %ymm5
-        vperm2i128 $49, %ymm12, %ymm10, %ymm7
-        vperm2i128 $32, %ymm13, %ymm11, %ymm9
-        vpsrlq $32, %ymm5, %ymm6
-        vpsrlq $32, %ymm7, %ymm8
-        jmp L_poly1305_avx2_blocks_mul_5
-L_poly1305_avx2_blocks_load_r4:
-        # Load r^4 into all four positions.
-        vmovdqu 320(%rdi), %ymm13
-        vpermq $0x00, %ymm13, %ymm5
-        vpsrlq $32, %ymm13, %ymm14
-        vpermq $0x55, %ymm13, %ymm7
-        vpermq $0xaa, %ymm13, %ymm9
-        vpermq $0x00, %ymm14, %ymm6
-        vpermq $0x55, %ymm14, %ymm8
-L_poly1305_avx2_blocks_mul_5:
-        # Multiply top 4 26-bit values of all four H by 5
-        vpslld $2, %ymm6, %ymm10
-        vpslld $2, %ymm7, %ymm11
-        vpslld $2, %ymm8, %ymm12
-        vpslld $2, %ymm9, %ymm13
-        vpaddq %ymm10, %ymm6, %ymm10
-        vpaddq %ymm11, %ymm7, %ymm11
-        vpaddq %ymm12, %ymm8, %ymm12
-        vpaddq %ymm13, %ymm9, %ymm13
-        # Store powers of r and multiple of 5 for use in multiply.
-        vmovdqa %ymm10, (%rbx)
-        vmovdqa %ymm11, 32(%rbx)
-        vmovdqa %ymm12, 64(%rbx)
-        vmovdqa %ymm13, 96(%rbx)
-        vmovdqa %ymm5, (%rcx)
-        vmovdqa %ymm6, 32(%rcx)
-        vmovdqa %ymm7, 64(%rcx)
-        vmovdqa %ymm8, 96(%rcx)
-        vmovdqa %ymm9, 128(%rcx)
-        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
-        # If not finished then loop over data
-        cmpb $0x01, 616(%rdi)
-        jne L_poly1305_avx2_blocks_start
-        # Do last multiply, reduce, add the four H together and move to
-        # 32-bit registers
-        vpmuludq (%rbx), %ymm4, %ymm5
-        vpmuludq 32(%rbx), %ymm3, %ymm10
-        vpmuludq 32(%rbx), %ymm4, %ymm6
-        vpmuludq 64(%rbx), %ymm3, %ymm11
-        vpmuludq 64(%rbx), %ymm4, %ymm7
-        vpaddq %ymm5, %ymm10, %ymm5
-        vpmuludq 64(%rbx), %ymm2, %ymm12
-        vpmuludq 96(%rbx), %ymm4, %ymm8
-        vpaddq %ymm6, %ymm11, %ymm6
-        vpmuludq 96(%rbx), %ymm1, %ymm13
-        vpmuludq 96(%rbx), %ymm2, %ymm10
-        vpaddq %ymm5, %ymm12, %ymm5
-        vpmuludq 96(%rbx), %ymm3, %ymm11
-        vpmuludq (%rcx), %ymm3, %ymm12
-        vpaddq %ymm5, %ymm13, %ymm5
-        vpmuludq (%rcx), %ymm4, %ymm9
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq (%rcx), %ymm0, %ymm13
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq (%rcx), %ymm1, %ymm10
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq (%rcx), %ymm2, %ymm11
-        vpmuludq 32(%rcx), %ymm2, %ymm12
-        vpaddq %ymm5, %ymm13, %ymm5
-        vpmuludq 32(%rcx), %ymm3, %ymm13
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq 32(%rcx), %ymm0, %ymm10
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq 32(%rcx), %ymm1, %ymm11
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq 64(%rcx), %ymm1, %ymm12
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpmuludq 64(%rcx), %ymm2, %ymm13
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq 64(%rcx), %ymm0, %ymm10
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq 96(%rcx), %ymm0, %ymm11
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq 96(%rcx), %ymm1, %ymm12
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpaddq %ymm7, %ymm10, %ymm7
-        vpmuludq 128(%rcx), %ymm0, %ymm13
-        vpaddq %ymm8, %ymm11, %ymm8
-        vpaddq %ymm9, %ymm12, %ymm9
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpsrlq $26, %ymm5, %ymm10
-        vpsrlq $26, %ymm8, %ymm11
-        vpand %ymm14, %ymm5, %ymm5
-        vpand %ymm14, %ymm8, %ymm8
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpaddq %ymm9, %ymm11, %ymm9
-        vpsrlq $26, %ymm6, %ymm10
-        vpsrlq $26, %ymm9, %ymm11
-        vpand %ymm14, %ymm6, %ymm1
-        vpand %ymm14, %ymm9, %ymm4
-        vpaddq %ymm7, %ymm10, %ymm7
-        vpslld $2, %ymm11, %ymm12
-        vpaddd %ymm12, %ymm11, %ymm12
-        vpsrlq $26, %ymm7, %ymm10
-        vpaddq %ymm5, %ymm12, %ymm5
-        vpsrlq $26, %ymm5, %ymm11
-        vpand %ymm14, %ymm7, %ymm2
-        vpand %ymm14, %ymm5, %ymm0
-        vpaddq %ymm8, %ymm10, %ymm8
-        vpaddq %ymm1, %ymm11, %ymm1
-        vpsrlq $26, %ymm8, %ymm10
-        vpand %ymm14, %ymm8, %ymm3
-        vpaddq %ymm4, %ymm10, %ymm4
-        vpsrldq $8, %ymm0, %ymm5
-        vpsrldq $8, %ymm1, %ymm6
-        vpsrldq $8, %ymm2, %ymm7
-        vpsrldq $8, %ymm3, %ymm8
-        vpsrldq $8, %ymm4, %ymm9
-        vpaddq %ymm0, %ymm5, %ymm0
-        vpaddq %ymm1, %ymm6, %ymm1
-        vpaddq %ymm2, %ymm7, %ymm2
-        vpaddq %ymm3, %ymm8, %ymm3
-        vpaddq %ymm4, %ymm9, %ymm4
-        vpermq $2, %ymm0, %ymm5
-        vpermq $2, %ymm1, %ymm6
-        vpermq $2, %ymm2, %ymm7
-        vpermq $2, %ymm3, %ymm8
-        vpermq $2, %ymm4, %ymm9
-        vpaddq %ymm0, %ymm5, %ymm0
-        vpaddq %ymm1, %ymm6, %ymm1
-        vpaddq %ymm2, %ymm7, %ymm2
-        vpaddq %ymm3, %ymm8, %ymm3
-        vpaddq %ymm4, %ymm9, %ymm4
-        vmovd %xmm0, %r8d
-        vmovd %xmm1, %r9d
-        vmovd %xmm2, %r10d
-        vmovd %xmm3, %r11d
-        vmovd %xmm4, %r12d
-        jmp L_poly1305_avx2_blocks_end_calc
-L_poly1305_avx2_blocks_start:
-        vmovdqu (%rsi), %ymm5
-        vmovdqu 32(%rsi), %ymm6
-        vperm2i128 $32, %ymm6, %ymm5, %ymm7
-        vperm2i128 $49, %ymm6, %ymm5, %ymm5
-        vpunpckldq %ymm5, %ymm7, %ymm6
-        vpunpckhdq %ymm5, %ymm7, %ymm8
-        vpunpckldq %ymm15, %ymm6, %ymm5
-        vpunpckhdq %ymm15, %ymm6, %ymm6
-        vpunpckldq %ymm15, %ymm8, %ymm7
-        vpunpckhdq %ymm15, %ymm8, %ymm8
-        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
-        vpsllq $6, %ymm6, %ymm6
-        vpsllq $12, %ymm7, %ymm7
-        vpsllq $18, %ymm8, %ymm8
-        vpmuludq (%rbx), %ymm4, %ymm10
-        vpaddq %ymm5, %ymm10, %ymm5
-        vpmuludq 32(%rbx), %ymm3, %ymm10
-        vpmuludq 32(%rbx), %ymm4, %ymm11
-        vpaddq %ymm6, %ymm11, %ymm6
-        vpmuludq 64(%rbx), %ymm3, %ymm11
-        vpmuludq 64(%rbx), %ymm4, %ymm12
-        vpaddq %ymm7, %ymm12, %ymm7
-        vpaddq %ymm5, %ymm10, %ymm5
-        vpmuludq 64(%rbx), %ymm2, %ymm12
-        vpmuludq 96(%rbx), %ymm4, %ymm13
-        vpaddq %ymm8, %ymm13, %ymm8
-        vpaddq %ymm6, %ymm11, %ymm6
-        vpmuludq 96(%rbx), %ymm1, %ymm13
-        vpmuludq 96(%rbx), %ymm2, %ymm10
-        vpaddq %ymm5, %ymm12, %ymm5
-        vpmuludq 96(%rbx), %ymm3, %ymm11
-        vpmuludq (%rcx), %ymm3, %ymm12
-        vpaddq %ymm5, %ymm13, %ymm5
-        vpmuludq (%rcx), %ymm4, %ymm13
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq (%rcx), %ymm0, %ymm13
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq (%rcx), %ymm1, %ymm10
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq (%rcx), %ymm2, %ymm11
-        vpmuludq 32(%rcx), %ymm2, %ymm12
-        vpaddq %ymm5, %ymm13, %ymm5
-        vpmuludq 32(%rcx), %ymm3, %ymm13
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq 32(%rcx), %ymm0, %ymm10
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq 32(%rcx), %ymm1, %ymm11
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq 64(%rcx), %ymm1, %ymm12
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpmuludq 64(%rcx), %ymm2, %ymm13
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpmuludq 64(%rcx), %ymm0, %ymm10
-        vpaddq %ymm7, %ymm11, %ymm7
-        vpmuludq 96(%rcx), %ymm0, %ymm11
-        vpaddq %ymm8, %ymm12, %ymm8
-        vpmuludq 96(%rcx), %ymm1, %ymm12
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpaddq %ymm7, %ymm10, %ymm7
-        vpmuludq 128(%rcx), %ymm0, %ymm13
-        vpaddq %ymm8, %ymm11, %ymm8
-        vpaddq %ymm9, %ymm12, %ymm9
-        vpaddq %ymm9, %ymm13, %ymm9
-        vpsrlq $26, %ymm5, %ymm10
-        vpsrlq $26, %ymm8, %ymm11
-        vpand %ymm14, %ymm5, %ymm5
-        vpand %ymm14, %ymm8, %ymm8
-        vpaddq %ymm6, %ymm10, %ymm6
-        vpaddq %ymm9, %ymm11, %ymm9
-        vpsrlq $26, %ymm6, %ymm10
-        vpsrlq $26, %ymm9, %ymm11
-        vpand %ymm14, %ymm6, %ymm1
-        vpand %ymm14, %ymm9, %ymm4
-        vpaddq %ymm7, %ymm10, %ymm7
-        vpslld $2, %ymm11, %ymm12
-        vpaddd %ymm12, %ymm11, %ymm12
-        vpsrlq $26, %ymm7, %ymm10
-        vpaddq %ymm5, %ymm12, %ymm5
-        vpsrlq $26, %ymm5, %ymm11
-        vpand %ymm14, %ymm7, %ymm2
-        vpand %ymm14, %ymm5, %ymm0
-        vpaddq %ymm8, %ymm10, %ymm8
-        vpaddq %ymm1, %ymm11, %ymm1
-        vpsrlq $26, %ymm8, %ymm10
-        vpand %ymm14, %ymm8, %ymm3
-        vpaddq %ymm4, %ymm10, %ymm4
-        addq $0x40, %rsi
-        subq $0x40, %rdx
-        jnz L_poly1305_avx2_blocks_start
-L_poly1305_avx2_blocks_store:
-        # Store four H values - state
-        vmovdqu %ymm0, (%rax)
-        vmovdqu %ymm1, 32(%rax)
-        vmovdqu %ymm2, 64(%rax)
-        vmovdqu %ymm3, 96(%rax)
-        vmovdqu %ymm4, 128(%rax)
-L_poly1305_avx2_blocks_end_calc:
-        cmpb $0x00, 616(%rdi)
-        je L_poly1305_avx2_blocks_complete
-        movq %r8, %rax
-        movq %r10, %rdx
-        movq %r12, %rcx
-        shrq $12, %rdx
-        shrq $24, %rcx
-        shlq $26, %r9
-        shlq $52, %r10
-        shlq $14, %r11
-        shlq $40, %r12
-        addq %r9, %rax
-        adcq %r10, %rax
-        adcq %r11, %rdx
-        adcq %r12, %rdx
-        adcq $0x00, %rcx
-        movq %rcx, %r8
-        andq $3, %rcx
-        shrq $2, %r8
-        leaq 0(%r8,%r8,4), %r8
-        addq %r8, %rax
-        adcq $0x00, %rdx
-        adcq $0x00, %rcx
-        movq %rax, 24(%rdi)
-        movq %rdx, 32(%rdi)
-        movq %rcx, 40(%rdi)
-L_poly1305_avx2_blocks_complete:
-        movb $0x01, 617(%rdi)
-        addq $0x140, %rsp
-        popq %rbx
-        popq %r12
-        repz retq
-#ifndef __APPLE__
-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl poly1305_final_avx2
-.type poly1305_final_avx2,@function
-.align 4
-poly1305_final_avx2:
-#else
-.section __TEXT,__text
-.globl _poly1305_final_avx2
-.p2align 2
-_poly1305_final_avx2:
-#endif /* __APPLE__ */
-        movb $0x01, 616(%rdi)
-        movb 617(%rdi), %cl
-        cmpb $0x00, %cl
-        je L_poly1305_avx2_final_done_blocks_X4
-        pushq %rsi
-        movq $0x40, %rdx
-        xorq %rsi, %rsi
-#ifndef __APPLE__
-        callq poly1305_blocks_avx2@plt
-#else
-        callq _poly1305_blocks_avx2
-#endif /* __APPLE__ */
-        popq %rsi
-L_poly1305_avx2_final_done_blocks_X4:
-        movq 608(%rdi), %rax
-        movq %rax, %rcx
-        andq $-16, %rcx
-        cmpb $0x00, %cl
-        je L_poly1305_avx2_final_done_blocks
-        pushq %rcx
-        pushq %rax
-        pushq %rsi
-        movq %rcx, %rdx
-        leaq 480(%rdi), %rsi
-#ifndef __APPLE__
-        callq poly1305_blocks_avx@plt
-#else
-        callq _poly1305_blocks_avx
-#endif /* __APPLE__ */
-        popq %rsi
-        popq %rax
-        popq %rcx
-L_poly1305_avx2_final_done_blocks:
-        subq %rcx, 608(%rdi)
-        xorq %rdx, %rdx
-        jmp L_poly1305_avx2_final_cmp_copy
-L_poly1305_avx2_final_start_copy:
-        movb 480(%rdi,%rcx,1), %r8b
-        movb %r8b, 480(%rdi,%rdx,1)
-        incb %cl
-        incb %dl
-L_poly1305_avx2_final_cmp_copy:
-        cmp %rcx, %rax
-        jne L_poly1305_avx2_final_start_copy
-#ifndef __APPLE__
-        callq poly1305_final_avx@plt
-#else
-        callq _poly1305_final_avx
-#endif /* __APPLE__ */
-        vpxor %ymm0, %ymm0, %ymm0
-        vmovdqu %ymm0, 64(%rdi)
-        vmovdqu %ymm0, 96(%rdi)
-        vmovdqu %ymm0, 128(%rdi)
-        vmovdqu %ymm0, 160(%rdi)
-        vmovdqu %ymm0, 192(%rdi)
-        vmovdqu %ymm0, 224(%rdi)
-        vmovdqu %ymm0, 256(%rdi)
-        vmovdqu %ymm0, 288(%rdi)
-        vmovdqu %ymm0, 320(%rdi)
-        movq $0x00, 608(%rdi)
-        movw $0x00, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size poly1305_final_avx2,.-poly1305_final_avx2
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
```
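
A note on the AVX2 path above: `poly1305_calc_powers_avx2` stores r, r^2, r^3, and r^4 as five 26-bit limbs each ("Convert to 26 bits in 32"), and the lazy reduction folds the carry out of the top limb back into the bottom limb multiplied by 5, since 2^130 ≡ 5 (mod 2^130 - 5); that is what the `vpslld $2` / `vpaddd` pair computes (4c + c). A small sketch of that limb arithmetic, with illustrative names that are not part of wolfSSL's API:

```python
MASK26 = (1 << 26) - 1  # the L_poly1305_avx2_blocks_mask constant

def to_limbs(x: int) -> list[int]:
    # "Convert to 26 bits in 32": five 26-bit limbs of a <= 130-bit value
    return [(x >> (26 * i)) & MASK26 for i in range(5)]

def carry_fold(limbs: list[int]) -> list[int]:
    # One carry pass over the limbs; the carry out of limb 4 wraps
    # around into limb 0 multiplied by 5, because 2^130 = 5 mod (2^130 - 5).
    for i in range(4):
        limbs[i + 1] += limbs[i] >> 26
        limbs[i] &= MASK26
    c = limbs[4] >> 26
    limbs[4] &= MASK26
    limbs[0] += 5 * c  # the vpslld $2 (4*c) plus vpaddd (+c) trick
    return limbs

# Round-trip check: the limb split loses nothing for 0 <= x < 2^130.
x = (1 << 130) - 7
assert sum(v << (26 * i) for i, v in enumerate(to_limbs(x))) == x
```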